diff options
| -rw-r--r-- | .gitignore | 5 | ||||
| -rw-r--r-- | README.md | 72 | ||||
| -rw-r--r-- | setup.py | 51 | ||||
| -rw-r--r-- | wikiget/__init__.py | 0 | ||||
| -rw-r--r-- | wikiget/version.py | 9 | ||||
| -rw-r--r-- | wikiget/wikiget.py | 148 |
6 files changed, 283 insertions, 2 deletions
@@ -102,3 +102,8 @@ venv.bak/ # mypy .mypy_cache/ + +# IDE files +.vs/ +.vscode/ +.idea/ @@ -1,2 +1,70 @@ -# python-wikiget -Download files from MediaWiki sites +# wikiget + +Something like a wget clone for downloading files from MediaWiki sites (like +Wikipedia or Wikimedia Commons) using only the file name or the URL of its +description page. Requires Python 2 or 3. Install with `pip install --user -U wikiget`. + +## Usage + +`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-o OUTPUT] FILE` + +If `FILE` is in the form `File:Example.jpg` or `Image:Example.jpg` (the prefix is +required), it will be fetched from the default site, which is "en.wikipedia.org". If it's the fully-qualified +URL of a file description page, like `https://commons.wikimedia.org/wiki/File:Example.jpg`, +the file is fetched from the specified site, in this case "commons.wikimedia.org". + +The site can also be specified with the `-s` or `--site` flag, though this will not have +any effect if the full URL is given. + +More detailed information, such as the site used and full URL of the file, can be +displayed with `-v` or `--verbose`. Use `-vv` to display even more detail. `-q` can +be used to silence warnings. + +By default, the program won't overwrite existing files with the same name as the +target, but this can be forced with `-f` or `--force`. Additionally, the file can +be downloaded to a different name with `-o`. 
+ +## Future plans + +- download from any MediaWiki-powered site, not just Wikimedia projects +- download Wikipedia articles, in plain text, wikitext, or other formats + +## Contributing + +It's recommended that you use a virtual environment manager (like virtualenv) to +install dependencies: + +```bash +pip install --user -U virtualenv +git clone https://github.com/clpo13/python-wikiget.git +cd python-wikiget +virtualenv venv +``` + +To activate the virtual environment, use one of the following commands: + +```bash +source venv/bin/activate # Linux and macOS (bash, zsh) +.\venv\Scripts\activate.bat # Windows command prompt +.\venv\Scripts\Activate.ps1 # Windows PowerShell +``` + +Then run `pip install -e .` to invoke an editable install, meaning any changes +made to the source will be reflected immediately. + +## License + +Copyright (C) 2018 Cody Logan + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>. diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..784bd16 --- /dev/null +++ b/setup.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +"""wikiget2 +Simple wget clone for downloading files from Wikimedia sites. 
+Copyright (C) 2018 Cody Logan; licensed GPLv3+ +SPDX-License-Identifier: GPL-3.0-or-later +""" + +from setuptools import setup, find_packages +from os import path +from io import open + +here = path.abspath(path.dirname(__file__)) +with open(path.join(here, "README.md"), "r") as fr: + long_description = fr.read() + +version = {} +with open("wikiget/version.py") as fv: + exec(fv.read(), version) + +setup( + name="wikiget", + version=version["__version__"], + author="Cody Logan", + author_email="clpo13@gmail.com", + description="Tool for downloading files from MediaWiki sites", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/clpo13/python-wikiget", + keywords="mediawiki wikimedia wikipedia", + packages=find_packages(), + classifiers=[ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3", + "Topic :: Utilities", + ], + install_requires=["future", "mwclient", "requests", "tqdm"], + project_urls={ + "Bug Reports": "https://github.com/clpo13/python-wikiget/issues", + }, + entry_points={ + "console_scripts": [ + 'wikiget=wikiget.wikiget:main', + ], + }, +) diff --git a/wikiget/__init__.py b/wikiget/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/wikiget/__init__.py diff --git a/wikiget/version.py b/wikiget/version.py new file mode 100644 index 0000000..4607256 --- /dev/null +++ b/wikiget/version.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +"""wikiget2 +Simple wget clone for downloading files from Wikimedia sites. 
# -*- coding: utf-8 -*-

"""wikiget2
Simple wget clone for downloading files from Wikimedia sites.
Copyright (C) 2018 Cody Logan; licensed GPLv3+
SPDX-License-Identifier: GPL-3.0-or-later
"""

from __future__ import absolute_import, division, print_function, unicode_literals
from builtins import open
from future import standard_library
standard_library.install_aliases()

import argparse
import logging
import os
import re
import sys
from urllib.parse import urlparse

from mwclient import InvalidResponse, Site, __ver__ as mwclient_version
from requests import ConnectionError
from tqdm import tqdm

from wikiget.version import __version__


def _build_parser(default_site):
    """Return the argparse parser that describes wikiget's command line.

    ``default_site`` becomes the default value of ``-s/--site`` and is shown
    in that option's help text.
    """
    parser = argparse.ArgumentParser(
        description="""
        A tool for downloading files from MediaWiki sites
        using the file name or description page URL
        """,
        epilog="""
        Copyright (C) 2018 Cody Logan. License GPLv3+: GNU GPL version 3
        or later <http://www.gnu.org/licenses/gpl.html>.
        This is free software; you are free to change and redistribute it.
        There is NO WARRANTY, to the extent permitted by law.
        """)
    parser.add_argument("FILE", help="""
                        name of the file to download with the File: or Image: prefix,
                        or the URL of its file description page
                        """)
    parser.add_argument("-V", "--version", action="version",
                        version="%(prog)s {}".format(__version__))
    # -q and -v contradict each other, so argparse rejects using both
    output_options = parser.add_mutually_exclusive_group()
    output_options.add_argument("-q", "--quiet", help="suppress warning messages",
                                action="store_true")
    output_options.add_argument("-v", "--verbose",
                                help="print detailed information, use -vv for even more detail",
                                action="count", default=0)
    parser.add_argument("-f", "--force", help="force overwriting existing files",
                        action="store_true")
    parser.add_argument("-s", "--site", default=default_site,
                        help="MediaWiki site to download from (default: %(default)s)")
    parser.add_argument("-o", "--output", help="write download to OUTPUT")
    return parser


def _download(site, file_url, file_size, dest):
    """Stream ``file_url`` through the site's connection into ``dest``.

    A tqdm progress bar sized to ``file_size`` bytes is shown while writing.
    The request is issued before ``dest`` is opened so that a request which
    fails outright does not leave an empty or truncated file behind, and the
    streamed response is always closed afterwards.
    """
    res = site.connection.get(file_url, stream=True)
    try:
        with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024) as progress_bar:
            with open(dest, "wb") as fd:
                progress_bar.set_postfix(file=dest, refresh=False)
                for chunk in res.iter_content(1024):
                    fd.write(chunk)
                    progress_bar.update(len(chunk))
    finally:
        # a streamed response holds its connection until it is closed
        res.close()


def main():
    """Command-line entry point: download one file from a Wikimedia site.

    Parses ``sys.argv``, works out the site and file name from either a bare
    ``File:``/``Image:`` name or a file description page URL, looks the file
    up through mwclient and writes it to the current directory (or to the
    path given with ``-o``).

    Exits with status 1 when the site is unsupported or unreachable, when the
    target is not a file, or when the download cannot be written. An already
    existing destination is skipped (without error) unless ``-f`` is given.
    """
    default_site = "en.wikipedia.org"
    site_regex = re.compile(r"wiki[mp]edia\.org$", re.I)
    file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\s]+\.\w+)$")
    user_agent = "wikiget/{} (https://github.com/clpo13/wikiget) " \
                 "mwclient/{}".format(__version__, mwclient_version)

    args = _build_parser(default_site).parse_args()

    # print API and debug messages in verbose mode
    if args.verbose >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbose >= 1:
        logging.basicConfig(level=logging.WARNING)

    url = urlparse(args.FILE)

    if url.netloc:
        filename = url.path
        site_name = url.netloc
        # compare by value: an explicitly passed "--site en.wikipedia.org" is a
        # different string object than default_site but is still the default
        if args.site != default_site and not args.quiet:
            print("Warning: target is a URL, ignoring site specified with --site")
    else:
        filename = args.FILE
        site_name = args.site

    file_match = file_regex.search(filename)
    site_match = site_regex.search(site_name)

    # check for valid site parameter
    if not site_match:
        print("Only Wikimedia sites (wikipedia.org and wikimedia.org) are currently supported.")
        sys.exit(1)

    # check if this is a valid file
    if not file_match:
        # no file extension or prefix, probably an article
        print("Downloading Wikipedia articles is not currently supported. "
              "If this is a file, please add the 'File:' prefix.")
        sys.exit(1)

    # get file name without File:/Image: prefix (second match group)
    filename = file_match.group(2)
    dest = args.output or filename

    if args.verbose >= 2:
        print("User agent: {}".format(user_agent))

    # connect to site and identify ourselves
    try:
        site = Site(site_name, clients_useragent=user_agent)
    except ConnectionError:
        # usually this means there is no such site, or there's no network connection
        print("Error: couldn't connect to specified site.")
        sys.exit(1)
    except InvalidResponse as e:
        # site exists, but we couldn't communicate with the API endpoint
        print(e)
        sys.exit(1)

    # get info about the target file ("image" rather than "file", which would
    # shadow the Python 2 builtin)
    image = site.images[filename]

    if not image.imageinfo:
        # no file information returned
        print("Target does not appear to be a valid file.")
        sys.exit(1)

    # file exists either locally or at Wikimedia Commons
    file_url = image.imageinfo["url"]
    file_size = image.imageinfo["size"]

    if args.verbose >= 1:
        print("Info: downloading '{}' ({} bytes) from {}".format(filename, file_size, site.host),
              end="")
        if args.output:
            print(" to '{}'".format(dest))
        else:
            print("\n", end="")
        print("Info: {}".format(file_url))

    if os.path.isfile(dest) and not args.force:
        print("File '{}' already exists, skipping download (use -f to ignore)".format(dest))
        return

    try:
        _download(site, file_url, file_size, dest)
    except IOError as e:
        print("File could not be written. The following error was encountered:")
        print(e)
        sys.exit(1)
