From d96ad1118691003506c9b666af7bd93514296916 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 6 Dec 2021 16:19:03 -0800 Subject: Consistent message logging Use Python's logging facility for messages instead of printing to stdout (except for download progress bars). --- wikiget/dl.py | 81 ++++++++++++++++++++++++------------------------------ wikiget/wikiget.py | 42 +++++++++++++++++++++------- 2 files changed, 68 insertions(+), 55 deletions(-) diff --git a/wikiget/dl.py b/wikiget/dl.py index 0ac8fec..856d8ca 100644 --- a/wikiget/dl.py +++ b/wikiget/dl.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . +import logging import os import sys from urllib.parse import unquote, urlparse @@ -33,10 +34,10 @@ def download(dl, args): if url.netloc: filename = url.path site_name = url.netloc - if args.site is not DEFAULT_SITE and not args.quiet: + if args.site is not DEFAULT_SITE: # this will work even if the user specifies 'commons.wikimedia.org' - print('Warning: target is a URL, ' - 'ignoring site specified with --site') + logging.warning("target is a URL, " + "ignoring site specified with --site") else: filename = dl site_name = args.site @@ -49,19 +50,17 @@ def download(dl, args): filename = file_match.group(2) else: # no file extension and/or prefix, probably an article - print(f"Could not parse input '{filename}' as a file. 
") + logging.error(f"Could not parse input '{filename}' as a file.") sys.exit(1) filename = unquote(filename) # remove URL encoding for special characters dest = args.output or filename - if args.verbose >= 2: - print(f'User agent: {USER_AGENT}') + logging.debug(f"User agent: {USER_AGENT}") # connect to site and identify ourselves - if args.verbose >= 1: - print(f'Site name: {site_name}') + logging.info(f"Site name: {site_name}") try: site = Site(site_name, path=args.path, clients_useragent=USER_AGENT) if args.username and args.password: @@ -69,24 +68,22 @@ def download(dl, args): except ConnectionError as e: # usually this means there is no such site, or there's no network # connection, though it could be a certificate problem - print("Error: couldn't connect to specified site.") - if args.verbose >= 2: - print('Full error message:') - print(e) + logging.error("Couldn't connect to specified site.") + logging.debug("Full error message:") + logging.debug(e) sys.exit(1) except HTTPError as e: # most likely a 403 forbidden or 404 not found error for api.php - print("Error: couldn't find the specified wiki's api.php. " - "Check the value of --path.") - if args.verbose >= 2: - print('Full error message:') - print(e) + logging.error("Couldn't find the specified wiki's api.php. " + "Check the value of --path.") + logging.debug("Full error message:") + logging.debug(e) sys.exit(1) except (InvalidResponse, LoginError) as e: # InvalidResponse: site exists, but we couldn't communicate with the # API endpoint for some reason other than an HTTP error. # LoginError: missing or invalid credentials - print(e) + logging.error(e) sys.exit(1) # get info about the target file @@ -95,12 +92,11 @@ def download(dl, args): except APIError as e: # an API error at this point likely means access is denied, # which could happen with a private wiki - print('Error: access denied. 
Try providing credentials with ' - '--username and --password.') - if args.verbose >= 2: - print('Full error message:') - for i in e.args: - print(i) + logging.error("Access denied. Try providing credentials with " + "--username and --password.") + logging.debug("Full error message:") + for i in e.args: + logging.debug(i) sys.exit(1) if file.imageinfo != {}: @@ -110,26 +106,22 @@ def download(dl, args): file_size = file.imageinfo['size'] file_sha1 = file.imageinfo['sha1'] - if args.verbose >= 1: - print(f"Info: downloading '{filename}' " - f"({file_size} bytes) from {site.host}", - end='') - if args.output: - print(f" to '{dest}'") - else: - print('\n', end='') - print(f'Info: {file_url}') + filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}" + if args.output: + filename_log += f" to '{dest}'" + logging.info(filename_log) + logging.info(f"{file_url}") if os.path.isfile(dest) and not args.force: - print(f"File '{dest}' already exists, skipping download " - "(use -f to ignore)") + logging.warning(f"File '{dest}' already exists, skipping download " + "(use -f to ignore)") else: try: fd = open(dest, 'wb') except IOError as e: - print('File could not be written. ' - 'The following error was encountered:') - print(e) + logging.error("File could not be written. " + "The following error was encountered:") + logging.error(e) sys.exit(1) else: # download the file(s) @@ -150,18 +142,17 @@ def download(dl, args): # verify file integrity and optionally print details dl_sha1 = verify_hash(dest) - if args.verbose >= 1: - print(f'Info: downloaded file SHA1 is {dl_sha1}') - print(f'Info: server file SHA1 is {file_sha1}') + logging.info(f"Downloaded file SHA1 is {dl_sha1}") + logging.info(f"Server file SHA1 is {file_sha1}") if dl_sha1 == file_sha1: - if args.verbose >= 1: - print('Info: hashes match!') + logging.info("Hashes match!") # at this point, we've successfully downloaded the file else: - print('Error: hash mismatch! 
Downloaded file may be corrupt.') + logging.error("Hash mismatch! Downloaded file may be corrupt.") sys.exit(1) else: # no file information returned - print(f"Target '{filename}' does not appear to be a valid file.") + logging.error(f"Target '{filename}' does not appear to be " + "a valid file.") sys.exit(1) diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py index 1e2e9ed..dfc6027 100644 --- a/wikiget/wikiget.py +++ b/wikiget/wikiget.py @@ -81,29 +81,51 @@ def main(): args = parser.parse_args() - # print API and debug messages in verbose mode + loglevel = logging.WARNING if args.verbose >= 2: - logging.basicConfig(level=logging.DEBUG) + # this includes API and library messages + loglevel = logging.DEBUG elif args.verbose >= 1: - logging.basicConfig(level=logging.WARNING) + loglevel = logging.INFO + elif args.quiet: + loglevel = logging.ERROR + + # set up logger + # TODO: optionally save to log file + logging.basicConfig( + level=loglevel, + # format="%(asctime)s [%(levelname)s] %(message)s" + format="[%(levelname)s] %(message)s" + ) if args.batch: # batch download mode input_file = args.FILE - if args.verbose >= 1: - print(f"Info: using batch file '{input_file}'") + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + try: fd = open(input_file, 'r') except IOError as e: - print('File could not be read. ' - 'The following error was encountered:') - print(e) + logging.error("File could not be read. 
" + "The following error was encountered:") + logging.error(e) sys.exit(1) else: with fd: + # store file contents in memory in case something + # happens to the file while we're downloading for _, line in enumerate(fd): - line = line.strip() - download(line, args) + dl_list.append(line) + + # TODO: validate file contents before download process starts + for line_num, url in enumerate(dl_list, start=1): + url = url.strip() + # keep track of batch file line numbers for + # debugging/logging purposes + logging.info(f"Downloading file {line_num} ({url}):") + download(url, args) else: # single download mode dl = args.FILE -- cgit v1.2.3 From 3b757513dc68a9f846f2d120c3919fb46a89e979 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 6 Dec 2021 16:36:15 -0800 Subject: Initial attempt at logging to file --- wikiget/wikiget.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py index dfc6027..4098e03 100644 --- a/wikiget/wikiget.py +++ b/wikiget/wikiget.py @@ -78,6 +78,8 @@ def main(): help='treat FILE as a textfile containing ' 'multiple files to download, one URL or ' 'filename per line', action='store_true') + parser.add_argument('-l', '--logfile', default='', + help='save log output to LOGFILE') args = parser.parse_args() @@ -91,12 +93,17 @@ def main(): loglevel = logging.ERROR # set up logger - # TODO: optionally save to log file - logging.basicConfig( - level=loglevel, - # format="%(asctime)s [%(levelname)s] %(message)s" - format="[%(levelname)s] %(message)s" - ) + if args.logfile: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + filename=args.logfile + ) + else: + logging.basicConfig( + level=loglevel, + format="[%(levelname)s] %(message)s" + ) if args.batch: # batch download mode -- cgit v1.2.3 From 10268e7a76dfe72063d682e6043891b967cbad39 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 7 Dec 2021 15:12:41 -0800 Subject: Different log 
levels for file and console --- wikiget/wikiget.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py index 4098e03..6a537ba 100644 --- a/wikiget/wikiget.py +++ b/wikiget/wikiget.py @@ -92,19 +92,37 @@ def main(): elif args.quiet: loglevel = logging.ERROR - # set up logger + # configure logging: + # console log level is set via -v, -vv, and -q options + # file log level is always info (TODO: add debug option) if args.logfile: + # log to console and file logging.basicConfig( level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", + format="%(asctime)s [%(levelname)-7s] %(message)s", filename=args.logfile ) + + console = logging.StreamHandler() + # TODO: even when loglevel is set to logging.DEBUG, + # debug messages aren't printing to console + console.setLevel(loglevel) + console.setFormatter( + logging.Formatter("[%(levelname)s] %(message)s") + ) + logging.getLogger("").addHandler(console) else: + # log only to console logging.basicConfig( level=loglevel, format="[%(levelname)s] %(message)s" ) + # log events are appended to the file if it already exists, + # so note the start of a new download session + logging.info(f"Starting download session using wikiget {wikiget_version}") + # logging.info(f"Log level is set to {loglevel}") + if args.batch: # batch download mode input_file = args.FILE @@ -131,7 +149,7 @@ def main(): url = url.strip() # keep track of batch file line numbers for # debugging/logging purposes - logging.info(f"Downloading file {line_num} ({url}):") + logging.info(f"Downloading '{url}' at line {line_num}:") download(url, args) else: # single download mode -- cgit v1.2.3 From 3e57a1902f7bf6884662fb2aca403e13787c2d26 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 7 Dec 2021 15:30:18 -0800 Subject: Update README with logging info --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 
575bb05..53061b6 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,11 @@ access) are also supported with the use of the `--username` and `--password` flags. More detailed information, such as the site used and full URL of the file, can -be displayed with `-v` or `--verbose`. Use `-vv` to display even more detail. -`-q` can be used to silence warnings. +be displayed with `-v` or `--verbose`. Use `-vv` to display even more detail, +mainly debugging information or API messages. `-q` can be used to silence warnings. +A logfile can be specified with `-l` or `--logfile`. If this option is present, the +logfile will contain the same information as `-v` along with timestamps. New log +entries will be appended to an existing logfile. By default, the program won't overwrite existing files with the same name as the target, but this can be forced with `-f` or `--force`. Additionally, the file @@ -55,6 +58,7 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg ## Future plans +- download multiple files at once in batch mode - continue batch download even if input is malformed or file doesn't exist (possibly by raising exceptions in `download()`) - batch download by (Commons) category or user uploads @@ -62,7 +66,7 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg ## Contributing -Pull requests or bug reports are more than welcome. +Pull requests, bug reports, or feature requests are more than welcome. 
It's recommended that you use a [virtual environment manager](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) -- cgit v1.2.3 From bb0bf8f0c79c31114a615cb201505de3fae15044 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 7 Dec 2021 15:30:56 -0800 Subject: Standardize on double quotes --- setup.py | 74 ++++++++++++++++++++++++------------------------ test/test_validations.py | 32 ++++++++++----------- wikiget/__init__.py | 10 +++---- wikiget/dl.py | 13 +++++---- wikiget/validations.py | 6 ++-- wikiget/version.py | 2 +- wikiget/wikiget.py | 64 ++++++++++++++++++++--------------------- 7 files changed, 101 insertions(+), 100 deletions(-) diff --git a/setup.py b/setup.py index ab809e2..a10c111 100644 --- a/setup.py +++ b/setup.py @@ -23,56 +23,56 @@ from os import path from setuptools import setup, find_packages here = path.abspath(path.dirname(__file__)) -with open(path.join(here, 'README.md'), 'r') as fr: +with open(path.join(here, "README.md"), "r") as fr: long_description = fr.read() version_file = {} -with open(path.join(here, 'wikiget', 'version.py'), 'r') as fv: +with open(path.join(here, "wikiget", "version.py"), "r") as fv: exec(fv.read(), version_file) setup( - name='wikiget', - version=version_file['__version__'], - author='Cody Logan', - author_email='clpo13@gmail.com', - description='CLI tool for downloading files from MediaWiki sites', + name="wikiget", + version=version_file["__version__"], + author="Cody Logan", + author_email="clpo13@gmail.com", + description="CLI tool for downloading files from MediaWiki sites", long_description=long_description, - long_description_content_type='text/markdown', - url='https://github.com/clpo13/wikiget', - keywords='commons download mediawiki wikimedia wikipedia', + long_description_content_type="text/markdown", + url="https://github.com/clpo13/wikiget", + keywords="commons download mediawiki wikimedia wikipedia", packages=find_packages(), classifiers=[ - 'Development Status :: 4 
- Beta', - 'Environment :: Console', - 'Intended Audience :: End Users/Desktop', - 'License :: OSI Approved :: GNU General Public License v3 or later ' - '(GPLv3+)', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Topic :: Internet', - 'Topic :: Internet :: WWW/HTTP', - 'Topic :: Multimedia', - 'Topic :: Multimedia :: Graphics', - 'Topic :: Multimedia :: Sound/Audio', - 'Topic :: Multimedia :: Video', - 'Topic :: Utilities', + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: GNU General Public License v3 or later " + "(GPLv3+)", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Topic :: Internet", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Multimedia", + "Topic :: Multimedia :: Graphics", + "Topic :: Multimedia :: Sound/Audio", + "Topic :: Multimedia :: Video", + "Topic :: Utilities", ], - python_requires='>=3.6', - install_requires=['mwclient>=0.10.0', 'requests', 'tqdm'], - setup_requires=['pytest-runner'], - tests_require=['pytest', 'pytest-cov'], + python_requires=">=3.6", + install_requires=["mwclient>=0.10.0", "requests", "tqdm"], + setup_requires=["pytest-runner"], + tests_require=["pytest", "pytest-cov"], project_urls={ - 'Bug Reports': 'https://github.com/clpo13/wikiget/issues', + "Bug Reports": "https://github.com/clpo13/wikiget/issues", }, entry_points={ - 'console_scripts': [ - 
'wikiget=wikiget.wikiget:main', + "console_scripts": [ + "wikiget=wikiget.wikiget:main", ], }, ) diff --git a/test/test_validations.py b/test/test_validations.py index 5b7d4fc..8dd4d6d 100644 --- a/test/test_validations.py +++ b/test/test_validations.py @@ -23,8 +23,8 @@ def test_invalid_site_input(): """ Invalid site strings should not return regex match objects. """ - invalid_input = ['example.com', 'vim.wikia.com', - 'en.wikipedia.com', 'en.wikimpedia.org'] + invalid_input = ["example.com", "vim.wikia.com", + "en.wikipedia.com", "en.wikimpedia.org"] for i in invalid_input: site_match = valid_site(i) assert site_match is None @@ -34,8 +34,8 @@ def test_valid_site_input(): """ Valid site strings should return regex match objects. """ - valid_input = ['en.wikipedia.org', 'commons.wikimedia.org', - 'de.wikipedia.org', 'meta.wikimedia.org'] + valid_input = ["en.wikipedia.org", "commons.wikimedia.org", + "de.wikipedia.org", "meta.wikimedia.org"] for i in valid_input: site_match = valid_site(i) assert site_match is not None @@ -46,20 +46,20 @@ def test_file_regex(): File regex should return a match object with match groups corresponding to the file prefix and name. """ - i = 'File:Example.jpg' + i = "File:Example.jpg" file_match = valid_file(i) assert file_match is not None - assert file_match.group(0) == 'File:Example.jpg' # entire match - assert file_match.group(1) == 'File:' # first group - assert file_match.group(2) == 'Example.jpg' # second group + assert file_match.group(0) == "File:Example.jpg" # entire match + assert file_match.group(1) == "File:" # first group + assert file_match.group(2) == "Example.jpg" # second group def test_invalid_file_input(): """ Invalid file strings should not return regex match objects. 
""" - invalid_input = ['file:example', 'example.jpg', 'Foo Bar.gif', - 'Fil:Example.jpg'] + invalid_input = ["file:example", "example.jpg", "Foo Bar.gif", + "Fil:Example.jpg"] for i in invalid_input: file_match = valid_file(i) assert file_match is None @@ -69,9 +69,9 @@ def test_valid_file_input(): """ Valid file strings should return regex match objects. """ - valid_input = ['Image:example.jpg', 'file:example.jpg', - 'File:example.file-01.jpg', 'FILE:FOO.BMP', - 'File:ß handwritten sample.gif', 'File:A (1).jpeg'] + valid_input = ["Image:example.jpg", "file:example.jpg", + "File:example.file-01.jpg", "FILE:FOO.BMP", + "File:ß handwritten sample.gif", "File:A (1).jpeg"] for i in valid_input: file_match = valid_file(i) assert file_match is not None @@ -81,9 +81,9 @@ def test_verify_hash(tmp_path): """ Confirm that verify_hash returns the proper SHA1 hash. """ - file_name = 'testfile' - file_contents = 'foobar' - file_sha1 = '8843d7f92416211de9ebb963ff4ce28125932878' + file_name = "testfile" + file_contents = "foobar" + file_sha1 = "8843d7f92416211de9ebb963ff4ce28125932878" tmp_file = tmp_path / file_name tmp_file.write_text(file_contents) diff --git a/wikiget/__init__.py b/wikiget/__init__.py index 8437ebf..4adcae3 100644 --- a/wikiget/__init__.py +++ b/wikiget/__init__.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018, 2019, 2020 Cody Logan and contributors +# Copyright (C) 2018-2021 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify @@ -22,7 +22,7 @@ from .version import __version__ as wikiget_version # set some global constants BLOCKSIZE = 65536 CHUNKSIZE = 1024 -DEFAULT_SITE = 'commons.wikimedia.org' -DEFAULT_PATH = '/w/' -USER_AGENT = ('wikiget/{} (https://github.com/clpo13/wikiget) ' - 'mwclient/{}'.format(wikiget_version, mwclient_version)) +DEFAULT_SITE = "commons.wikimedia.org" +DEFAULT_PATH = "/w/" +USER_AGENT = 
(f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) " + f"mwclient/{mwclient_version}") diff --git a/wikiget/dl.py b/wikiget/dl.py index 856d8ca..8f32218 100644 --- a/wikiget/dl.py +++ b/wikiget/dl.py @@ -102,11 +102,12 @@ def download(dl, args): if file.imageinfo != {}: # file exists either locally or at a common repository, # like Wikimedia Commons - file_url = file.imageinfo['url'] - file_size = file.imageinfo['size'] - file_sha1 = file.imageinfo['sha1'] + file_url = file.imageinfo["url"] + file_size = file.imageinfo["size"] + file_sha1 = file.imageinfo["sha1"] - filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}" + filename_log = (f"Downloading '{filename}' ({file_size} bytes) " + f"from {site.host}") if args.output: filename_log += f" to '{dest}'" logging.info(filename_log) @@ -117,7 +118,7 @@ def download(dl, args): "(use -f to ignore)") else: try: - fd = open(dest, 'wb') + fd = open(dest, "wb") except IOError as e: logging.error("File could not be written. 
" "The following error was encountered:") @@ -130,7 +131,7 @@ def download(dl, args): else: leave_bars = False with tqdm(leave=leave_bars, total=file_size, - unit='B', unit_scale=True, + unit="B", unit_scale=True, unit_divisor=CHUNKSIZE) as progress_bar: with fd: res = site.connection.get(file_url, stream=True) diff --git a/wikiget/validations.py b/wikiget/validations.py index 20ef74f..bd99570 100644 --- a/wikiget/validations.py +++ b/wikiget/validations.py @@ -31,7 +31,7 @@ def valid_file(search_string): """ # second group could also restrict to file extensions with three or more # letters with ([^/\r\n\t\f\v]+\.\w{3,}) - file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I) + file_regex = re.compile(r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$", re.I) return file_regex.search(search_string) @@ -44,7 +44,7 @@ def valid_site(search_string): :param search_string: string to validate :returns: a regex Match object if there's a match or None otherwise """ - site_regex = re.compile(r'wiki[mp]edia\.org$', re.I) + site_regex = re.compile(r"wiki[mp]edia\.org$", re.I) return site_regex.search(search_string) @@ -56,7 +56,7 @@ def verify_hash(filename): :return: hash digest """ hasher = hashlib.sha1() - with open(filename, 'rb') as dl: + with open(filename, "rb") as dl: buf = dl.read(BLOCKSIZE) while len(buf) > 0: hasher.update(buf) diff --git a/wikiget/version.py b/wikiget/version.py index 93b60a1..dd9b22c 100644 --- a/wikiget/version.py +++ b/wikiget/version.py @@ -1 +1 @@ -__version__ = '0.5.1' +__version__ = "0.5.1" diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py index 6a537ba..a8679c9 100644 --- a/wikiget/wikiget.py +++ b/wikiget/wikiget.py @@ -44,42 +44,42 @@ def main(): conditions. There is NO WARRANTY, to the extent permitted by law. 
""") - parser.add_argument('FILE', help=""" + parser.add_argument("FILE", help=""" name of the file to download with the File: prefix, or the URL of its file description page """) - parser.add_argument('-V', '--version', action='version', - version=f'%(prog)s {wikiget_version}') + parser.add_argument("-V", "--version", action="version", + version=f"%(prog)s {wikiget_version}") message_options = parser.add_mutually_exclusive_group() - message_options.add_argument('-q', '--quiet', - help='suppress warning messages', - action='store_true') - message_options.add_argument('-v', '--verbose', - help='print detailed information; ' - 'use -vv for even more detail', - action='count', default=0) - parser.add_argument('-f', '--force', - help='force overwriting existing files', - action='store_true') - parser.add_argument('-s', '--site', default=DEFAULT_SITE, - help='MediaWiki site to download from ' - '(default: %(default)s)') - parser.add_argument('-p', '--path', default=DEFAULT_PATH, - help='MediaWiki site path, where api.php is located ' - '(default: %(default)s)') - parser.add_argument('--username', default='', - help='MediaWiki site username, for private wikis') - parser.add_argument('--password', default='', - help='MediaWiki site password, for private wikis') + message_options.add_argument("-q", "--quiet", + help="suppress warning messages", + action="store_true") + message_options.add_argument("-v", "--verbose", + help="print detailed information; " + "use -vv for even more detail", + action="count", default=0) + parser.add_argument("-f", "--force", + help="force overwriting existing files", + action="store_true") + parser.add_argument("-s", "--site", default=DEFAULT_SITE, + help="MediaWiki site to download from " + "(default: %(default)s)") + parser.add_argument("-p", "--path", default=DEFAULT_PATH, + help="MediaWiki site path, where api.php is located " + "(default: %(default)s)") + parser.add_argument("--username", default="", + help="MediaWiki site username, for 
private wikis") + parser.add_argument("--password", default="", + help="MediaWiki site password, for private wikis") output_options = parser.add_mutually_exclusive_group() - output_options.add_argument('-o', '--output', - help='write download to OUTPUT') - output_options.add_argument('-a', '--batch', - help='treat FILE as a textfile containing ' - 'multiple files to download, one URL or ' - 'filename per line', action='store_true') - parser.add_argument('-l', '--logfile', default='', - help='save log output to LOGFILE') + output_options.add_argument("-o", "--output", + help="write download to OUTPUT") + output_options.add_argument("-a", "--batch", + help="treat FILE as a textfile containing " + "multiple files to download, one URL or " + "filename per line", action="store_true") + parser.add_argument("-l", "--logfile", default="", + help="save log output to LOGFILE") args = parser.parse_args() @@ -131,7 +131,7 @@ def main(): logging.info(f"Using batch file '{input_file}'.") try: - fd = open(input_file, 'r') + fd = open(input_file, "r") except IOError as e: logging.error("File could not be read. " "The following error was encountered:") -- cgit v1.2.3 From 80633b5e864cf5705a4636dd05601531a5c33a45 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 13:31:48 -0700 Subject: Update link in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a0ccf8d..029aaf7 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ source venv/bin/activate ``` Then run `pip install -e .` to invoke an -["editable" install](https://pip.pypa.io/en/stable/reference/pip_install/#editable-installs), +["editable" install](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs) meaning any changes made to the source will be reflected immediately in the executable script. Unit tests can be run with `pytest` (make sure to run `pip install pytest` in the virtual environment first.) 
-- cgit v1.2.3 From a1995912ed24b37a990f3fcd5e91dbf7b46669fb Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 15:17:04 -0700 Subject: Reorganize file tree --- src/wikiget/__init__.py | 28 ++++++++ src/wikiget/dl.py | 159 +++++++++++++++++++++++++++++++++++++++++++++ src/wikiget/validations.py | 64 ++++++++++++++++++ src/wikiget/version.py | 1 + src/wikiget/wikiget.py | 157 ++++++++++++++++++++++++++++++++++++++++++++ test/test_validations.py | 91 -------------------------- tests/test_validations.py | 91 ++++++++++++++++++++++++++ wikiget/__init__.py | 28 -------- wikiget/dl.py | 159 --------------------------------------------- wikiget/validations.py | 64 ------------------ wikiget/version.py | 1 - wikiget/wikiget.py | 157 -------------------------------------------- 12 files changed, 500 insertions(+), 500 deletions(-) create mode 100644 src/wikiget/__init__.py create mode 100644 src/wikiget/dl.py create mode 100644 src/wikiget/validations.py create mode 100644 src/wikiget/version.py create mode 100644 src/wikiget/wikiget.py delete mode 100644 test/test_validations.py create mode 100644 tests/test_validations.py delete mode 100644 wikiget/__init__.py delete mode 100644 wikiget/dl.py delete mode 100644 wikiget/validations.py delete mode 100644 wikiget/version.py delete mode 100644 wikiget/wikiget.py diff --git a/src/wikiget/__init__.py b/src/wikiget/__init__.py new file mode 100644 index 0000000..4adcae3 --- /dev/null +++ b/src/wikiget/__init__.py @@ -0,0 +1,28 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +from mwclient import __version__ as mwclient_version + +from .version import __version__ as wikiget_version + +# set some global constants +BLOCKSIZE = 65536 +CHUNKSIZE = 1024 +DEFAULT_SITE = "commons.wikimedia.org" +DEFAULT_PATH = "/w/" +USER_AGENT = (f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) " + f"mwclient/{mwclient_version}") diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py new file mode 100644 index 0000000..8f32218 --- /dev/null +++ b/src/wikiget/dl.py @@ -0,0 +1,159 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import logging +import os +import sys +from urllib.parse import unquote, urlparse + +from mwclient import APIError, InvalidResponse, LoginError, Site +from requests import ConnectionError, HTTPError +from tqdm import tqdm + +from . 
import CHUNKSIZE, DEFAULT_SITE, USER_AGENT +from .validations import valid_file, verify_hash + + +def download(dl, args): + url = urlparse(dl) + + if url.netloc: + filename = url.path + site_name = url.netloc + if args.site is not DEFAULT_SITE: + # this will work even if the user specifies 'commons.wikimedia.org' + logging.warning("target is a URL, " + "ignoring site specified with --site") + else: + filename = dl + site_name = args.site + + file_match = valid_file(filename) + + # check if this is a valid file + if file_match and file_match.group(1): + # has File:/Image: prefix and extension + filename = file_match.group(2) + else: + # no file extension and/or prefix, probably an article + logging.error(f"Could not parse input '{filename}' as a file.") + sys.exit(1) + + filename = unquote(filename) # remove URL encoding for special characters + + dest = args.output or filename + + logging.debug(f"User agent: {USER_AGENT}") + + # connect to site and identify ourselves + logging.info(f"Site name: {site_name}") + try: + site = Site(site_name, path=args.path, clients_useragent=USER_AGENT) + if args.username and args.password: + site.login(args.username, args.password) + except ConnectionError as e: + # usually this means there is no such site, or there's no network + # connection, though it could be a certificate problem + logging.error("Couldn't connect to specified site.") + logging.debug("Full error message:") + logging.debug(e) + sys.exit(1) + except HTTPError as e: + # most likely a 403 forbidden or 404 not found error for api.php + logging.error("Couldn't find the specified wiki's api.php. " + "Check the value of --path.") + logging.debug("Full error message:") + logging.debug(e) + sys.exit(1) + except (InvalidResponse, LoginError) as e: + # InvalidResponse: site exists, but we couldn't communicate with the + # API endpoint for some reason other than an HTTP error. 
+ # LoginError: missing or invalid credentials + logging.error(e) + sys.exit(1) + + # get info about the target file + try: + file = site.images[filename] + except APIError as e: + # an API error at this point likely means access is denied, + # which could happen with a private wiki + logging.error("Access denied. Try providing credentials with " + "--username and --password.") + logging.debug("Full error message:") + for i in e.args: + logging.debug(i) + sys.exit(1) + + if file.imageinfo != {}: + # file exists either locally or at a common repository, + # like Wikimedia Commons + file_url = file.imageinfo["url"] + file_size = file.imageinfo["size"] + file_sha1 = file.imageinfo["sha1"] + + filename_log = (f"Downloading '{filename}' ({file_size} bytes) " + f"from {site.host}") + if args.output: + filename_log += f" to '{dest}'" + logging.info(filename_log) + logging.info(f"{file_url}") + + if os.path.isfile(dest) and not args.force: + logging.warning(f"File '{dest}' already exists, skipping download " + "(use -f to ignore)") + else: + try: + fd = open(dest, "wb") + except IOError as e: + logging.error("File could not be written. 
" + "The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + # download the file(s) + if args.verbose >= 1: + leave_bars = True + else: + leave_bars = False + with tqdm(leave=leave_bars, total=file_size, + unit="B", unit_scale=True, + unit_divisor=CHUNKSIZE) as progress_bar: + with fd: + res = site.connection.get(file_url, stream=True) + progress_bar.set_postfix(file=dest, refresh=False) + for chunk in res.iter_content(CHUNKSIZE): + fd.write(chunk) + progress_bar.update(len(chunk)) + + # verify file integrity and optionally print details + dl_sha1 = verify_hash(dest) + + logging.info(f"Downloaded file SHA1 is {dl_sha1}") + logging.info(f"Server file SHA1 is {file_sha1}") + if dl_sha1 == file_sha1: + logging.info("Hashes match!") + # at this point, we've successfully downloaded the file + else: + logging.error("Hash mismatch! Downloaded file may be corrupt.") + sys.exit(1) + + else: + # no file information returned + logging.error(f"Target '{filename}' does not appear to be " + "a valid file.") + sys.exit(1) diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py new file mode 100644 index 0000000..bd99570 --- /dev/null +++ b/src/wikiget/validations.py @@ -0,0 +1,64 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018, 2019, 2020 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . 
+ +import hashlib +import re + +from . import BLOCKSIZE + + +def valid_file(search_string): + """ + Determines if the given string contains a valid file name, defined as a + string ending with a '.' and at least one character, beginning with 'File:' + or 'Image:', the standard file prefixes in MediaWiki. + :param search_string: string to validate + :returns: a regex Match object if there's a match or None otherwise + """ + # second group could also restrict to file extensions with three or more + # letters with ([^/\r\n\t\f\v]+\.\w{3,}) + file_regex = re.compile(r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$", re.I) + return file_regex.search(search_string) + + +def valid_site(search_string): + """ + Determines if the given string contains a valid site name, defined as a + string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all + subdomains of those domains. Eventually, it should be possible to support + any MediaWiki site, regardless of domain name. + :param search_string: string to validate + :returns: a regex Match object if there's a match or None otherwise + """ + site_regex = re.compile(r"wiki[mp]edia\.org$", re.I) + return site_regex.search(search_string) + + +def verify_hash(filename): + """ + Calculates the SHA1 hash of the given file for comparison with a known + value. 
+ :param filename: name of the file to calculate a hash for + :return: hash digest + """ + hasher = hashlib.sha1() + with open(filename, "rb") as dl: + buf = dl.read(BLOCKSIZE) + while len(buf) > 0: + hasher.update(buf) + buf = dl.read(BLOCKSIZE) + return hasher.hexdigest() diff --git a/src/wikiget/version.py b/src/wikiget/version.py new file mode 100644 index 0000000..dd9b22c --- /dev/null +++ b/src/wikiget/version.py @@ -0,0 +1 @@ +__version__ = "0.5.1" diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py new file mode 100644 index 0000000..a8679c9 --- /dev/null +++ b/src/wikiget/wikiget.py @@ -0,0 +1,157 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import argparse +import logging +import sys + +from . import DEFAULT_SITE, DEFAULT_PATH, wikiget_version +from .dl import download + + +def main(): + """ + Main entry point for console script. Automatically compiled by setuptools + when installed with `pip install` or `python setup.py install`. + """ + + parser = argparse.ArgumentParser(description=""" + A tool for downloading files from + MediaWiki sites using the file name or + description page URL + """, + epilog=""" + Copyright (C) 2018-2021 Cody Logan + and contributors. + License GPLv3+: GNU GPL version 3 or later + . 
+ This is free software; you are free to + change and redistribute it under certain + conditions. There is NO WARRANTY, to the + extent permitted by law. + """) + parser.add_argument("FILE", help=""" + name of the file to download with the File: + prefix, or the URL of its file description page + """) + parser.add_argument("-V", "--version", action="version", + version=f"%(prog)s {wikiget_version}") + message_options = parser.add_mutually_exclusive_group() + message_options.add_argument("-q", "--quiet", + help="suppress warning messages", + action="store_true") + message_options.add_argument("-v", "--verbose", + help="print detailed information; " + "use -vv for even more detail", + action="count", default=0) + parser.add_argument("-f", "--force", + help="force overwriting existing files", + action="store_true") + parser.add_argument("-s", "--site", default=DEFAULT_SITE, + help="MediaWiki site to download from " + "(default: %(default)s)") + parser.add_argument("-p", "--path", default=DEFAULT_PATH, + help="MediaWiki site path, where api.php is located " + "(default: %(default)s)") + parser.add_argument("--username", default="", + help="MediaWiki site username, for private wikis") + parser.add_argument("--password", default="", + help="MediaWiki site password, for private wikis") + output_options = parser.add_mutually_exclusive_group() + output_options.add_argument("-o", "--output", + help="write download to OUTPUT") + output_options.add_argument("-a", "--batch", + help="treat FILE as a textfile containing " + "multiple files to download, one URL or " + "filename per line", action="store_true") + parser.add_argument("-l", "--logfile", default="", + help="save log output to LOGFILE") + + args = parser.parse_args() + + loglevel = logging.WARNING + if args.verbose >= 2: + # this includes API and library messages + loglevel = logging.DEBUG + elif args.verbose >= 1: + loglevel = logging.INFO + elif args.quiet: + loglevel = logging.ERROR + + # configure logging: + # 
console log level is set via -v, -vv, and -q options + # file log level is always info (TODO: add debug option) + if args.logfile: + # log to console and file + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)-7s] %(message)s", + filename=args.logfile + ) + + console = logging.StreamHandler() + # TODO: even when loglevel is set to logging.DEBUG, + # debug messages aren't printing to console + console.setLevel(loglevel) + console.setFormatter( + logging.Formatter("[%(levelname)s] %(message)s") + ) + logging.getLogger("").addHandler(console) + else: + # log only to console + logging.basicConfig( + level=loglevel, + format="[%(levelname)s] %(message)s" + ) + + # log events are appended to the file if it already exists, + # so note the start of a new download session + logging.info(f"Starting download session using wikiget {wikiget_version}") + # logging.info(f"Log level is set to {loglevel}") + + if args.batch: + # batch download mode + input_file = args.FILE + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file, "r") + except IOError as e: + logging.error("File could not be read. 
" + "The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # store file contents in memory in case something + # happens to the file while we're downloading + for _, line in enumerate(fd): + dl_list.append(line) + + # TODO: validate file contents before download process starts + for line_num, url in enumerate(dl_list, start=1): + url = url.strip() + # keep track of batch file line numbers for + # debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + download(url, args) + else: + # single download mode + dl = args.FILE + download(dl, args) diff --git a/test/test_validations.py b/test/test_validations.py deleted file mode 100644 index 8dd4d6d..0000000 --- a/test/test_validations.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- -# wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan -# SPDX-License-Identifier: GPL-3.0-or-later -# -# Wikiget is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Wikiget is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Wikiget. If not, see . - -from wikiget.validations import valid_file, valid_site, verify_hash - - -def test_invalid_site_input(): - """ - Invalid site strings should not return regex match objects. 
- """ - invalid_input = ["example.com", "vim.wikia.com", - "en.wikipedia.com", "en.wikimpedia.org"] - for i in invalid_input: - site_match = valid_site(i) - assert site_match is None - - -def test_valid_site_input(): - """ - Valid site strings should return regex match objects. - """ - valid_input = ["en.wikipedia.org", "commons.wikimedia.org", - "de.wikipedia.org", "meta.wikimedia.org"] - for i in valid_input: - site_match = valid_site(i) - assert site_match is not None - - -def test_file_regex(): - """ - File regex should return a match object with match groups corresponding - to the file prefix and name. - """ - i = "File:Example.jpg" - file_match = valid_file(i) - assert file_match is not None - assert file_match.group(0) == "File:Example.jpg" # entire match - assert file_match.group(1) == "File:" # first group - assert file_match.group(2) == "Example.jpg" # second group - - -def test_invalid_file_input(): - """ - Invalid file strings should not return regex match objects. - """ - invalid_input = ["file:example", "example.jpg", "Foo Bar.gif", - "Fil:Example.jpg"] - for i in invalid_input: - file_match = valid_file(i) - assert file_match is None - - -def test_valid_file_input(): - """ - Valid file strings should return regex match objects. - """ - valid_input = ["Image:example.jpg", "file:example.jpg", - "File:example.file-01.jpg", "FILE:FOO.BMP", - "File:ß handwritten sample.gif", "File:A (1).jpeg"] - for i in valid_input: - file_match = valid_file(i) - assert file_match is not None - - -def test_verify_hash(tmp_path): - """ - Confirm that verify_hash returns the proper SHA1 hash. 
- """ - file_name = "testfile" - file_contents = "foobar" - file_sha1 = "8843d7f92416211de9ebb963ff4ce28125932878" - - tmp_file = tmp_path / file_name - tmp_file.write_text(file_contents) - - assert verify_hash(tmp_file) == file_sha1 diff --git a/tests/test_validations.py b/tests/test_validations.py new file mode 100644 index 0000000..8dd4d6d --- /dev/null +++ b/tests/test_validations.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +from wikiget.validations import valid_file, valid_site, verify_hash + + +def test_invalid_site_input(): + """ + Invalid site strings should not return regex match objects. + """ + invalid_input = ["example.com", "vim.wikia.com", + "en.wikipedia.com", "en.wikimpedia.org"] + for i in invalid_input: + site_match = valid_site(i) + assert site_match is None + + +def test_valid_site_input(): + """ + Valid site strings should return regex match objects. + """ + valid_input = ["en.wikipedia.org", "commons.wikimedia.org", + "de.wikipedia.org", "meta.wikimedia.org"] + for i in valid_input: + site_match = valid_site(i) + assert site_match is not None + + +def test_file_regex(): + """ + File regex should return a match object with match groups corresponding + to the file prefix and name. 
+ """ + i = "File:Example.jpg" + file_match = valid_file(i) + assert file_match is not None + assert file_match.group(0) == "File:Example.jpg" # entire match + assert file_match.group(1) == "File:" # first group + assert file_match.group(2) == "Example.jpg" # second group + + +def test_invalid_file_input(): + """ + Invalid file strings should not return regex match objects. + """ + invalid_input = ["file:example", "example.jpg", "Foo Bar.gif", + "Fil:Example.jpg"] + for i in invalid_input: + file_match = valid_file(i) + assert file_match is None + + +def test_valid_file_input(): + """ + Valid file strings should return regex match objects. + """ + valid_input = ["Image:example.jpg", "file:example.jpg", + "File:example.file-01.jpg", "FILE:FOO.BMP", + "File:ß handwritten sample.gif", "File:A (1).jpeg"] + for i in valid_input: + file_match = valid_file(i) + assert file_match is not None + + +def test_verify_hash(tmp_path): + """ + Confirm that verify_hash returns the proper SHA1 hash. + """ + file_name = "testfile" + file_contents = "foobar" + file_sha1 = "8843d7f92416211de9ebb963ff4ce28125932878" + + tmp_file = tmp_path / file_name + tmp_file.write_text(file_contents) + + assert verify_hash(tmp_file) == file_sha1 diff --git a/wikiget/__init__.py b/wikiget/__init__.py deleted file mode 100644 index 4adcae3..0000000 --- a/wikiget/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors -# SPDX-License-Identifier: GPL-3.0-or-later -# -# Wikiget is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Wikiget is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Wikiget. If not, see . - -from mwclient import __version__ as mwclient_version - -from .version import __version__ as wikiget_version - -# set some global constants -BLOCKSIZE = 65536 -CHUNKSIZE = 1024 -DEFAULT_SITE = "commons.wikimedia.org" -DEFAULT_PATH = "/w/" -USER_AGENT = (f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) " - f"mwclient/{mwclient_version}") diff --git a/wikiget/dl.py b/wikiget/dl.py deleted file mode 100644 index 8f32218..0000000 --- a/wikiget/dl.py +++ /dev/null @@ -1,159 +0,0 @@ -# wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors -# SPDX-License-Identifier: GPL-3.0-or-later -# -# Wikiget is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Wikiget is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Wikiget. If not, see . - -import logging -import os -import sys -from urllib.parse import unquote, urlparse - -from mwclient import APIError, InvalidResponse, LoginError, Site -from requests import ConnectionError, HTTPError -from tqdm import tqdm - -from . 
import CHUNKSIZE, DEFAULT_SITE, USER_AGENT -from .validations import valid_file, verify_hash - - -def download(dl, args): - url = urlparse(dl) - - if url.netloc: - filename = url.path - site_name = url.netloc - if args.site is not DEFAULT_SITE: - # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, " - "ignoring site specified with --site") - else: - filename = dl - site_name = args.site - - file_match = valid_file(filename) - - # check if this is a valid file - if file_match and file_match.group(1): - # has File:/Image: prefix and extension - filename = file_match.group(2) - else: - # no file extension and/or prefix, probably an article - logging.error(f"Could not parse input '{filename}' as a file.") - sys.exit(1) - - filename = unquote(filename) # remove URL encoding for special characters - - dest = args.output or filename - - logging.debug(f"User agent: {USER_AGENT}") - - # connect to site and identify ourselves - logging.info(f"Site name: {site_name}") - try: - site = Site(site_name, path=args.path, clients_useragent=USER_AGENT) - if args.username and args.password: - site.login(args.username, args.password) - except ConnectionError as e: - # usually this means there is no such site, or there's no network - # connection, though it could be a certificate problem - logging.error("Couldn't connect to specified site.") - logging.debug("Full error message:") - logging.debug(e) - sys.exit(1) - except HTTPError as e: - # most likely a 403 forbidden or 404 not found error for api.php - logging.error("Couldn't find the specified wiki's api.php. " - "Check the value of --path.") - logging.debug("Full error message:") - logging.debug(e) - sys.exit(1) - except (InvalidResponse, LoginError) as e: - # InvalidResponse: site exists, but we couldn't communicate with the - # API endpoint for some reason other than an HTTP error. 
- # LoginError: missing or invalid credentials - logging.error(e) - sys.exit(1) - - # get info about the target file - try: - file = site.images[filename] - except APIError as e: - # an API error at this point likely means access is denied, - # which could happen with a private wiki - logging.error("Access denied. Try providing credentials with " - "--username and --password.") - logging.debug("Full error message:") - for i in e.args: - logging.debug(i) - sys.exit(1) - - if file.imageinfo != {}: - # file exists either locally or at a common repository, - # like Wikimedia Commons - file_url = file.imageinfo["url"] - file_size = file.imageinfo["size"] - file_sha1 = file.imageinfo["sha1"] - - filename_log = (f"Downloading '{filename}' ({file_size} bytes) " - f"from {site.host}") - if args.output: - filename_log += f" to '{dest}'" - logging.info(filename_log) - logging.info(f"{file_url}") - - if os.path.isfile(dest) and not args.force: - logging.warning(f"File '{dest}' already exists, skipping download " - "(use -f to ignore)") - else: - try: - fd = open(dest, "wb") - except IOError as e: - logging.error("File could not be written. 
" - "The following error was encountered:") - logging.error(e) - sys.exit(1) - else: - # download the file(s) - if args.verbose >= 1: - leave_bars = True - else: - leave_bars = False - with tqdm(leave=leave_bars, total=file_size, - unit="B", unit_scale=True, - unit_divisor=CHUNKSIZE) as progress_bar: - with fd: - res = site.connection.get(file_url, stream=True) - progress_bar.set_postfix(file=dest, refresh=False) - for chunk in res.iter_content(CHUNKSIZE): - fd.write(chunk) - progress_bar.update(len(chunk)) - - # verify file integrity and optionally print details - dl_sha1 = verify_hash(dest) - - logging.info(f"Downloaded file SHA1 is {dl_sha1}") - logging.info(f"Server file SHA1 is {file_sha1}") - if dl_sha1 == file_sha1: - logging.info("Hashes match!") - # at this point, we've successfully downloaded the file - else: - logging.error("Hash mismatch! Downloaded file may be corrupt.") - sys.exit(1) - - else: - # no file information returned - logging.error(f"Target '{filename}' does not appear to be " - "a valid file.") - sys.exit(1) diff --git a/wikiget/validations.py b/wikiget/validations.py deleted file mode 100644 index bd99570..0000000 --- a/wikiget/validations.py +++ /dev/null @@ -1,64 +0,0 @@ -# wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018, 2019, 2020 Cody Logan -# SPDX-License-Identifier: GPL-3.0-or-later -# -# Wikiget is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Wikiget is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Wikiget. If not, see . 
- -import hashlib -import re - -from . import BLOCKSIZE - - -def valid_file(search_string): - """ - Determines if the given string contains a valid file name, defined as a - string ending with a '.' and at least one character, beginning with 'File:' - or 'Image:', the standard file prefixes in MediaWiki. - :param search_string: string to validate - :returns: a regex Match object if there's a match or None otherwise - """ - # second group could also restrict to file extensions with three or more - # letters with ([^/\r\n\t\f\v]+\.\w{3,}) - file_regex = re.compile(r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$", re.I) - return file_regex.search(search_string) - - -def valid_site(search_string): - """ - Determines if the given string contains a valid site name, defined as a - string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all - subdomains of those domains. Eventually, it should be possible to support - any MediaWiki site, regardless of domain name. - :param search_string: string to validate - :returns: a regex Match object if there's a match or None otherwise - """ - site_regex = re.compile(r"wiki[mp]edia\.org$", re.I) - return site_regex.search(search_string) - - -def verify_hash(filename): - """ - Calculates the SHA1 hash of the given file for comparison with a known - value. 
- :param filename: name of the file to calculate a hash for - :return: hash digest - """ - hasher = hashlib.sha1() - with open(filename, "rb") as dl: - buf = dl.read(BLOCKSIZE) - while len(buf) > 0: - hasher.update(buf) - buf = dl.read(BLOCKSIZE) - return hasher.hexdigest() diff --git a/wikiget/version.py b/wikiget/version.py deleted file mode 100644 index dd9b22c..0000000 --- a/wikiget/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.5.1" diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py deleted file mode 100644 index a8679c9..0000000 --- a/wikiget/wikiget.py +++ /dev/null @@ -1,157 +0,0 @@ -# wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors -# SPDX-License-Identifier: GPL-3.0-or-later -# -# Wikiget is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Wikiget is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Wikiget. If not, see . - -import argparse -import logging -import sys - -from . import DEFAULT_SITE, DEFAULT_PATH, wikiget_version -from .dl import download - - -def main(): - """ - Main entry point for console script. Automatically compiled by setuptools - when installed with `pip install` or `python setup.py install`. - """ - - parser = argparse.ArgumentParser(description=""" - A tool for downloading files from - MediaWiki sites using the file name or - description page URL - """, - epilog=""" - Copyright (C) 2018-2021 Cody Logan - and contributors. - License GPLv3+: GNU GPL version 3 or later - . 
- This is free software; you are free to - change and redistribute it under certain - conditions. There is NO WARRANTY, to the - extent permitted by law. - """) - parser.add_argument("FILE", help=""" - name of the file to download with the File: - prefix, or the URL of its file description page - """) - parser.add_argument("-V", "--version", action="version", - version=f"%(prog)s {wikiget_version}") - message_options = parser.add_mutually_exclusive_group() - message_options.add_argument("-q", "--quiet", - help="suppress warning messages", - action="store_true") - message_options.add_argument("-v", "--verbose", - help="print detailed information; " - "use -vv for even more detail", - action="count", default=0) - parser.add_argument("-f", "--force", - help="force overwriting existing files", - action="store_true") - parser.add_argument("-s", "--site", default=DEFAULT_SITE, - help="MediaWiki site to download from " - "(default: %(default)s)") - parser.add_argument("-p", "--path", default=DEFAULT_PATH, - help="MediaWiki site path, where api.php is located " - "(default: %(default)s)") - parser.add_argument("--username", default="", - help="MediaWiki site username, for private wikis") - parser.add_argument("--password", default="", - help="MediaWiki site password, for private wikis") - output_options = parser.add_mutually_exclusive_group() - output_options.add_argument("-o", "--output", - help="write download to OUTPUT") - output_options.add_argument("-a", "--batch", - help="treat FILE as a textfile containing " - "multiple files to download, one URL or " - "filename per line", action="store_true") - parser.add_argument("-l", "--logfile", default="", - help="save log output to LOGFILE") - - args = parser.parse_args() - - loglevel = logging.WARNING - if args.verbose >= 2: - # this includes API and library messages - loglevel = logging.DEBUG - elif args.verbose >= 1: - loglevel = logging.INFO - elif args.quiet: - loglevel = logging.ERROR - - # configure logging: - # 
console log level is set via -v, -vv, and -q options - # file log level is always info (TODO: add debug option) - if args.logfile: - # log to console and file - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)-7s] %(message)s", - filename=args.logfile - ) - - console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, - # debug messages aren't printing to console - console.setLevel(loglevel) - console.setFormatter( - logging.Formatter("[%(levelname)s] %(message)s") - ) - logging.getLogger("").addHandler(console) - else: - # log only to console - logging.basicConfig( - level=loglevel, - format="[%(levelname)s] %(message)s" - ) - - # log events are appended to the file if it already exists, - # so note the start of a new download session - logging.info(f"Starting download session using wikiget {wikiget_version}") - # logging.info(f"Log level is set to {loglevel}") - - if args.batch: - # batch download mode - input_file = args.FILE - dl_list = [] - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file, "r") - except IOError as e: - logging.error("File could not be read. 
" - "The following error was encountered:") - logging.error(e) - sys.exit(1) - else: - with fd: - # store file contents in memory in case something - # happens to the file while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) - - # TODO: validate file contents before download process starts - for line_num, url in enumerate(dl_list, start=1): - url = url.strip() - # keep track of batch file line numbers for - # debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - download(url, args) - else: - # single download mode - dl = args.FILE - download(dl, args) -- cgit v1.2.3 From 75a79785d851efa319f4216e0d3471d30a02154a Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 15:45:43 -0700 Subject: Style and format fixes --- src/wikiget/__init__.py | 5 +++-- src/wikiget/dl.py | 30 ++++++++++++++++-------------- src/wikiget/wikiget.py | 29 +++++++++++------------------ 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/src/wikiget/__init__.py b/src/wikiget/__init__.py index 20ea620..5b917cf 100644 --- a/src/wikiget/__init__.py +++ b/src/wikiget/__init__.py @@ -24,8 +24,9 @@ BLOCKSIZE = 65536 CHUNKSIZE = 1024 DEFAULT_SITE = "commons.wikimedia.org" DEFAULT_PATH = "/w/" -USER_AGENT = "wikiget/{} (https://github.com/clpo13/wikiget) mwclient/{}".format( - wikiget_version, mwclient_version +USER_AGENT = ( + f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) " + f"mwclient/{mwclient_version}" ) STD_VERBOSE = 1 VERY_VERBOSE = 2 diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 9850ce8..791db61 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -36,8 +36,7 @@ def download(dl, args): site_name = url.netloc if args.site is not wikiget.DEFAULT_SITE: # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, " - "ignoring site specified with --site") + logging.warning("target is a URL, ignoring site specified with --site") else: 
filename = dl site_name = args.site @@ -74,8 +73,9 @@ def download(dl, args): sys.exit(1) except HTTPError as e: # most likely a 403 forbidden or 404 not found error for api.php - logging.error("Couldn't find the specified wiki's api.php. " - "Check the value of --path.") + logging.error( + "Couldn't find the specified wiki's api.php. Check the value of --path." + ) logging.debug("Full error message:") logging.debug(e) sys.exit(1) @@ -92,8 +92,10 @@ def download(dl, args): except APIError as e: # an API error at this point likely means access is denied, # which could happen with a private wiki - logging.error("Access denied. Try providing credentials with " - "--username and --password.") + logging.error( + "Access denied. Try providing credentials with " + "--username and --password." + ) logging.debug("Full error message:") for i in e.args: logging.debug(i) @@ -106,22 +108,23 @@ def download(dl, args): file_size = file.imageinfo["size"] file_sha1 = file.imageinfo["sha1"] - filename_log = (f"Downloading '{filename}' ({file_size} bytes) " - f"from {site.host}") + filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}" if args.output: filename_log += f" to '{dest}'" logging.info(filename_log) logging.info(f"{file_url}") if os.path.isfile(dest) and not args.force: - logging.warning(f"File '{dest}' already exists, skipping download " - "(use -f to ignore)") + logging.warning( + f"File '{dest}' already exists, skipping download (use -f to ignore)" + ) else: try: fd = open(dest, "wb") except OSError as e: - logging.error("File could not be written. " - "The following error was encountered:") + logging.error( + "File could not be written. 
The following error was encountered:" + ) logging.error(e) sys.exit(1) else: @@ -158,6 +161,5 @@ def download(dl, args): else: # no file information returned - logging.error(f"Target '{filename}' does not appear to be " - "a valid file.") + logging.error(f"Target '{filename}' does not appear to be a valid file.") sys.exit(1) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index b9a227f..bc6de38 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -102,10 +102,7 @@ def main(): action="store_true", ) parser.add_argument( - "-l", - "--logfile", - default="", - help="save log output to LOGFILE" + "-l", "--logfile", default="", help="save log output to LOGFILE" ) args = parser.parse_args() @@ -127,23 +124,18 @@ def main(): logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-7s] %(message)s", - filename=args.logfile + filename=args.logfile, ) console = logging.StreamHandler() # TODO: even when loglevel is set to logging.DEBUG, # debug messages aren't printing to console console.setLevel(loglevel) - console.setFormatter( - logging.Formatter("[%(levelname)s] %(message)s") - ) + console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) logging.getLogger("").addHandler(console) else: # log only to console - logging.basicConfig( - level=loglevel, - format="[%(levelname)s] %(message)s" - ) + logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") # log events are appended to the file if it already exists, # so note the start of a new download session @@ -158,10 +150,11 @@ def main(): logging.info(f"Using batch file '{input_file}'.") try: - fd = open(input_file, "r") + fd = open(input_file) except OSError as e: - logging.error("File could not be read. " - "The following error was encountered:") + logging.error( + "File could not be read. 
The following error was encountered:" + ) logging.error(e) sys.exit(1) else: @@ -173,11 +166,11 @@ def main(): # TODO: validate file contents before download process starts for line_num, url in enumerate(dl_list, start=1): - url = url.strip() + s_url = url.strip() # keep track of batch file line numbers for # debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - download(url, args) + logging.info(f"Downloading '{s_url}' at line {line_num}:") + download(s_url, args) else: # single download mode dl = args.FILE -- cgit v1.2.3 From 80b6cdae0f5f51e472744d4282aedd8813015fd0 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Wed, 27 Sep 2023 15:24:30 -0700 Subject: Update manifest --- MANIFEST.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 7592f0e..a24252d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ -graft wikiget -graft test +graft src +graft tests global-exclude *.py[cod] -- cgit v1.2.3 From 62b815f0fa083b8336f8ac36225fe0ba9838d202 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 2 Oct 2023 10:01:32 -0700 Subject: Add initial manpage --- Makefile | 2 ++ wikiget.1 | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ wikiget.1.md | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 Makefile create mode 100644 wikiget.1 create mode 100644 wikiget.1.md diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..bac4200 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +man: wikiget.1.md + pandoc -f markdown -t man -o wikiget.1 -s wikiget.1.md diff --git a/wikiget.1 b/wikiget.1 new file mode 100644 index 0000000..b4c9255 --- /dev/null +++ b/wikiget.1 @@ -0,0 +1,101 @@ +.\" Automatically generated by Pandoc 3.1.8 +.\" +.TH "WIKIGET" "1" "October 2, 2023" "Version 0.5.1" "Wikiget User Manual" +.SH NAME +wikiget - download files from MediaWiki sites +.SH SYNOPSIS +\f[B]wikiget\f[R] 
[-\f[B]h\f[R]] [-\f[B]V\f[R]] +[-\f[B]q\f[R]|-\f[B]v\f[R]] [-\f[B]f\f[R]] [-\f[B]s\f[R] \f[I]SITE\f[R]] +[-\f[B]p\f[R] \f[I]PATH\f[R]] [--\f[B]username\f[R] \f[I]USERNAME\f[R]] +[--\f[B]password\f[R] \f[I]PASSWORD\f[R]] [-\f[B]o\f[R] \f[I]OUTPUT\f[R] +| -\f[B]a\f[R]] [-\f[B]l\f[R] \f[I]LOGFILE\f[R]] \f[I]FILE\f[R] +.SH DESCRIPTION +Something like \f[B]wget\f[R](1) for downloading a file from MediaWiki +sites (like Wikipedia or Wikimedia Commons) using only the file name or +the URL of its description page. +.SH OPTIONS +.TP +\f[I]FILE\f[R] +The file to be downloaded. +If \f[I]FILE\f[R] is in the form \f[I]File:Example.jpg\f[R] or +\f[I]Image:Example.jpg\f[R], it will be fetched from the default site, +which is \[lq]commons.wikimedia.org\[rq]. +If it\[cq]s the fully-qualified URL of a file description page, like +\f[I]https://en.wikipedia.org/wiki/File:Example.jpg\f[R], the file is +fetched from the site in the URL, in this case +\[lq]en.wikipedia.org\[rq]. +.TP +-\f[B]s\f[R], --\f[B]site\f[R] \f[I]SITE\f[R] +MediaWiki site to download from. +Will not have any effect if the full URL is given in the \f[I]FILE\f[R] +parameter. +.TP +-\f[B]p\f[R], --\f[B]path\f[R] \f[I]PATH\f[R] +Script path for the wiki, where \[lq]index.php\[rq] and +\[lq]api.php\[rq] live. +On Wikimedia sites, it\[cq]s \[lq]/w/\[rq], the default, but other sites +may use \[lq]/\[rq] or something else entirely. +.TP +--\f[B]username\f[R] \f[I]USERNAME\f[R] +Username for private wikis that require a login even for read access. +.TP +--\f[B]password\f[R] \f[I]PASSWORD\f[R] +Password for private wikis that require a login even for read access. +.TP +-\f[B]o\f[R], --\f[B]output\f[R] \f[I]OUTPUT\f[R] +By default, the output filename is the same as the remote filename +(without the File: or Image: prefix), but this can be changed with this +option. +.TP +-\f[B]l\f[R], --\f[B]logfile\f[R] \f[I]LOGFILE\f[R] +Specify a logfile, which will contain detailed information about the +download process. 
+If the logfile already exists, new log information is appended to it. +.TP +-\f[B]f\f[R], --\f[B]force\f[R] +Force overwritng of existing files. +.TP +-\f[B]a\f[R], --\f[B]batch\f[R] +If this flag is set, \f[I]FILE\f[R] will be treated as an input text +file containing multiple files to be downloaded, one filename or URL per +line. +If an error is encountered during download, execution stops immediately +and the offending filename is printed. +.TP +-\f[B]v\f[R], --\f[B]verbose\f[R] +Print additional information, such as the site used and the full URL of +the file. +Additional invocations will increase the level of detail. +.TP +-\f[B]q\f[R], --\f[B]quiet\f[R] +Silence warnings and minimize printed output. +.TP +-\f[B]h\f[R], --\f[B]help\f[R] +Print a brief summary of these options. +.SH EXAMPLES +.IP +.EX +wikiget File:Example.jpg +wikiget --site en.wikipedia.org File:Example.jpg +wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg +.EE +.SH BUG REPORTS + +.SH LICENSE +Copyright (C) 2018-2023 Cody Logan and contributors +.PP +This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your +option) any later version. +.PP +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +See the GNU General Public License for more details. +.PP +You should have received a copy of the GNU General Public License along +with this program. +If not, see . +.SH AUTHORS +Cody Logan . 
diff --git a/wikiget.1.md b/wikiget.1.md new file mode 100644 index 0000000..01dcd5e --- /dev/null +++ b/wikiget.1.md @@ -0,0 +1,108 @@ +% WIKIGET(1) Version 0.5.1 | Wikiget User Manual +% Cody Logan +% October 2, 2023 + +# NAME + +wikiget - download files from MediaWiki sites + +# SYNOPSIS + +**wikiget** +[\-**h**] [\-**V**] [\-**q**|\-**v**] [\-**f**] [\-**s** *SITE*] [\-**p** *PATH*] +[\-\-**username** *USERNAME*] [\-\-**password** *PASSWORD*] +[\-**o** *OUTPUT* | \-**a**] [\-**l** *LOGFILE*] +*FILE* + +# DESCRIPTION + +Something like **wget**(1) for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) +using only the file name or the URL of its description page. + +# OPTIONS + +*FILE* + +: The file to be downloaded. If *FILE* is in the form *File:Example.jpg* or *Image:Example.jpg*, it will be + fetched from the default site, which is "commons.wikimedia.org". If it's the fully-qualified URL of a file + description page, like *https://en.wikipedia.org/wiki/File:Example.jpg*, the file is fetched from the site + in the URL, in this case "en.wikipedia.org". + +\-**s**, \-\-**site** *SITE* + +: MediaWiki site to download from. Will not have any effect if the full URL is given in the *FILE* parameter. + +\-**p**, \-\-**path** *PATH* + +: Script path for the wiki, where "index.php" and "api.php" live. On Wikimedia sites, it's "/w/", the default, + but other sites may use "/" or something else entirely. + +\-\-**username** *USERNAME* + +: Username for private wikis that require a login even for read access. + +\-\-**password** *PASSWORD* + +: Password for private wikis that require a login even for read access. + +\-**o**, \-\-**output** *OUTPUT* + +: By default, the output filename is the same as the remote filename (without the File: or Image: prefix), + but this can be changed with this option. + +\-**l**, \-\-**logfile** *LOGFILE* + +: Specify a logfile, which will contain detailed information about the download process. 
If the logfile already + exists, new log information is appended to it. + +\-**f**, \-\-**force** + +: Force overwritng of existing files. + +\-**a**, \-\-**batch** + +: If this flag is set, *FILE* will be treated as an input text file containing multiple files to be downloaded, + one filename or URL per line. If an error is encountered during download, execution stops immediately and the + offending filename is printed. + +\-**v**, \-\-**verbose** + +: Print additional information, such as the site used and the full URL of the file. Additional invocations will + increase the level of detail. + +\-**q**, \-\-**quiet** + +: Silence warnings and minimize printed output. + +\-**h**, \-\-**help** + +: Print a brief summary of these options. + +# EXAMPLES + +``` +wikiget File:Example.jpg +wikiget --site en.wikipedia.org File:Example.jpg +wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg +``` + +# BUG REPORTS + + + +# LICENSE + +Copyright (C) 2018-2023 Cody Logan and contributors + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>. 
-- cgit v1.2.3 From 32bceeefb667b1966956b49981d256359afe0177 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 2 Oct 2023 10:30:53 -0700 Subject: Update man page with alternate invocations --- Makefile | 8 ++++++-- wikiget.1 | 34 ++++++++++++++++++++++++---------- wikiget.1.md | 19 +++++++++++-------- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index bac4200..6ce62df 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,6 @@ -man: wikiget.1.md - pandoc -f markdown -t man -o wikiget.1 -s wikiget.1.md +.PHONY: man + +man: wikiget.1 + +wikiget.1: wikiget.1.md + pandoc -s -f markdown -t man -o wikiget.1 wikiget.1.md diff --git a/wikiget.1 b/wikiget.1 index b4c9255..c5dc933 100644 --- a/wikiget.1 +++ b/wikiget.1 @@ -4,11 +4,21 @@ .SH NAME wikiget - download files from MediaWiki sites .SH SYNOPSIS -\f[B]wikiget\f[R] [-\f[B]h\f[R]] [-\f[B]V\f[R]] -[-\f[B]q\f[R]|-\f[B]v\f[R]] [-\f[B]f\f[R]] [-\f[B]s\f[R] \f[I]SITE\f[R]] -[-\f[B]p\f[R] \f[I]PATH\f[R]] [--\f[B]username\f[R] \f[I]USERNAME\f[R]] -[--\f[B]password\f[R] \f[I]PASSWORD\f[R]] [-\f[B]o\f[R] \f[I]OUTPUT\f[R] -| -\f[B]a\f[R]] [-\f[B]l\f[R] \f[I]LOGFILE\f[R]] \f[I]FILE\f[R] +.PP +\f[B]wikiget\f[R] [\f[I]options\f[R]] \f[I]FILE\f[R] +.PD 0 +.P +.PD +\f[B]wikiget\f[R] [\f[I]options\f[R]] [-\f[B]a\f[R]|--\f[B]batch\f[R]] +\f[I]BATCHFILE\f[R] +.PD 0 +.P +.PD +\f[B]wikiget\f[R] [-\f[B]V\f[R]|--\f[B]version\f[R]] +.PD 0 +.P +.PD +\f[B]wikiget\f[R] [-\f[B]h\f[R]|--\f[B]help\f[R]] .SH DESCRIPTION Something like \f[B]wget\f[R](1) for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) using only the file name or @@ -25,6 +35,13 @@ If it\[cq]s the fully-qualified URL of a file description page, like fetched from the site in the URL, in this case \[lq]en.wikipedia.org\[rq]. .TP +\f[I]BATCHFILE\f[R] +In batch download mode (activated with -\f[B]a\f[R] or +--\f[B]batch\f[R]), this is a text file containing multiple file names +or URLs to be downloaded, one per line. 
+If an error is encountered during download, execution stops immediately +and the offending filename is printed. +.TP -\f[B]s\f[R], --\f[B]site\f[R] \f[I]SITE\f[R] MediaWiki site to download from. Will not have any effect if the full URL is given in the \f[I]FILE\f[R] @@ -56,11 +73,8 @@ If the logfile already exists, new log information is appended to it. Force overwritng of existing files. .TP -\f[B]a\f[R], --\f[B]batch\f[R] -If this flag is set, \f[I]FILE\f[R] will be treated as an input text -file containing multiple files to be downloaded, one filename or URL per -line. -If an error is encountered during download, execution stops immediately -and the offending filename is printed. +If this flag is set, \f[B]wikiget\f[R] will run in batch download mode +(see \f[I]BATCHFILE\f[R]). .TP -\f[B]v\f[R], --\f[B]verbose\f[R] Print additional information, such as the site used and the full URL of diff --git a/wikiget.1.md b/wikiget.1.md index 01dcd5e..11ab708 100644 --- a/wikiget.1.md +++ b/wikiget.1.md @@ -8,11 +8,10 @@ wikiget - download files from MediaWiki sites # SYNOPSIS -**wikiget** -[\-**h**] [\-**V**] [\-**q**|\-**v**] [\-**f**] [\-**s** *SITE*] [\-**p** *PATH*] -[\-\-**username** *USERNAME*] [\-\-**password** *PASSWORD*] -[\-**o** *OUTPUT* | \-**a**] [\-**l** *LOGFILE*] -*FILE* +| **wikiget** \[*options*] *FILE* +| **wikiget** \[*options*] \[\-**a**|\-\-**batch**] *BATCHFILE* +| **wikiget** \[\-**V**|\-\-**version**] +| **wikiget** \[\-**h**|\-\-**help**] # DESCRIPTION @@ -28,6 +27,12 @@ using only the file name or the URL of its description page. description page, like *https://en.wikipedia.org/wiki/File:Example.jpg*, the file is fetched from the site in the URL, in this case "en.wikipedia.org". +*BATCHFILE* + +: In batch download mode (activated with \-**a** or \-\-**batch**), this is a text file containing multiple + file names or URLs to be downloaded, one per line. 
If an error is encountered during download, execution + stops immediately and the offending filename is printed. + \-**s**, \-\-**site** *SITE* : MediaWiki site to download from. Will not have any effect if the full URL is given in the *FILE* parameter. @@ -61,9 +66,7 @@ using only the file name or the URL of its description page. \-**a**, \-\-**batch** -: If this flag is set, *FILE* will be treated as an input text file containing multiple files to be downloaded, - one filename or URL per line. If an error is encountered during download, execution stops immediately and the - offending filename is printed. +: If this flag is set, **wikiget** will run in batch download mode (see *BATCHFILE*). \-**v**, \-\-**verbose** -- cgit v1.2.3 From a45cca029dc56516615c2915cd613b8ad70d0ae6 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 2 Oct 2023 11:48:15 -0700 Subject: Include man page, readme, and license when installing wheel --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index aab4b3f..ec7b235 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,11 @@ exclude = [ "/.github", ] +[tool.hatch.build.targets.wheel.shared-data] +"wikiget.1" = "share/man/man1/wikiget.1" +"README.md" = "share/doc/wikiget/README.md" +"LICENSE" = "share/doc/wikiget/LICENSE" + [tool.hatch.envs.default] dependencies = [ "coverage[toml]>=6.5", -- cgit v1.2.3 From b6fac1b7c0962e48a8f708efc9f535bb8552c9c6 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:11:10 -0700 Subject: Update README --- README.md | 2 +- pyproject.toml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index fd4d464..67fe0a2 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Requires Python 3.7+. 
Get it with `pip install --user wikiget` or `pipx install ## Usage `wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-p PATH] [--username USERNAME] -[--password PASSWORD] [-o OUTPUT | -a] FILE` +[--password PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] FILE` If `FILE` is in the form `File:Example.jpg` or `Image:Example.jpg`, it will be fetched from the default site, which is "commons.wikimedia.org". If it's the diff --git a/pyproject.toml b/pyproject.toml index ec7b235..070d406 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,8 +152,6 @@ ignore = [ "S105", "S106", "S107", # Ignore complexity "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", - # FIXME: temporarily ignore usage of `print()` - "T201", ] unfixable = [ # Don't touch unused imports -- cgit v1.2.3 From 485df31f095a9b629a1dcc04af13956325856d8c Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:51:58 -0700 Subject: Update README and do some code cleanup --- README.md | 102 +++++++++++++++++++-------------------------- src/wikiget/__init__.py | 2 +- src/wikiget/dl.py | 17 ++++---- src/wikiget/validations.py | 2 +- src/wikiget/wikiget.py | 47 +++++++++------------ wikiget.1 | 8 ++-- wikiget.1.md | 49 ++++++++++------------ 7 files changed, 99 insertions(+), 128 deletions(-) diff --git a/README.md b/README.md index 67fe0a2..876c319 100644 --- a/README.md +++ b/README.md @@ -3,49 +3,39 @@ [![Python package](https://github.com/clpo13/wikiget/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/clpo13/wikiget/actions/workflows/python.yml) [![PyPI version](https://badge.fury.io/py/wikiget.svg)](https://badge.fury.io/py/wikiget) -Something like wget for downloading a file from MediaWiki sites (like Wikipedia -or Wikimedia Commons) using only the file name or the URL of its description -page. +Something like wget for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) using only the +file name or the URL of its description page. Requires Python 3.7+. 
Get it with `pip install --user wikiget` or `pipx install wikiget`. ## Usage -`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-p PATH] [--username USERNAME] -[--password PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] FILE` - -If `FILE` is in the form `File:Example.jpg` or `Image:Example.jpg`, it will be -fetched from the default site, which is "commons.wikimedia.org". If it's the -fully-qualified URL of a file description page, like -`https://en.wikipedia.org/wiki/File:Example.jpg`, the file is fetched from the -specified site, in this case "en.wikipedia.org". Full URLs may contain -characters your shell interprets differently, so you can either escape those -characters with a backslash `\` or surround the entire URL with single `'` or -double `"` quotes. Use of a fully-qualified URL like this may require setting -the `--path` flag (see next paragraph). - -The site can also be specified with the `--site` flag, though this will not have -any effect if the full URL is given. Non-Wikimedia sites should work, but you -may need to specify the wiki's script path with `--path` (where `index.php` and -`api.php` live; on Wikimedia sites it's `/w/`, but other sites may use `/` or -something else entirely). Private wikis (those requiring login even for read -access) are also supported with the use of the `--username` and `--password` -flags. - -More detailed information, such as the site used and full URL of the file, can -be displayed with `-v` or `--verbose`. Use `-vv` to display even more detail, -mainly debugging information or API messages. `-q` can be used to silence warnings. -A logfile can be specified with `-l` or `--logfile`. If this option is present, the -logfile will contain the same information as `-v` along with timestamps. New log -entries will be appended to an existing logfile. - -By default, the program won't overwrite existing files with the same name as the -target, but this can be forced with `-f` or `--force`. 
Additionally, the file -can be downloaded to a different name with `-o`. - -Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, -`FILE` will be treated as an input file containing multiple files to download, -one filename or URL per line. If an error is encountered, execution stops +`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-p PATH] [--username USERNAME] [--password PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] FILE` + +The only required parameter is `FILE`, which is the file you want to download. It can either be the name of the file on +the wiki, including the namespace prefix, or a link to the file description page. If `FILE` is in the form +`File:Example.jpg` or `Image:Example.jpg`, it will be fetched from the default site, which is "commons.wikimedia.org". +If it's the fully-qualified URL of a file description page, like `https://en.wikipedia.org/wiki/File:Example.jpg`, the +file is fetched from the site in the URL, in this case "en.wikipedia.org". Note: full URLs may contain characters your +shell interprets differently, so you can either escape those characters with a backslash `\` or surround the entire URL +with single `'` or double `"` quotes. Use of a fully-qualified URL like this may require setting the `--path` flag (see +next paragraph). + +The site can also be specified with the `--site` flag, though this will not have any effect if the full URL is given. +Non-Wikimedia sites should work, but you may need to specify the wiki's script path with `--path` (where `index.php` and +`api.php` live; on Wikimedia sites it's `/w/`, but other sites may use `/` or something else entirely). Private wikis +(those requiring login even for read access) are also supported with the use of the `--username` and `--password` flags. + +More detailed information, such as the site used and full URL of the file, can be displayed with `-v` or `--verbose`. +Use `-vv` to display even more detail, mainly debugging information or API messages. 
`-q` can be used to silence +warnings. A logfile can be specified with `-l` or `--logfile`. If this option is present, the logfile will contain the +same information as `-v` along with timestamps. New log entries will be appended to an existing logfile. + +By default, the program won't overwrite existing files with the same name as the target, but this can be forced with +`-f` or `--force`. Additionally, the file can be downloaded to a different name with `-o`. + +Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, `FILE` will be treated as an input file +containing multiple files to download, one filename or URL per line. If an error is encountered, execution stops immediately and the offending filename is printed. ### Example usage @@ -70,13 +60,11 @@ Pull requests, bug reports, or feature requests are more than welcome. It's recommended that you use a [virtual environment manager](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) -like venv or [virtualenv](https://virtualenv.pypa.io/en/latest/) to create an -isolated environment in which to install this package's dependencies so as not -to clutter your system Python environment: +like venv or [virtualenv](https://virtualenv.pypa.io/en/latest/) to create an isolated environment in which to install +this package's dependencies so as not to clutter your system Python environment: ```bash -# if you plan on submitting pull requests, fork the repo on GitHub -# and clone that instead +# if you plan on submitting pull requests, fork the repo on GitHub and clone that instead git clone https://github.com/clpo13/wikiget cd wikiget @@ -97,28 +85,22 @@ source venv/bin/activate Then run `pip install -e .` to invoke an ["editable" install](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs) -meaning any changes made to the source will be reflected immediately in the executable -script. 
Unit tests can be run with `pytest` (make sure to run `pip install pytest` -in the virtual environment first.) +meaning any changes made to the source will be reflected immediately in the executable script. Unit tests can be run +with `pytest` (make sure to run `pip install pytest` in the virtual environment first.) -Alternatively, using [Hatch](https://hatch.pypa.io/latest/), simply clone the repository -and run `hatch run test` to create the environment and run pytest. Also try `hatch shell` -or `hatch run wikiget --help`. +Alternatively, using [Hatch](https://hatch.pypa.io/latest/), simply clone the repository and run `hatch run test` to +create the environment and run pytest. Also try `hatch shell` or `hatch run wikiget --help`. ## License Copyright (C) 2018-2023 Cody Logan and contributors -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public +License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later +version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied +warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. -You should have received a copy of the GNU General Public License -along with this program (see [LICENSE](LICENSE)). If not, see -. 
+You should have received a copy of the GNU General Public License along with this program (see [LICENSE](LICENSE)). +If not, see <https://www.gnu.org/licenses/>. diff --git a/src/wikiget/__init__.py b/src/wikiget/__init__.py index 5b917cf..3946868 100644 --- a/src/wikiget/__init__.py +++ b/src/wikiget/__init__.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 791db61..d32736f 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify @@ -65,8 +65,8 @@ def download(dl, args): if args.username and args.password: site.login(args.username, args.password) except ConnectionError as e: - # usually this means there is no such site, or there's no network - # connection, though it could be a certificate problem + # usually this means there is no such site, or there's no network connection, + # though it could be a certificate problem logging.error("Couldn't connect to specified site.") logging.debug("Full error message:") logging.debug(e) @@ -80,8 +80,8 @@ def download(dl, args): logging.debug(e) sys.exit(1) except (InvalidResponse, LoginError) as e: - # InvalidResponse: site exists, but we couldn't communicate with the - # API endpoint for some reason other than an HTTP error. + # InvalidResponse: site exists, but we couldn't communicate with the API + # endpoint for some reason other than an HTTP error. 
# LoginError: missing or invalid credentials logging.error(e) sys.exit(1) @@ -90,8 +90,8 @@ def download(dl, args): try: file = site.images[filename] except APIError as e: - # an API error at this point likely means access is denied, - # which could happen with a private wiki + # an API error at this point likely means access is denied, which could happen + # with a private wiki logging.error( "Access denied. Try providing credentials with " "--username and --password." @@ -102,8 +102,7 @@ def download(dl, args): sys.exit(1) if file.imageinfo != {}: - # file exists either locally or at a common repository, - # like Wikimedia Commons + # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] file_size = file.imageinfo["size"] file_sha1 = file.imageinfo["sha1"] diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index dc70df4..8ebd996 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018, 2019, 2020 Cody Logan +# Copyright (C) 2018-2020 Cody Logan # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index bc6de38..934107e 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify @@ -25,32 +25,27 @@ from wikiget.dl import download def main(): """ - Main entry point for console script. Automatically compiled by setuptools - when installed with `pip install` or `python setup.py install`. + Main entry point for console script. 
Automatically compiled by setuptools when + installed with `pip install` or `python setup.py install`. """ parser = argparse.ArgumentParser( description=""" - A tool for downloading files from - MediaWiki sites using the file name or + A tool for downloading files from MediaWiki sites using the file name or description page URL """, epilog=""" - Copyright (C) 2018-2023 Cody Logan - and contributors. - License GPLv3+: GNU GPL version 3 or later - . - This is free software; you are free to - change and redistribute it under certain - conditions. There is NO WARRANTY, to the - extent permitted by law. + Copyright (C) 2018-2023 Cody Logan and contributors. License GPLv3+: GNU GPL + version 3 or later . This is free + software; you are free to change and redistribute it under certain conditions. + There is NO WARRANTY, to the extent permitted by law. """, ) parser.add_argument( "FILE", help=""" - name of the file to download with the File: - prefix, or the URL of its file description page + name of the file to download with the File: prefix, or the URL of its file + description page """, ) parser.add_argument( @@ -96,9 +91,8 @@ def main(): output_options.add_argument( "-a", "--batch", - help="treat FILE as a textfile containing " - "multiple files to download, one URL or " - "filename per line", + help="treat FILE as a textfile containing multiple files to download, one URL " + "or filename per line", action="store_true", ) parser.add_argument( @@ -117,7 +111,7 @@ def main(): loglevel = logging.ERROR # configure logging: - # console log level is set via -v, -vv, and -q options + # console log level is set via -v, -vv, and -q options; # file log level is always info (TODO: add debug option) if args.logfile: # log to console and file @@ -128,8 +122,8 @@ def main(): ) console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, - # debug messages aren't printing to console + # TODO: even when loglevel is set to logging.DEBUG, debug messages 
aren't + # printing to console console.setLevel(loglevel) console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) logging.getLogger("").addHandler(console) @@ -137,8 +131,8 @@ def main(): # log only to console logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") - # log events are appended to the file if it already exists, - # so note the start of a new download session + # log events are appended to the file if it already exists, so note the start of a + # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") # logging.info(f"Log level is set to {loglevel}") @@ -159,16 +153,15 @@ def main(): sys.exit(1) else: with fd: - # store file contents in memory in case something - # happens to the file while we're downloading + # store file contents in memory in case something happens to the file + # while we're downloading for _, line in enumerate(fd): dl_list.append(line) # TODO: validate file contents before download process starts for line_num, url in enumerate(dl_list, start=1): s_url = url.strip() - # keep track of batch file line numbers for - # debugging/logging purposes + # keep track of batch file line numbers for debugging/logging purposes logging.info(f"Downloading '{s_url}' at line {line_num}:") download(s_url, args) else: diff --git a/wikiget.1 b/wikiget.1 index c5dc933..03afc8b 100644 --- a/wikiget.1 +++ b/wikiget.1 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 3.1.8 .\" -.TH "WIKIGET" "1" "October 2, 2023" "Version 0.5.1" "Wikiget User Manual" +.TH "WIKIGET" "1" "October 3, 2023" "Version 0.5.1" "Wikiget User Manual" .SH NAME wikiget - download files from MediaWiki sites .SH SYNOPSIS @@ -70,7 +70,7 @@ download process. If the logfile already exists, new log information is appended to it. .TP -\f[B]f\f[R], --\f[B]force\f[R] -Force overwritng of existing files. +Force existing files to be overwritten. 
.TP -\f[B]a\f[R], --\f[B]batch\f[R] If this flag is set, \f[B]wikiget\f[R] will run in batch download mode @@ -94,7 +94,7 @@ wikiget --site en.wikipedia.org File:Example.jpg wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg .EE .SH BUG REPORTS - +https://github.com/clpo13/wikiget/issues .SH LICENSE Copyright (C) 2018-2023 Cody Logan and contributors .PP @@ -110,6 +110,6 @@ See the GNU General Public License for more details. .PP You should have received a copy of the GNU General Public License along with this program. -If not, see . +If not, see https://www.gnu.org/licenses/. .SH AUTHORS Cody Logan . diff --git a/wikiget.1.md b/wikiget.1.md index 11ab708..66227dc 100644 --- a/wikiget.1.md +++ b/wikiget.1.md @@ -1,6 +1,6 @@ % WIKIGET(1) Version 0.5.1 | Wikiget User Manual % Cody Logan -% October 2, 2023 +% October 3, 2023 # NAME @@ -15,23 +15,23 @@ wikiget - download files from MediaWiki sites # DESCRIPTION -Something like **wget**(1) for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) -using only the file name or the URL of its description page. +Something like **wget**(1) for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) using only +the file name or the URL of its description page. # OPTIONS *FILE* -: The file to be downloaded. If *FILE* is in the form *File:Example.jpg* or *Image:Example.jpg*, it will be - fetched from the default site, which is "commons.wikimedia.org". If it's the fully-qualified URL of a file - description page, like *https://en.wikipedia.org/wiki/File:Example.jpg*, the file is fetched from the site - in the URL, in this case "en.wikipedia.org". +: The file to be downloaded. If *FILE* is in the form *File:Example.jpg* or *Image:Example.jpg*, it will be fetched + from the default site, which is "commons.wikimedia.org". 
If it's the fully-qualified URL of a file description page, + like *https://en.wikipedia.org/wiki/File:Example.jpg*, the file is fetched from the site in the URL, in this case + "en.wikipedia.org". *BATCHFILE* -: In batch download mode (activated with \-**a** or \-\-**batch**), this is a text file containing multiple - file names or URLs to be downloaded, one per line. If an error is encountered during download, execution - stops immediately and the offending filename is printed. +: In batch download mode (activated with \-**a** or \-\-**batch**), this is a text file containing multiple file names + or URLs to be downloaded, one per line. If an error is encountered during download, execution stops immediately and + the offending filename is printed. \-**s**, \-\-**site** *SITE* @@ -39,8 +39,8 @@ using only the file name or the URL of its description page. \-**p**, \-\-**path** *PATH* -: Script path for the wiki, where "index.php" and "api.php" live. On Wikimedia sites, it's "/w/", the default, - but other sites may use "/" or something else entirely. +: Script path for the wiki, where "index.php" and "api.php" live. On Wikimedia sites, it's "/w/", the default, but + other sites may use "/" or something else entirely. \-\-**username** *USERNAME* @@ -52,8 +52,8 @@ using only the file name or the URL of its description page. \-**o**, \-\-**output** *OUTPUT* -: By default, the output filename is the same as the remote filename (without the File: or Image: prefix), - but this can be changed with this option. +: By default, the output filename is the same as the remote filename (without the File: or Image: prefix), but this + can be changed with this option. \-**l**, \-\-**logfile** *LOGFILE* @@ -62,7 +62,7 @@ using only the file name or the URL of its description page. \-**f**, \-\-**force** -: Force overwritng of existing files. +: Force existing files to be overwritten. 
\-**a**, \-\-**batch** @@ -91,21 +91,18 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg # BUG REPORTS - +https://github.com/clpo13/wikiget/issues # LICENSE Copyright (C) 2018-2023 Cody Logan and contributors -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public +License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later +version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied +warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. -You should have received a copy of the GNU General Public License -along with this program. If not, see . +You should have received a copy of the GNU General Public License along with this program. If not, see +https://www.gnu.org/licenses/. -- cgit v1.2.3 From e18222daecca1656390652cbd1c7f6985080241a Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:58:37 -0700 Subject: Add short user and pass options Swapped path short option from -p to -P and added -u for username and -p for password --- README.md | 2 +- src/wikiget/wikiget.py | 12 +++++++++--- wikiget.1 | 6 +++--- wikiget.1.md | 6 +++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 876c319..9bf9250 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Requires Python 3.7+. 
Get it with `pip install --user wikiget` or `pipx install ## Usage -`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-p PATH] [--username USERNAME] [--password PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] FILE` +`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-P PATH] [-u USERNAME] [-p PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] FILE` The only required parameter is `FILE`, which is the file you want to download. It can either be the name of the file on the wiki, including the namespace prefix, or a link to the file description page. If `FILE` is in the form diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 934107e..f482280 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -75,16 +75,22 @@ def main(): help="MediaWiki site to download from (default: %(default)s)", ) parser.add_argument( - "-p", + "-P", "--path", default=wikiget.DEFAULT_PATH, help="MediaWiki site path, where api.php is located (default: %(default)s)", ) parser.add_argument( - "--username", default="", help="MediaWiki site username, for private wikis" + "-u", + "--username", + default="", + help="MediaWiki site username, for private wikis" ) parser.add_argument( - "--password", default="", help="MediaWiki site password, for private wikis" + "-p", + "--password", + default="", + help="MediaWiki site password, for private wikis" ) output_options = parser.add_mutually_exclusive_group() output_options.add_argument("-o", "--output", help="write download to OUTPUT") diff --git a/wikiget.1 b/wikiget.1 index 03afc8b..060fd54 100644 --- a/wikiget.1 +++ b/wikiget.1 @@ -47,16 +47,16 @@ MediaWiki site to download from. Will not have any effect if the full URL is given in the \f[I]FILE\f[R] parameter. .TP --\f[B]p\f[R], --\f[B]path\f[R] \f[I]PATH\f[R] +-\f[B]P\f[R], --\f[B]path\f[R] \f[I]PATH\f[R] Script path for the wiki, where \[lq]index.php\[rq] and \[lq]api.php\[rq] live. On Wikimedia sites, it\[cq]s \[lq]/w/\[rq], the default, but other sites may use \[lq]/\[rq] or something else entirely. 
.TP ---\f[B]username\f[R] \f[I]USERNAME\f[R] +-\f[B]u\f[R], --\f[B]username\f[R] \f[I]USERNAME\f[R] Username for private wikis that require a login even for read access. .TP ---\f[B]password\f[R] \f[I]PASSWORD\f[R] +-\f[B]p\f[R], --\f[B]password\f[R] \f[I]PASSWORD\f[R] Password for private wikis that require a login even for read access. .TP -\f[B]o\f[R], --\f[B]output\f[R] \f[I]OUTPUT\f[R] diff --git a/wikiget.1.md b/wikiget.1.md index 66227dc..d05aaf1 100644 --- a/wikiget.1.md +++ b/wikiget.1.md @@ -37,16 +37,16 @@ the file name or the URL of its description page. : MediaWiki site to download from. Will not have any effect if the full URL is given in the *FILE* parameter. -\-**p**, \-\-**path** *PATH* +\-**P**, \-\-**path** *PATH* : Script path for the wiki, where "index.php" and "api.php" live. On Wikimedia sites, it's "/w/", the default, but other sites may use "/" or something else entirely. -\-\-**username** *USERNAME* +\-**u**, \-\-**username** *USERNAME* : Username for private wikis that require a login even for read access. -\-\-**password** *PASSWORD* +\-**p**, \-\-**password** *PASSWORD* : Password for private wikis that require a login even for read access. 
-- cgit v1.2.3 From 865088207b39427b6b932de4f312d82bd5e05a53 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:26:09 -0700 Subject: Refactor for better code organization --- src/wikiget/dl.py | 23 ++++++++++++++++++++++- src/wikiget/file.py | 27 +++++++++++++++++++++++++++ src/wikiget/wikiget.py | 38 +++++++++++++++++++++++--------------- tests/test_file_class.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 16 deletions(-) create mode 100644 src/wikiget/file.py create mode 100644 tests/test_file_class.py diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index d32736f..2b2befa 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -25,10 +25,11 @@ from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget +from wikiget.file import File from wikiget.validations import valid_file, verify_hash -def download(dl, args): +def get_dest(dl, args): url = urlparse(dl) if url.netloc: @@ -56,6 +57,10 @@ def download(dl, args): dest = args.output or filename + return filename, dest, site_name + + +def query_api(filename, site_name, args): logging.debug(f"User agent: {wikiget.USER_AGENT}") # connect to site and identify ourselves @@ -101,6 +106,22 @@ def download(dl, args): logging.debug(i) sys.exit(1) + return file, site + + +def prep_download(dl, args): + filename, dest, site_name = get_dest(dl, args) + file = File(filename, dest) + file.object, file.site = query_api(file.name, site_name, args) + return file + + +def download(f, args): + file = f.object + filename = f.name + site = f.site + dest = f.dest + if file.imageinfo != {}: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] diff --git a/src/wikiget/file.py b/src/wikiget/file.py new file mode 100644 index 0000000..60a71e0 --- /dev/null +++ b/src/wikiget/file.py @@ -0,0 +1,27 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# 
SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + + +class File: + def __init__(self, name, dest=None): + self.object = None + self.site = None + self.name = name + if dest is None: + self.dest = name + else: + self.dest = dest diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index f482280..80d5057 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -20,15 +20,10 @@ import logging import sys import wikiget -from wikiget.dl import download +from wikiget.dl import download, prep_download -def main(): - """ - Main entry point for console script. Automatically compiled by setuptools when - installed with `pip install` or `python setup.py install`. 
- """ - +def construct_parser(): parser = argparse.ArgumentParser( description=""" A tool for downloading files from MediaWiki sites using the file name or @@ -84,13 +79,13 @@ def main(): "-u", "--username", default="", - help="MediaWiki site username, for private wikis" + help="MediaWiki site username, for private wikis", ) parser.add_argument( "-p", "--password", default="", - help="MediaWiki site password, for private wikis" + help="MediaWiki site password, for private wikis", ) output_options = parser.add_mutually_exclusive_group() output_options.add_argument("-o", "--output", help="write download to OUTPUT") @@ -104,7 +99,19 @@ def main(): parser.add_argument( "-l", "--logfile", default="", help="save log output to LOGFILE" ) + parser.add_argument( + "-j", + "--threads", + default=1, + help="Number of parallel downloads to attempt in batch mode", + type=int, + ) + return parser + + +def main(): + parser = construct_parser() args = parser.parse_args() loglevel = logging.WARNING @@ -165,12 +172,13 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, url in enumerate(dl_list, start=1): - s_url = url.strip() + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{s_url}' at line {line_num}:") - download(s_url, args) + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + download(file, args) else: # single download mode - dl = args.FILE - download(dl, args) + file = prep_download(args.FILE, args) + download(file, args) diff --git a/tests/test_file_class.py b/tests/test_file_class.py new file mode 100644 index 0000000..7ad0b87 --- /dev/null +++ b/tests/test_file_class.py @@ -0,0 +1,31 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free 
software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +from wikiget.file import File + + +def test_file_with_name_only(): + file = File("foobar.jpg") + assert file.name == "foobar.jpg" + assert file.dest == file.name + + +def test_file_with_name_and_dest(): + file = File("foobar.jpg", "bazqux.jpg") + assert file.name == "foobar.jpg" + assert file.dest == "bazqux.jpg" + assert file.dest != file.name -- cgit v1.2.3 From 93e879e30ec2776c5d347e72be32f3ef30bd1410 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:28:23 -0700 Subject: Add parallel download option in batch mode Number of download threads can be set with new -j option. Unfortunately, it's not that much faster than downloading in serial, since the API calls made before the downloads actually start are not (and ideally should not be) parallelized. Still, for large batches, it saves a bit of time. Known issue: due to the download threads writing to the log asynchronously, the messages get jumbled up. This will be fixed eventually. 
--- src/wikiget/wikiget.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 80d5057..c16d3f6 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,6 +18,7 @@ import argparse import logging import sys +from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download @@ -172,12 +173,18 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - download(file, args) + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3 From 43c1fc258499f54977a1b7b594b295c2dae03114 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 16:07:54 -0700 Subject: Reduce repeated code in log configuration --- .gitignore | 6 ++++++ src/wikiget/wikiget.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 3d6c39b..308c5e5 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,9 @@ venv.bak/ .vs/ .vscode/ .idea/ + +# downloaded images, batch test files +*.jpg +*.jpeg +*.png +batch.txt \ No newline at end of file diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index c16d3f6..51c870a 100644 --- 
a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -127,11 +127,13 @@ def main(): # configure logging: # console log level is set via -v, -vv, and -q options; # file log level is always info (TODO: add debug option) + base_format = "%(threadName)s - %(message)s" + log_format = "[%(levelname)s] " + base_format if args.logfile: # log to console and file logging.basicConfig( level=logging.INFO, - format="%(asctime)s [%(levelname)-7s] %(message)s", + format="%(asctime)s [%(levelname)-7s] " + base_format, filename=args.logfile, ) @@ -139,11 +141,11 @@ def main(): # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't # printing to console console.setLevel(loglevel) - console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) + console.setFormatter(logging.Formatter(log_format)) logging.getLogger("").addHandler(console) else: # log only to console - logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") + logging.basicConfig(level=loglevel, format=log_format) # log events are appended to the file if it already exists, so note the start of a # new download session @@ -173,7 +175,10 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - with ThreadPoolExecutor(max_workers=args.threads) as executor: + with ThreadPoolExecutor( + max_workers=args.threads, + thread_name_prefix="download", + ) as executor: futures = [] for line_num, line in enumerate(dl_list, start=1): url = line.strip() -- cgit v1.2.3 From 0f45fe2526d7be48d77ef60b7505a196d533b4f4 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Thu, 5 Oct 2023 15:27:46 -0700 Subject: Add new options to readme and man page --- README.md | 8 +++++--- wikiget.1 | 9 ++++++++- wikiget.1.md | 10 +++++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9bf9250..4faadf4 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Requires Python 3.7+. 
Get it with `pip install --user wikiget` or `pipx install ## Usage -`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-P PATH] [-u USERNAME] [-p PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] FILE` +`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-P PATH] [-u USERNAME] [-p PASSWORD] [-o OUTPUT | -a] [-l LOGFILE] [-j THREADS] FILE` The only required parameter is `FILE`, which is the file you want to download. It can either be the name of the file on the wiki, including the namespace prefix, or a link to the file description page. If `FILE` is in the form @@ -36,7 +36,10 @@ By default, the program won't overwrite existing files with the same name as the Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, `FILE` will be treated as an input file containing multiple files to download, one filename or URL per line. If an error is encountered, execution stops -immediately and the offending filename is printed. +immediately and the offending filename is printed. For large batches, the process can be sped up by downloading files +in parallel. The number of parallel downloads can be set with `-j`. For instance, with `-a -j4`, wikiget will download +four files at once. Without `-j` or with `-j` by itself without a number, wikiget will download the files one at a +time. 
### Example usage @@ -48,7 +51,6 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg ## Future plans -- download multiple files at once in batch mode - continue batch download even if input is malformed or file doesn't exist (possibly by raising exceptions in `download()`) - batch download by (Commons) category or user uploads diff --git a/wikiget.1 b/wikiget.1 index 060fd54..03a0c41 100644 --- a/wikiget.1 +++ b/wikiget.1 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 3.1.8 .\" -.TH "WIKIGET" "1" "October 3, 2023" "Version 0.5.1" "Wikiget User Manual" +.TH "WIKIGET" "1" "October 5, 2023" "Version 0.5.1" "Wikiget User Manual" .SH NAME wikiget - download files from MediaWiki sites .SH SYNOPSIS @@ -76,6 +76,10 @@ Force existing files to be overwritten. If this flag is set, \f[B]wikiget\f[R] will run in batch download mode (see \f[I]BATCHFILE\f[R]). .TP +-\f[B]j\f[R], --\f[B]threads\f[R] +Number of parallel downloads to attempt in batch mode. +This option has no effect if -\f[B]a\f[R] is not also set. +.TP -\f[B]v\f[R], --\f[B]verbose\f[R] Print additional information, such as the site used and the full URL of the file. @@ -84,6 +88,9 @@ Additional invocations will increase the level of detail. -\f[B]q\f[R], --\f[B]quiet\f[R] Silence warnings and minimize printed output. .TP +-\f[B]V\f[R], --\f[B]version\f[R] +Print the version number of the program. +.TP -\f[B]h\f[R], --\f[B]help\f[R] Print a brief summary of these options. .SH EXAMPLES diff --git a/wikiget.1.md b/wikiget.1.md index d05aaf1..45184f4 100644 --- a/wikiget.1.md +++ b/wikiget.1.md @@ -1,6 +1,6 @@ % WIKIGET(1) Version 0.5.1 | Wikiget User Manual % Cody Logan -% October 3, 2023 +% October 5, 2023 # NAME @@ -68,6 +68,10 @@ the file name or the URL of its description page. : If this flag is set, **wikiget** will run in batch download mode (see *BATCHFILE*). +\-**j**, \-\-**threads** + +: Number of parallel downloads to attempt in batch mode. 
This option has no effect if \-**a** is not also set. + \-**v**, \-\-**verbose** : Print additional information, such as the site used and the full URL of the file. Additional invocations will @@ -77,6 +81,10 @@ the file name or the URL of its description page. : Silence warnings and minimize printed output. +\-**V**, \-\-**version** + +: Print the version number of the program. + \-**h**, \-\-**help** : Print a brief summary of these options. -- cgit v1.2.3 From 43489400d94ba68477ec2040ec8e1625192780ef Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 6 Oct 2023 14:03:55 -0700 Subject: Update actions for dev branch and Python 3.12 --- .github/workflows/python.yml | 8 ++++---- pyproject.toml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index bef2670..499c01a 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -5,9 +5,9 @@ name: Python package on: push: - branches: [ "master" ] + branches: [ "dev" ] pull_request: - branches: [ "master" ] + branches: [ "master", "dev" ] jobs: build: @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 @@ -32,7 +32,7 @@ jobs: - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names - flake8 . --count --show-source --statistics + flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics - name: Test with pytest run: | pytest diff --git a/pyproject.toml b/pyproject.toml index 070d406..0b13074 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies = [ "mwclient>=0.10.0", -- cgit v1.2.3 From 1fb093e26709573e03fa0feb4afa53a22568d92e Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 6 Oct 2023 14:07:30 -0700 Subject: Remove unneeded dependency in tests --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 499c01a..321ed44 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - python -m pip install flake8 pytest pytest-cov + python -m pip install flake8 pytest python -m pip install . - name: Lint with flake8 run: | -- cgit v1.2.3 From b1cc0b5e6f8f761b806885f646602ac24d74243e Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 9 Oct 2023 10:08:40 -0700 Subject: Add Codecov report to actions file --- .github/workflows/python.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 321ed44..d02c1b5 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - python -m pip install flake8 pytest + python -m pip install flake8 pytest "coverage[toml]" python -m pip install . - name: Lint with flake8 run: | @@ -35,4 +35,8 @@ jobs: flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics - name: Test with pytest run: | - pytest + coverage run -m pytest + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} -- cgit v1.2.3 From 6e5febfbebea23ee9c7ba5593dbb393034c2e955 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 9 Oct 2023 10:17:44 -0700 Subject: Add coverage conversion step --- .github/workflows/python.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index d02c1b5..7f44810 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -36,6 +36,10 @@ jobs: - name: Test with pytest run: | coverage run -m pytest + - name: Convert coverage file to XML + run: | + coverage combine + coverage xml - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 env: -- cgit v1.2.3 From 8e6bb3dcc1c929327425750057c411192c1e69d7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 9 Oct 2023 11:16:05 -0700 Subject: Add some tests for download functions --- pyproject.toml | 2 ++ tests/test_dl.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tests/test_dl.py diff --git a/pyproject.toml b/pyproject.toml index 0b13074..3364ec0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,9 +79,11 @@ cov-report = [ "- coverage combine", "coverage report", ] +htmlcov = "coverage html" cov = [ "test-cov", "cov-report", + "htmlcov", ] [[tool.hatch.envs.all.matrix]] diff --git a/tests/test_dl.py b/tests/test_dl.py new file mode 100644 index 0000000..396041d --- /dev/null +++ b/tests/test_dl.py @@ -0,0 +1,78 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public 
License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +# import logging +import pytest + +# from wikiget import USER_AGENT +from wikiget.wikiget import construct_parser +# from wikiget.dl import get_dest, query_api, prep_download +from wikiget.dl import get_dest + + +class TestGetDest: + parser = construct_parser() + + def test_get_dest_with_filename(self): + args = self.parser.parse_args(["File:Example.jpg"]) + filename, dest, site_name = get_dest(args.FILE, args) + assert filename == "Example.jpg" + assert dest == "Example.jpg" + assert site_name == "commons.wikimedia.org" + + def test_get_dest_with_url(self): + args = self.parser.parse_args([ + "https://en.wikipedia.org/wiki/File:Example.jpg", + ]) + filename, dest, site_name = get_dest(args.FILE, args) + assert filename == "Example.jpg" + assert dest == "Example.jpg" + assert site_name == "en.wikipedia.org" + + def test_get_dest_with_bad_filename(self): + args = self.parser.parse_args(["Example.jpg"]) + with pytest.raises(SystemExit): + filename, dest, site_name = get_dest(args.FILE, args) + + def test_get_dest_with_different_site(self, caplog): + args = self.parser.parse_args([ + "https://commons.wikimedia.org/wiki/File:Example.jpg", + "--site", + "commons.wikimedia.org", + ]) + filename, dest, site_name = get_dest(args.FILE, args) + assert "target is a URL, ignoring site specified with --site" in caplog.text + + +# TODO: don't hit the actual API when doing tests +# class TestQueryApi: +# parser = construct_parser() +# +# def test_query_api(self, caplog): +# 
caplog.set_level(logging.DEBUG) +# args = self.parser.parse_args(["File:Example.jpg"]) +# file, site = query_api("Example.jpg", "commons.wikimedia.org", args) +# assert USER_AGENT in caplog.text +# +# +# class TestPrepDownload(): +# parser = construct_parser() +# +# def test_prep_download(self): +# args = self.parser.parse_args(["File:Example.jpg"]) +# file = prep_download(args.FILE, args) +# assert file is not None -- cgit v1.2.3 From 206f0fe0b97610fc371ad0acdd5146ac12eacfe7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 9 Oct 2023 13:50:30 -0700 Subject: Style cleanup --- pyproject.toml | 2 +- src/wikiget/wikiget.py | 2 +- tests/test_dl.py | 27 ++++++++++++++------------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3364ec0..8dc16ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -188,5 +188,5 @@ exclude_lines = [ ] [[tool.mypy.overrides]] -module = ["mwclient"] +module = ["mwclient", "pytest"] ignore_missing_imports = true diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 51c870a..8c067e0 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -104,7 +104,7 @@ def construct_parser(): "-j", "--threads", default=1, - help="Number of parallel downloads to attempt in batch mode", + help="number of parallel downloads to attempt in batch mode", type=int, ) diff --git a/tests/test_dl.py b/tests/test_dl.py index 396041d..abf8763 100644 --- a/tests/test_dl.py +++ b/tests/test_dl.py @@ -15,13 +15,10 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . 
-# import logging import pytest -# from wikiget import USER_AGENT -from wikiget.wikiget import construct_parser -# from wikiget.dl import get_dest, query_api, prep_download from wikiget.dl import get_dest +from wikiget.wikiget import construct_parser class TestGetDest: @@ -35,9 +32,11 @@ class TestGetDest: assert site_name == "commons.wikimedia.org" def test_get_dest_with_url(self): - args = self.parser.parse_args([ - "https://en.wikipedia.org/wiki/File:Example.jpg", - ]) + args = self.parser.parse_args( + [ + "https://en.wikipedia.org/wiki/File:Example.jpg", + ] + ) filename, dest, site_name = get_dest(args.FILE, args) assert filename == "Example.jpg" assert dest == "Example.jpg" @@ -48,12 +47,14 @@ class TestGetDest: with pytest.raises(SystemExit): filename, dest, site_name = get_dest(args.FILE, args) - def test_get_dest_with_different_site(self, caplog): - args = self.parser.parse_args([ - "https://commons.wikimedia.org/wiki/File:Example.jpg", - "--site", - "commons.wikimedia.org", - ]) + def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture): + args = self.parser.parse_args( + [ + "https://commons.wikimedia.org/wiki/File:Example.jpg", + "--site", + "commons.wikimedia.org", + ] + ) filename, dest, site_name = get_dest(args.FILE, args) assert "target is a URL, ignoring site specified with --site" in caplog.text -- cgit v1.2.3 From 878d4174248711906cfaf4e7bf10c0bf83af85f3 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 10 Oct 2023 11:04:14 -0700 Subject: Move man-related files to docs directory --- MANIFEST.in | 1 + Makefile | 6 --- docs/Makefile | 6 +++ docs/wikiget.1 | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ docs/wikiget.1.md | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- wikiget.1 | 122 ------------------------------------------------------ wikiget.1.md | 116 --------------------------------------------------- 8 files changed, 246 insertions(+), 245 deletions(-) delete mode 100644 
Makefile create mode 100644 docs/Makefile create mode 100644 docs/wikiget.1 create mode 100644 docs/wikiget.1.md delete mode 100644 wikiget.1 delete mode 100644 wikiget.1.md diff --git a/MANIFEST.in b/MANIFEST.in index a24252d..429c6c8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ graft src graft tests +graft docs global-exclude *.py[cod] diff --git a/Makefile b/Makefile deleted file mode 100644 index 6ce62df..0000000 --- a/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -.PHONY: man - -man: wikiget.1 - -wikiget.1: wikiget.1.md - pandoc -s -f markdown -t man -o wikiget.1 wikiget.1.md diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..6ce62df --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,6 @@ +.PHONY: man + +man: wikiget.1 + +wikiget.1: wikiget.1.md + pandoc -s -f markdown -t man -o wikiget.1 wikiget.1.md diff --git a/docs/wikiget.1 b/docs/wikiget.1 new file mode 100644 index 0000000..03a0c41 --- /dev/null +++ b/docs/wikiget.1 @@ -0,0 +1,122 @@ +.\" Automatically generated by Pandoc 3.1.8 +.\" +.TH "WIKIGET" "1" "October 5, 2023" "Version 0.5.1" "Wikiget User Manual" +.SH NAME +wikiget - download files from MediaWiki sites +.SH SYNOPSIS +.PP +\f[B]wikiget\f[R] [\f[I]options\f[R]] \f[I]FILE\f[R] +.PD 0 +.P +.PD +\f[B]wikiget\f[R] [\f[I]options\f[R]] [-\f[B]a\f[R]|--\f[B]batch\f[R]] +\f[I]BATCHFILE\f[R] +.PD 0 +.P +.PD +\f[B]wikiget\f[R] [-\f[B]V\f[R]|--\f[B]version\f[R]] +.PD 0 +.P +.PD +\f[B]wikiget\f[R] [-\f[B]h\f[R]|--\f[B]help\f[R]] +.SH DESCRIPTION +Something like \f[B]wget\f[R](1) for downloading a file from MediaWiki +sites (like Wikipedia or Wikimedia Commons) using only the file name or +the URL of its description page. +.SH OPTIONS +.TP +\f[I]FILE\f[R] +The file to be downloaded. +If \f[I]FILE\f[R] is in the form \f[I]File:Example.jpg\f[R] or +\f[I]Image:Example.jpg\f[R], it will be fetched from the default site, +which is \[lq]commons.wikimedia.org\[rq]. 
+If it\[cq]s the fully-qualified URL of a file description page, like +\f[I]https://en.wikipedia.org/wiki/File:Example.jpg\f[R], the file is +fetched from the site in the URL, in this case +\[lq]en.wikipedia.org\[rq]. +.TP +\f[I]BATCHFILE\f[R] +In batch download mode (activated with -\f[B]a\f[R] or +--\f[B]batch\f[R]), this is a text file containing multiple file names +or URLs to be downloaded, one per line. +If an error is encountered during download, execution stops immediately +and the offending filename is printed. +.TP +-\f[B]s\f[R], --\f[B]site\f[R] \f[I]SITE\f[R] +MediaWiki site to download from. +Will not have any effect if the full URL is given in the \f[I]FILE\f[R] +parameter. +.TP +-\f[B]P\f[R], --\f[B]path\f[R] \f[I]PATH\f[R] +Script path for the wiki, where \[lq]index.php\[rq] and +\[lq]api.php\[rq] live. +On Wikimedia sites, it\[cq]s \[lq]/w/\[rq], the default, but other sites +may use \[lq]/\[rq] or something else entirely. +.TP +-\f[B]u\f[R], --\f[B]username\f[R] \f[I]USERNAME\f[R] +Username for private wikis that require a login even for read access. +.TP +-\f[B]p\f[R], --\f[B]password\f[R] \f[I]PASSWORD\f[R] +Password for private wikis that require a login even for read access. +.TP +-\f[B]o\f[R], --\f[B]output\f[R] \f[I]OUTPUT\f[R] +By default, the output filename is the same as the remote filename +(without the File: or Image: prefix), but this can be changed with this +option. +.TP +-\f[B]l\f[R], --\f[B]logfile\f[R] \f[I]LOGFILE\f[R] +Specify a logfile, which will contain detailed information about the +download process. +If the logfile already exists, new log information is appended to it. +.TP +-\f[B]f\f[R], --\f[B]force\f[R] +Force existing files to be overwritten. +.TP +-\f[B]a\f[R], --\f[B]batch\f[R] +If this flag is set, \f[B]wikiget\f[R] will run in batch download mode +(see \f[I]BATCHFILE\f[R]). +.TP +-\f[B]j\f[R], --\f[B]threads\f[R] +Number of parallel downloads to attempt in batch mode. 
+This option has no effect if -\f[B]a\f[R] is not also set. +.TP +-\f[B]v\f[R], --\f[B]verbose\f[R] +Print additional information, such as the site used and the full URL of +the file. +Additional invocations will increase the level of detail. +.TP +-\f[B]q\f[R], --\f[B]quiet\f[R] +Silence warnings and minimize printed output. +.TP +-\f[B]V\f[R], --\f[B]version\f[R] +Print the version number of the program. +.TP +-\f[B]h\f[R], --\f[B]help\f[R] +Print a brief summary of these options. +.SH EXAMPLES +.IP +.EX +wikiget File:Example.jpg +wikiget --site en.wikipedia.org File:Example.jpg +wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg +.EE +.SH BUG REPORTS +https://github.com/clpo13/wikiget/issues +.SH LICENSE +Copyright (C) 2018-2023 Cody Logan and contributors +.PP +This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your +option) any later version. +.PP +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +See the GNU General Public License for more details. +.PP +You should have received a copy of the GNU General Public License along +with this program. +If not, see https://www.gnu.org/licenses/. +.SH AUTHORS +Cody Logan . 
diff --git a/docs/wikiget.1.md b/docs/wikiget.1.md new file mode 100644 index 0000000..45184f4 --- /dev/null +++ b/docs/wikiget.1.md @@ -0,0 +1,116 @@ +% WIKIGET(1) Version 0.5.1 | Wikiget User Manual +% Cody Logan +% October 5, 2023 + +# NAME + +wikiget - download files from MediaWiki sites + +# SYNOPSIS + +| **wikiget** \[*options*] *FILE* +| **wikiget** \[*options*] \[\-**a**|\-\-**batch**] *BATCHFILE* +| **wikiget** \[\-**V**|\-\-**version**] +| **wikiget** \[\-**h**|\-\-**help**] + +# DESCRIPTION + +Something like **wget**(1) for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) using only +the file name or the URL of its description page. + +# OPTIONS + +*FILE* + +: The file to be downloaded. If *FILE* is in the form *File:Example.jpg* or *Image:Example.jpg*, it will be fetched + from the default site, which is "commons.wikimedia.org". If it's the fully-qualified URL of a file description page, + like *https://en.wikipedia.org/wiki/File:Example.jpg*, the file is fetched from the site in the URL, in this case + "en.wikipedia.org". + +*BATCHFILE* + +: In batch download mode (activated with \-**a** or \-\-**batch**), this is a text file containing multiple file names + or URLs to be downloaded, one per line. If an error is encountered during download, execution stops immediately and + the offending filename is printed. + +\-**s**, \-\-**site** *SITE* + +: MediaWiki site to download from. Will not have any effect if the full URL is given in the *FILE* parameter. + +\-**P**, \-\-**path** *PATH* + +: Script path for the wiki, where "index.php" and "api.php" live. On Wikimedia sites, it's "/w/", the default, but + other sites may use "/" or something else entirely. + +\-**u**, \-\-**username** *USERNAME* + +: Username for private wikis that require a login even for read access. + +\-**p**, \-\-**password** *PASSWORD* + +: Password for private wikis that require a login even for read access. 
+ +\-**o**, \-\-**output** *OUTPUT* + +: By default, the output filename is the same as the remote filename (without the File: or Image: prefix), but this + can be changed with this option. + +\-**l**, \-\-**logfile** *LOGFILE* + +: Specify a logfile, which will contain detailed information about the download process. If the logfile already + exists, new log information is appended to it. + +\-**f**, \-\-**force** + +: Force existing files to be overwritten. + +\-**a**, \-\-**batch** + +: If this flag is set, **wikiget** will run in batch download mode (see *BATCHFILE*). + +\-**j**, \-\-**threads** + +: Number of parallel downloads to attempt in batch mode. This option has no effect if \-**a** is not also set. + +\-**v**, \-\-**verbose** + +: Print additional information, such as the site used and the full URL of the file. Additional invocations will + increase the level of detail. + +\-**q**, \-\-**quiet** + +: Silence warnings and minimize printed output. + +\-**V**, \-\-**version** + +: Print the version number of the program. + +\-**h**, \-\-**help** + +: Print a brief summary of these options. + +# EXAMPLES + +``` +wikiget File:Example.jpg +wikiget --site en.wikipedia.org File:Example.jpg +wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg +``` + +# BUG REPORTS + +https://github.com/clpo13/wikiget/issues + +# LICENSE + +Copyright (C) 2018-2023 Cody Logan and contributors + +This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public +License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied +warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this program. 
If not, see +https://www.gnu.org/licenses/. diff --git a/pyproject.toml b/pyproject.toml index 8dc16ce..e8c63d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,7 @@ exclude = [ ] [tool.hatch.build.targets.wheel.shared-data] -"wikiget.1" = "share/man/man1/wikiget.1" +"docs/wikiget.1" = "share/man/man1/wikiget.1" "README.md" = "share/doc/wikiget/README.md" "LICENSE" = "share/doc/wikiget/LICENSE" diff --git a/wikiget.1 b/wikiget.1 deleted file mode 100644 index 03a0c41..0000000 --- a/wikiget.1 +++ /dev/null @@ -1,122 +0,0 @@ -.\" Automatically generated by Pandoc 3.1.8 -.\" -.TH "WIKIGET" "1" "October 5, 2023" "Version 0.5.1" "Wikiget User Manual" -.SH NAME -wikiget - download files from MediaWiki sites -.SH SYNOPSIS -.PP -\f[B]wikiget\f[R] [\f[I]options\f[R]] \f[I]FILE\f[R] -.PD 0 -.P -.PD -\f[B]wikiget\f[R] [\f[I]options\f[R]] [-\f[B]a\f[R]|--\f[B]batch\f[R]] -\f[I]BATCHFILE\f[R] -.PD 0 -.P -.PD -\f[B]wikiget\f[R] [-\f[B]V\f[R]|--\f[B]version\f[R]] -.PD 0 -.P -.PD -\f[B]wikiget\f[R] [-\f[B]h\f[R]|--\f[B]help\f[R]] -.SH DESCRIPTION -Something like \f[B]wget\f[R](1) for downloading a file from MediaWiki -sites (like Wikipedia or Wikimedia Commons) using only the file name or -the URL of its description page. -.SH OPTIONS -.TP -\f[I]FILE\f[R] -The file to be downloaded. -If \f[I]FILE\f[R] is in the form \f[I]File:Example.jpg\f[R] or -\f[I]Image:Example.jpg\f[R], it will be fetched from the default site, -which is \[lq]commons.wikimedia.org\[rq]. -If it\[cq]s the fully-qualified URL of a file description page, like -\f[I]https://en.wikipedia.org/wiki/File:Example.jpg\f[R], the file is -fetched from the site in the URL, in this case -\[lq]en.wikipedia.org\[rq]. -.TP -\f[I]BATCHFILE\f[R] -In batch download mode (activated with -\f[B]a\f[R] or ---\f[B]batch\f[R]), this is a text file containing multiple file names -or URLs to be downloaded, one per line. 
-If an error is encountered during download, execution stops immediately -and the offending filename is printed. -.TP --\f[B]s\f[R], --\f[B]site\f[R] \f[I]SITE\f[R] -MediaWiki site to download from. -Will not have any effect if the full URL is given in the \f[I]FILE\f[R] -parameter. -.TP --\f[B]P\f[R], --\f[B]path\f[R] \f[I]PATH\f[R] -Script path for the wiki, where \[lq]index.php\[rq] and -\[lq]api.php\[rq] live. -On Wikimedia sites, it\[cq]s \[lq]/w/\[rq], the default, but other sites -may use \[lq]/\[rq] or something else entirely. -.TP --\f[B]u\f[R], --\f[B]username\f[R] \f[I]USERNAME\f[R] -Username for private wikis that require a login even for read access. -.TP --\f[B]p\f[R], --\f[B]password\f[R] \f[I]PASSWORD\f[R] -Password for private wikis that require a login even for read access. -.TP --\f[B]o\f[R], --\f[B]output\f[R] \f[I]OUTPUT\f[R] -By default, the output filename is the same as the remote filename -(without the File: or Image: prefix), but this can be changed with this -option. -.TP --\f[B]l\f[R], --\f[B]logfile\f[R] \f[I]LOGFILE\f[R] -Specify a logfile, which will contain detailed information about the -download process. -If the logfile already exists, new log information is appended to it. -.TP --\f[B]f\f[R], --\f[B]force\f[R] -Force existing files to be overwritten. -.TP --\f[B]a\f[R], --\f[B]batch\f[R] -If this flag is set, \f[B]wikiget\f[R] will run in batch download mode -(see \f[I]BATCHFILE\f[R]). -.TP --\f[B]j\f[R], --\f[B]threads\f[R] -Number of parallel downloads to attempt in batch mode. -This option has no effect if -\f[B]a\f[R] is not also set. -.TP --\f[B]v\f[R], --\f[B]verbose\f[R] -Print additional information, such as the site used and the full URL of -the file. -Additional invocations will increase the level of detail. -.TP --\f[B]q\f[R], --\f[B]quiet\f[R] -Silence warnings and minimize printed output. -.TP --\f[B]V\f[R], --\f[B]version\f[R] -Print the version number of the program. 
-.TP --\f[B]h\f[R], --\f[B]help\f[R] -Print a brief summary of these options. -.SH EXAMPLES -.IP -.EX -wikiget File:Example.jpg -wikiget --site en.wikipedia.org File:Example.jpg -wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg -.EE -.SH BUG REPORTS -https://github.com/clpo13/wikiget/issues -.SH LICENSE -Copyright (C) 2018-2023 Cody Logan and contributors -.PP -This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation, either version 3 of the License, or (at your -option) any later version. -.PP -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -See the GNU General Public License for more details. -.PP -You should have received a copy of the GNU General Public License along -with this program. -If not, see https://www.gnu.org/licenses/. -.SH AUTHORS -Cody Logan . diff --git a/wikiget.1.md b/wikiget.1.md deleted file mode 100644 index 45184f4..0000000 --- a/wikiget.1.md +++ /dev/null @@ -1,116 +0,0 @@ -% WIKIGET(1) Version 0.5.1 | Wikiget User Manual -% Cody Logan -% October 5, 2023 - -# NAME - -wikiget - download files from MediaWiki sites - -# SYNOPSIS - -| **wikiget** \[*options*] *FILE* -| **wikiget** \[*options*] \[\-**a**|\-\-**batch**] *BATCHFILE* -| **wikiget** \[\-**V**|\-\-**version**] -| **wikiget** \[\-**h**|\-\-**help**] - -# DESCRIPTION - -Something like **wget**(1) for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) using only -the file name or the URL of its description page. - -# OPTIONS - -*FILE* - -: The file to be downloaded. If *FILE* is in the form *File:Example.jpg* or *Image:Example.jpg*, it will be fetched - from the default site, which is "commons.wikimedia.org". 
If it's the fully-qualified URL of a file description page, - like *https://en.wikipedia.org/wiki/File:Example.jpg*, the file is fetched from the site in the URL, in this case - "en.wikipedia.org". - -*BATCHFILE* - -: In batch download mode (activated with \-**a** or \-\-**batch**), this is a text file containing multiple file names - or URLs to be downloaded, one per line. If an error is encountered during download, execution stops immediately and - the offending filename is printed. - -\-**s**, \-\-**site** *SITE* - -: MediaWiki site to download from. Will not have any effect if the full URL is given in the *FILE* parameter. - -\-**P**, \-\-**path** *PATH* - -: Script path for the wiki, where "index.php" and "api.php" live. On Wikimedia sites, it's "/w/", the default, but - other sites may use "/" or something else entirely. - -\-**u**, \-\-**username** *USERNAME* - -: Username for private wikis that require a login even for read access. - -\-**p**, \-\-**password** *PASSWORD* - -: Password for private wikis that require a login even for read access. - -\-**o**, \-\-**output** *OUTPUT* - -: By default, the output filename is the same as the remote filename (without the File: or Image: prefix), but this - can be changed with this option. - -\-**l**, \-\-**logfile** *LOGFILE* - -: Specify a logfile, which will contain detailed information about the download process. If the logfile already - exists, new log information is appended to it. - -\-**f**, \-\-**force** - -: Force existing files to be overwritten. - -\-**a**, \-\-**batch** - -: If this flag is set, **wikiget** will run in batch download mode (see *BATCHFILE*). - -\-**j**, \-\-**threads** - -: Number of parallel downloads to attempt in batch mode. This option has no effect if \-**a** is not also set. - -\-**v**, \-\-**verbose** - -: Print additional information, such as the site used and the full URL of the file. Additional invocations will - increase the level of detail. 
- -\-**q**, \-\-**quiet** - -: Silence warnings and minimize printed output. - -\-**V**, \-\-**version** - -: Print the version number of the program. - -\-**h**, \-\-**help** - -: Print a brief summary of these options. - -# EXAMPLES - -``` -wikiget File:Example.jpg -wikiget --site en.wikipedia.org File:Example.jpg -wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg -``` - -# BUG REPORTS - -https://github.com/clpo13/wikiget/issues - -# LICENSE - -Copyright (C) 2018-2023 Cody Logan and contributors - -This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public -License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later -version. - -This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied -warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with this program. If not, see -https://www.gnu.org/licenses/. 
-- cgit v1.2.3 From 5dc9b79bd68d2f7cf0dcf1adfaffd8e07b27c6ba Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 09:37:10 -0700 Subject: Add Python 3.12 to matrix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e8c63d5..06e9503 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,7 @@ cov = [ ] [[tool.hatch.envs.all.matrix]] -python = ["3.7", "3.8", "3.9", "3.10", "3.11"] +python = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] [tool.hatch.envs.lint] detached = true -- cgit v1.2.3 From 8b70abecb543099528ecc8c3b1edfe0330d3d223 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 10:11:20 -0700 Subject: Refactor code and improve docstrings --- src/wikiget/file.py | 16 ++++++--- src/wikiget/validations.py | 26 +++++++++----- src/wikiget/wikiget.py | 87 +++++++++++++++++++++++++--------------------- 3 files changed, 75 insertions(+), 54 deletions(-) diff --git a/src/wikiget/file.py b/src/wikiget/file.py index 60a71e0..c1b9ae6 100644 --- a/src/wikiget/file.py +++ b/src/wikiget/file.py @@ -17,11 +17,17 @@ class File: - def __init__(self, name, dest=None): + def __init__(self, name, dest=""): + """ + Initializes a new file with the specified name and an optional destination name. 
+ + :param name: name of the file + :type name: str + :param dest: destination of the file, if different from the name; if not + specified, defaults to the name + :type dest: str, optional + """ self.object = None self.site = None self.name = name - if dest is None: - self.dest = name - else: - self.dest = dest + self.dest = dest if dest else name diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index 8ebd996..1610417 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -23,11 +23,14 @@ from wikiget import BLOCKSIZE def valid_file(search_string): """ - Determines if the given string contains a valid file name, defined as a - string ending with a '.' and at least one character, beginning with 'File:' - or 'Image:', the standard file prefixes in MediaWiki. + Determines if the given string contains a valid file name, defined as a string + ending with a '.' and at least one character, beginning with 'File:' or 'Image:', + the standard file prefixes in MediaWiki. + :param search_string: string to validate + :type search_string: str :returns: a regex Match object if there's a match or None otherwise + :rtype: re.Match """ # second group could also restrict to file extensions with three or more # letters with ([^/\r\n\t\f\v]+\.\w{3,}) @@ -37,12 +40,15 @@ def valid_file(search_string): def valid_site(search_string): """ - Determines if the given string contains a valid site name, defined as a - string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all - subdomains of those domains. Eventually, it should be possible to support - any MediaWiki site, regardless of domain name. + Determines if the given string contains a valid site name, defined as a string + ending with 'wikipedia.org' or 'wikimedia.org'. This covers all subdomains of those + domains. Eventually, it should be possible to support any MediaWiki site, regardless + of domain name. 
+ :param search_string: string to validate + :type search_string: str :returns: a regex Match object if there's a match or None otherwise + :rtype: re.Match """ site_regex = re.compile(r"wiki[mp]edia\.org$", re.I) return site_regex.search(search_string) @@ -50,10 +56,12 @@ def valid_site(search_string): def verify_hash(filename): """ - Calculates the SHA1 hash of the given file for comparison with a known - value. + Calculates the SHA1 hash of the given file for comparison with a known value. + :param filename: name of the file to calculate a hash for + :type filename: str :return: hash digest + :rtype: str """ hasher = hashlib.sha1() # noqa: S324 with open(filename, "rb") as dl: diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 8c067e0..c470b46 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -111,10 +111,7 @@ def construct_parser(): return parser -def main(): - parser = construct_parser() - args = parser.parse_args() - +def configure_logging(args): loglevel = logging.WARNING if args.verbose >= wikiget.VERY_VERBOSE: # this includes API and library messages @@ -147,6 +144,51 @@ def main(): # log only to console logging.basicConfig(level=loglevel, format=log_format) + +def batch_download(args): + input_file = args.FILE + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file) + except OSError as e: + logging.error("File could not be read. 
The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # store file contents in memory in case something happens to the file + # while we're downloading + for _, line in enumerate(fd): + dl_list.append(line) + + # TODO: validate file contents before download process starts + with ThreadPoolExecutor( + max_workers=args.threads, + thread_name_prefix="download", + ) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() + + +def main(): + # setup + parser = construct_parser() + args = parser.parse_args() + + configure_logging(args) + # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") @@ -154,42 +196,7 @@ def main(): if args.batch: # batch download mode - input_file = args.FILE - dl_list = [] - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file) - except OSError as e: - logging.error( - "File could not be read. 
The following error was encountered:" - ) - logging.error(e) - sys.exit(1) - else: - with fd: - # store file contents in memory in case something happens to the file - # while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) - - # TODO: validate file contents before download process starts - with ThreadPoolExecutor( - max_workers=args.threads, - thread_name_prefix="download", - ) as executor: - futures = [] - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - future = executor.submit(download, file, args) - futures.append(future) - # wait for downloads to finish - for future in futures: - future.result() + batch_download(args) else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3 From 226b7cb84070c6d073e153ad410fca7798c8e334 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 11:13:04 -0700 Subject: Change logfile log level to debug --- src/wikiget/wikiget.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index c470b46..5b36ce5 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -123,20 +123,18 @@ def configure_logging(args): # configure logging: # console log level is set via -v, -vv, and -q options; - # file log level is always info (TODO: add debug option) + # file log level is always debug (TODO: make this user configurable) base_format = "%(threadName)s - %(message)s" log_format = "[%(levelname)s] " + base_format if args.logfile: # log to console and file logging.basicConfig( - level=logging.INFO, + level=logging.DEBUG, format="%(asctime)s [%(levelname)-7s] " + base_format, filename=args.logfile, ) console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't - # printing to 
console console.setLevel(loglevel) console.setFormatter(logging.Formatter(log_format)) logging.getLogger("").addHandler(console) @@ -192,7 +190,6 @@ def main(): # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") - # logging.info(f"Log level is set to {loglevel}") if args.batch: # batch download mode -- cgit v1.2.3 From 87052196874cc1bf82f70a6f5aa8e6df59bc1537 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 11:13:31 -0700 Subject: Revise batch file parsing to ignore blank and commented lines Previously, blank lines would cause an error and lines prepended with "#" would be downloaded like any other, assuming they were valid. Now, "#" can be used to mark ignored files or comments. --- README.md | 10 +++++----- docs/wikiget.1 | 1 + docs/wikiget.1.md | 4 ++-- src/wikiget/wikiget.py | 19 ++++++++++--------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 4faadf4..eecea90 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,11 @@ By default, the program won't overwrite existing files with the same name as the `-f` or `--force`. Additionally, the file can be downloaded to a different name with `-o`. Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, `FILE` will be treated as an input file -containing multiple files to download, one filename or URL per line. If an error is encountered, execution stops -immediately and the offending filename is printed. For large batches, the process can be sped up by downloading files -in parallel. The number of parallel downloads can be set with `-j`. For instance, with `-a -j4`, wikiget will download -four files at once. Without `-j` or with `-j` by itself without a number, wikiget will download the files one at a -time. +containing multiple files to download, one filename or URL per line. 
Blank lines and lines starting with "#" are +ignored. If an error is encountered, execution stops immediately and the offending filename is printed. For large +batches, the process can be sped up by downloading files in parallel. The number of parallel downloads can be set with +`-j`. For instance, with `-a -j4`, wikiget will download four files at once. Without `-j` or with `-j` by itself without +a number, wikiget will download the files one at a time. ### Example usage diff --git a/docs/wikiget.1 b/docs/wikiget.1 index 03a0c41..fa1a33d 100644 --- a/docs/wikiget.1 +++ b/docs/wikiget.1 @@ -39,6 +39,7 @@ fetched from the site in the URL, in this case In batch download mode (activated with -\f[B]a\f[R] or --\f[B]batch\f[R]), this is a text file containing multiple file names or URLs to be downloaded, one per line. +Blank lines and lines starting with \[lq]#\[rq] are ignored. If an error is encountered during download, execution stops immediately and the offending filename is printed. .TP diff --git a/docs/wikiget.1.md b/docs/wikiget.1.md index 45184f4..fd274d5 100644 --- a/docs/wikiget.1.md +++ b/docs/wikiget.1.md @@ -30,8 +30,8 @@ the file name or the URL of its description page. *BATCHFILE* : In batch download mode (activated with \-**a** or \-\-**batch**), this is a text file containing multiple file names - or URLs to be downloaded, one per line. If an error is encountered during download, execution stops immediately and - the offending filename is printed. + or URLs to be downloaded, one per line. Blank lines and lines starting with "#" are ignored. If an error is + encountered during download, execution stops immediately and the offending filename is printed. 
\-**s**, \-\-**site** *SITE* diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 5b36ce5..fba9509 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -145,7 +145,7 @@ def configure_logging(args): def batch_download(args): input_file = args.FILE - dl_list = [] + dl_list = {} logging.info(f"Using batch file '{input_file}'.") @@ -157,10 +157,12 @@ def batch_download(args): sys.exit(1) else: with fd: - # store file contents in memory in case something happens to the file - # while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) + # read the file into memory and process each line as we go + for line_num, line in enumerate(fd, start=1): + line_s = line.strip() + # ignore blank lines and lines starting with "#" (for comments) + if line_s and not line_s.startswith("#"): + dl_list[line_num] = line_s # TODO: validate file contents before download process starts with ThreadPoolExecutor( @@ -168,11 +170,10 @@ def batch_download(args): thread_name_prefix="download", ) as executor: futures = [] - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() + for line_num, line in dl_list.items(): # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) + logging.info(f"Downloading '{line}' at line {line_num}") + file = prep_download(line, args) future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish -- cgit v1.2.3 From 875748228e509e244c8f444114387f1a03cbb393 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 12:19:41 -0700 Subject: Update copyright year --- src/wikiget/validations.py | 2 +- src/wikiget/version.py | 17 +++++++++++++++++ tests/test_validations.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index 1610417..2bce34e 100644 --- a/src/wikiget/validations.py 
+++ b/src/wikiget/validations.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2020 Cody Logan +# Copyright (C) 2018-2023 Cody Logan # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify diff --git a/src/wikiget/version.py b/src/wikiget/version.py index dd9b22c..34dabb7 100644 --- a/src/wikiget/version.py +++ b/src/wikiget/version.py @@ -1 +1,18 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2023 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see <https://www.gnu.org/licenses/>. 
+ __version__ = "0.5.1" diff --git a/tests/test_validations.py b/tests/test_validations.py index 1abd96a..9d70f6e 100644 --- a/tests/test_validations.py +++ b/tests/test_validations.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan +# Copyright (C) 2018-2023 Cody Logan # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify -- cgit v1.2.3 From 630541499a58f98c55d5cc372d21e745c106d250 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 12:24:13 -0700 Subject: Refactor parsing logic and revise exception handling --- src/wikiget/dl.py | 58 +++++++++++++-------------------------- src/wikiget/exceptions.py | 20 ++++++++++++++ src/wikiget/parse.py | 54 +++++++++++++++++++++++++++++++++++++ src/wikiget/wikiget.py | 12 +++++++-- tests/test_dl.py | 69 +++++++++++++---------------------------------- tests/test_parse.py | 60 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 180 insertions(+), 93 deletions(-) create mode 100644 src/wikiget/exceptions.py create mode 100644 src/wikiget/parse.py create mode 100644 tests/test_parse.py diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 2b2befa..50b7460 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -18,46 +18,16 @@ import logging import os import sys -from urllib.parse import unquote, urlparse from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget +from wikiget.exceptions import ParseError from wikiget.file import File -from wikiget.validations import valid_file, verify_hash - - -def get_dest(dl, args): - url = urlparse(dl) - - if url.netloc: - filename = url.path - site_name = url.netloc - if args.site is not wikiget.DEFAULT_SITE: - # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, ignoring site specified with --site") - 
else: - filename = dl - site_name = args.site - - file_match = valid_file(filename) - - # check if this is a valid file - if file_match and file_match.group(1): - # has File:/Image: prefix and extension - filename = file_match.group(2) - else: - # no file extension and/or prefix, probably an article - logging.error(f"Could not parse input '{filename}' as a file.") - sys.exit(1) - - filename = unquote(filename) # remove URL encoding for special characters - - dest = args.output or filename - - return filename, dest, site_name +from wikiget.parse import get_dest +from wikiget.validations import verify_hash def query_api(filename, site_name, args): @@ -98,8 +68,7 @@ def query_api(filename, site_name, args): # an API error at this point likely means access is denied, which could happen # with a private wiki logging.error( - "Access denied. Try providing credentials with " - "--username and --password." + "Access denied. Try providing credentials with --username and --password." ) logging.debug("Full error message:") for i in e.args: @@ -110,7 +79,10 @@ def query_api(filename, site_name, args): def prep_download(dl, args): - filename, dest, site_name = get_dest(dl, args) + try: + filename, dest, site_name = get_dest(dl, args) + except ParseError: + raise file = File(filename, dest) file.object, file.site = query_api(file.name, site_name, args) return file @@ -136,7 +108,7 @@ def download(f, args): if os.path.isfile(dest) and not args.force: logging.warning( - f"File '{dest}' already exists, skipping download (use -f to ignore)" + f"File '{dest}' already exists, skipping download (use -f to force)" ) else: try: @@ -167,19 +139,25 @@ def download(f, args): fd.write(chunk) progress_bar.update(len(chunk)) - # verify file integrity and optionally print details + # verify file integrity and log details dl_sha1 = verify_hash(dest) - logging.info(f"Downloaded file SHA1 is {dl_sha1}") - logging.info(f"Server file SHA1 is {file_sha1}") + logging.info(f"Remote file SHA1 is 
{file_sha1}") + logging.info(f"Local file SHA1 is {dl_sha1}") if dl_sha1 == file_sha1: logging.info("Hashes match!") # at this point, we've successfully downloaded the file + success_log = f"'{filename}' downloaded" + if args.output: + success_log += f" to '{dest}'" + logging.info(success_log) else: logging.error("Hash mismatch! Downloaded file may be corrupt.") + # TODO: log but don't quit while in batch mode sys.exit(1) else: # no file information returned logging.error(f"Target '{filename}' does not appear to be a valid file.") + # TODO: log but don't quit while in batch mode sys.exit(1) diff --git a/src/wikiget/exceptions.py b/src/wikiget/exceptions.py new file mode 100644 index 0000000..94ed6b2 --- /dev/null +++ b/src/wikiget/exceptions.py @@ -0,0 +1,20 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see <https://www.gnu.org/licenses/>. 
+ + +class ParseError(Exception): + pass diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py new file mode 100644 index 0000000..09c0767 --- /dev/null +++ b/src/wikiget/parse.py @@ -0,0 +1,54 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see <https://www.gnu.org/licenses/>. + +import logging +from urllib.parse import unquote, urlparse + +import wikiget +from wikiget.exceptions import ParseError +from wikiget.validations import valid_file + + +def get_dest(dl, args): + url = urlparse(dl) + + if url.netloc: + filename = url.path + site_name = url.netloc + if args.site is not wikiget.DEFAULT_SITE: + # this will work even if the user specifies 'commons.wikimedia.org' + logging.warning("target is a URL, ignoring site specified with --site") + else: + filename = dl + site_name = args.site + + file_match = valid_file(filename) + + # check if this is a valid file + if file_match and file_match.group(1): + # has File:/Image: prefix and extension + filename = file_match.group(2) + else: + # no file extension and/or prefix, probably an article + msg = f"Could not parse input '{filename}' as a file" + raise ParseError(msg) + + filename = unquote(filename) # remove URL encoding for special characters + + dest = args.output or filename + + return filename, dest, site_name diff --git a/src/wikiget/wikiget.py
b/src/wikiget/wikiget.py index fba9509..68e0233 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download +from wikiget.exceptions import ParseError def construct_parser(): @@ -173,7 +174,10 @@ def batch_download(args): for line_num, line in dl_list.items(): # keep track of batch file line numbers for debugging/logging purposes logging.info(f"Downloading '{line}' at line {line_num}") - file = prep_download(line, args) + try: + file = prep_download(line, args) + except ParseError as e: + logging.warning(f"{e} (line {line_num})") future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish @@ -197,5 +201,9 @@ def main(): batch_download(args) else: # single download mode - file = prep_download(args.FILE, args) + try: + file = prep_download(args.FILE, args) + except ParseError as e: + logging.error(e) + sys.exit(1) download(file, args) diff --git a/tests/test_dl.py b/tests/test_dl.py index abf8763..fc68733 100644 --- a/tests/test_dl.py +++ b/tests/test_dl.py @@ -15,65 +15,32 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see <https://www.gnu.org/licenses/>. 
+import logging + import pytest -from wikiget.dl import get_dest +from wikiget import USER_AGENT +from wikiget.dl import prep_download, query_api from wikiget.wikiget import construct_parser -class TestGetDest: +# TODO: don't hit the actual API when doing tests +@pytest.mark.skip +class TestQueryApi: parser = construct_parser() - def test_get_dest_with_filename(self): + def test_query_api(self, caplog): + caplog.set_level(logging.DEBUG) args = self.parser.parse_args(["File:Example.jpg"]) - filename, dest, site_name = get_dest(args.FILE, args) - assert filename == "Example.jpg" - assert dest == "Example.jpg" - assert site_name == "commons.wikimedia.org" - - def test_get_dest_with_url(self): - args = self.parser.parse_args( - [ - "https://en.wikipedia.org/wiki/File:Example.jpg", - ] - ) - filename, dest, site_name = get_dest(args.FILE, args) - assert filename == "Example.jpg" - assert dest == "Example.jpg" - assert site_name == "en.wikipedia.org" + file, site = query_api("Example.jpg", "commons.wikimedia.org", args) + assert USER_AGENT in caplog.text - def test_get_dest_with_bad_filename(self): - args = self.parser.parse_args(["Example.jpg"]) - with pytest.raises(SystemExit): - filename, dest, site_name = get_dest(args.FILE, args) - - def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture): - args = self.parser.parse_args( - [ - "https://commons.wikimedia.org/wiki/File:Example.jpg", - "--site", - "commons.wikimedia.org", - ] - ) - filename, dest, site_name = get_dest(args.FILE, args) - assert "target is a URL, ignoring site specified with --site" in caplog.text +@pytest.mark.skip +class TestPrepDownload: + parser = construct_parser() -# TODO: don't hit the actual API when doing tests -# class TestQueryApi: -# parser = construct_parser() -# -# def test_query_api(self, caplog): -# caplog.set_level(logging.DEBUG) -# args = self.parser.parse_args(["File:Example.jpg"]) -# file, site = query_api("Example.jpg", "commons.wikimedia.org", args) -# assert 
USER_AGENT in caplog.text -# -# -# class TestPrepDownload(): -# parser = construct_parser() -# -# def test_prep_download(self): -# args = self.parser.parse_args(["File:Example.jpg"]) -# file = prep_download(args.FILE, args) -# assert file is not None + def test_prep_download(self): + args = self.parser.parse_args(["File:Example.jpg"]) + file = prep_download(args.FILE, args) + assert file is not None diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..064b85c --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,60 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see <https://www.gnu.org/licenses/>. 
+ +import pytest + +from wikiget.exceptions import ParseError +from wikiget.parse import get_dest +from wikiget.wikiget import construct_parser + + +class TestGetDest: + parser = construct_parser() + + def test_get_dest_with_filename(self): + args = self.parser.parse_args(["File:Example.jpg"]) + filename, dest, site_name = get_dest(args.FILE, args) + assert filename == "Example.jpg" + assert dest == "Example.jpg" + assert site_name == "commons.wikimedia.org" + + def test_get_dest_with_url(self): + args = self.parser.parse_args( + [ + "https://en.wikipedia.org/wiki/File:Example.jpg", + ] + ) + filename, dest, site_name = get_dest(args.FILE, args) + assert filename == "Example.jpg" + assert dest == "Example.jpg" + assert site_name == "en.wikipedia.org" + + def test_get_dest_with_bad_filename(self): + args = self.parser.parse_args(["Example.jpg"]) + with pytest.raises(ParseError): + filename, dest, site_name = get_dest(args.FILE, args) + + def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture): + args = self.parser.parse_args( + [ + "https://commons.wikimedia.org/wiki/File:Example.jpg", + "--site", + "commons.wikimedia.org", + ] + ) + filename, dest, site_name = get_dest(args.FILE, args) + assert "target is a URL, ignoring site specified with --site" in caplog.text -- cgit v1.2.3 From 45a550899e0adf6958764d8a5133da4e21aa7fea Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 14:14:12 -0700 Subject: Add note to README about dev branch --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index eecea90..bd8ece2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ Something like wget for downloading a file from MediaWiki sites (like Wikipedia or Wikimedia Commons) using only the file name or the URL of its description page. -Requires Python 3.7+. Get it with `pip install --user wikiget` or `pipx install wikiget`. +Requires Python 3.7+ and pip. 
Get it with `pip install wikiget` or `pipx install wikiget`. For the latest features, at +the risk of bugs and undocumented behavior, you can try the dev branch: +`pip install https://github.com/clpo13/wikiget/archive/refs/heads/dev.zip` ## Usage -- cgit v1.2.3 From 06335ba0176cabd84f5b548995f465ac1c09bc8e Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 17 Oct 2023 14:00:14 -0700 Subject: Clean up exception handling and error messages --- src/wikiget/dl.py | 23 ++++++++--------------- src/wikiget/parse.py | 5 +++-- src/wikiget/wikiget.py | 11 +++++++++++ tests/test_parse.py | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 50b7460..4521b72 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -24,7 +24,6 @@ from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget -from wikiget.exceptions import ParseError from wikiget.file import File from wikiget.parse import get_dest from wikiget.validations import verify_hash @@ -42,24 +41,22 @@ def query_api(filename, site_name, args): except ConnectionError as e: # usually this means there is no such site, or there's no network connection, # though it could be a certificate problem - logging.error("Couldn't connect to specified site.") - logging.debug("Full error message:") + logging.error("Could not connect to specified site") logging.debug(e) - sys.exit(1) + raise except HTTPError as e: # most likely a 403 forbidden or 404 not found error for api.php logging.error( - "Couldn't find the specified wiki's api.php. Check the value of --path." + "Could not find the specified wiki's api.php. Check the value of --path." ) - logging.debug("Full error message:") logging.debug(e) - sys.exit(1) + raise except (InvalidResponse, LoginError) as e: # InvalidResponse: site exists, but we couldn't communicate with the API # endpoint for some reason other than an HTTP error. 
# LoginError: missing or invalid credentials logging.error(e) - sys.exit(1) + raise # get info about the target file try: @@ -70,19 +67,15 @@ def query_api(filename, site_name, args): logging.error( "Access denied. Try providing credentials with --username and --password." ) - logging.debug("Full error message:") for i in e.args: logging.debug(i) - sys.exit(1) + raise return file, site def prep_download(dl, args): - try: - filename, dest, site_name = get_dest(dl, args) - except ParseError: - raise + filename, dest, site_name = get_dest(dl, args) file = File(filename, dest) file.object, file.site = query_api(file.name, site_name, args) return file @@ -158,6 +151,6 @@ def download(f, args): else: # no file information returned - logging.error(f"Target '{filename}' does not appear to be a valid file.") + logging.error(f"Target '{filename}' does not appear to be a valid file") # TODO: log but don't quit while in batch mode sys.exit(1) diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py index 09c0767..f5c221d 100644 --- a/src/wikiget/parse.py +++ b/src/wikiget/parse.py @@ -30,8 +30,9 @@ def get_dest(dl, args): filename = url.path site_name = url.netloc if args.site is not wikiget.DEFAULT_SITE: - # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, ignoring site specified with --site") + # this will work even if the user specifies 'commons.wikimedia.org' since + # we're comparing objects instead of values (is not vs. 
!=) + logging.warning("Target is a URL, ignoring site specified with --site") else: filename = dl site_name = args.site diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 68e0233..4446f96 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -20,6 +20,9 @@ import logging import sys from concurrent.futures import ThreadPoolExecutor +from mwclient import APIError, InvalidResponse, LoginError +from requests import ConnectionError, HTTPError + import wikiget from wikiget.dl import download, prep_download from wikiget.exceptions import ParseError @@ -178,6 +181,10 @@ def batch_download(args): file = prep_download(line, args) except ParseError as e: logging.warning(f"{e} (line {line_num})") + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + logging.error( + f"Unable to download '{line}' (line {line_num}) due to an error" + ) future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish @@ -198,6 +205,8 @@ def main(): if args.batch: # batch download mode + # TODO: return non-zero exit code if any errors were encountered, even if some + # downloads completed successfully batch_download(args) else: # single download mode @@ -206,4 +215,6 @@ def main(): except ParseError as e: logging.error(e) sys.exit(1) + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + sys.exit(1) download(file, args) diff --git a/tests/test_parse.py b/tests/test_parse.py index 064b85c..c47678e 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -57,4 +57,4 @@ class TestGetDest: ] ) filename, dest, site_name = get_dest(args.FILE, args) - assert "target is a URL, ignoring site specified with --site" in caplog.text + assert "Target is a URL, ignoring site specified with --site" in caplog.text -- cgit v1.2.3 From ba1f10666554316c262efd2ee6950560560317c7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 12:59:08 -0700 Subject: Fix bug in batch downloading An 
invalid line in the batch file would cause the last valid file to be downloaded twice. --- src/wikiget/wikiget.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 4446f96..af13bc8 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -181,10 +181,12 @@ def batch_download(args): file = prep_download(line, args) except ParseError as e: logging.warning(f"{e} (line {line_num})") + continue except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): logging.error( f"Unable to download '{line}' (line {line_num}) due to an error" ) + continue future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish -- cgit v1.2.3 From 59b97c7bef21cf68935b68fa7de6fd67653e21af Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:09:28 -0700 Subject: Tweak pyproject.toml and setup.py --- pyproject.toml | 58 +++++++++++++++++++++++++++++++--------------------------- setup.py | 17 +++++++++++++++++ 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06e9503..11fcaad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,37 +8,37 @@ dynamic = ["version"] description = "CLI tool for downloading files from MediaWiki sites" readme = "README.md" authors = [ - {name = "Cody Logan", email = "clpo13@gmail.com"} + {name = "Cody Logan", email = "clpo13@gmail.com"} ] requires-python = ">=3.7" license = {text = "GPL-3.0-or-later"} keywords = ["commons", "mediawiki", "wikimedia", "wikipedia"] classifiers = [ - "Development Status :: 4 - Beta", - "Environment :: Console", - "Intended Audience :: End Users/Desktop", - "Operating System :: OS Independent", - "Topic :: Internet", - "Topic :: Internet :: WWW/HTTP", - "Topic :: Multimedia", - "Topic :: Multimedia :: Graphics", - "Topic :: Multimedia :: Sound/Audio", - "Topic :: Multimedia :: Video", - "Topic :: Utilities", - "Programming Language :: Python", - 
"Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "Operating System :: OS Independent", + "Topic :: Internet", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Multimedia", + "Topic :: Multimedia :: Graphics", + "Topic :: Multimedia :: Sound/Audio", + "Topic :: Multimedia :: Video", + "Topic :: Utilities", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies = [ - "mwclient>=0.10.0", - "requests", - "tqdm", + "mwclient>=0.10.0", + "requests", + "tqdm", ] [project.urls] @@ -53,13 +53,13 @@ path = "src/wikiget/version.py" [tool.pytest.ini_options] addopts = [ - "--import-mode=importlib", + "--import-mode=importlib", ] testpaths = ["tests"] [tool.hatch.build.targets.sdist] exclude = [ - "/.github", + "/.github", ] [tool.hatch.build.targets.wheel.shared-data] @@ -188,5 +188,9 @@ exclude_lines = [ ] [[tool.mypy.overrides]] -module = ["mwclient", "pytest"] +module = [ + "mwclient", + "mwclient.image", + "pytest", +] ignore_missing_imports = true diff --git a/setup.py b/setup.py index 6068493..a73e48c 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,20 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you 
can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see <https://www.gnu.org/licenses/>. + from setuptools import setup setup() -- cgit v1.2.3 From 05457af0d73ff3a820c0b465e6607fc5832a6e74 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:23:28 -0700 Subject: Reorganize File class --- src/wikiget/dl.py | 19 ++++++++----------- src/wikiget/file.py | 12 +++++++++--- src/wikiget/parse.py | 4 +++- src/wikiget/wikiget.py | 4 ++-- tests/test_file_class.py | 10 ++++++++-- tests/test_parse.py | 20 ++++++++++---------- 6 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 4521b72..171b017 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -30,10 +30,8 @@ from wikiget.validations import verify_hash def query_api(filename, site_name, args): - logging.debug(f"User agent: {wikiget.USER_AGENT}") - # connect to site and identify ourselves - logging.info(f"Site name: {site_name}") + logging.info(f"Connecting to {site_name}") try: site = Site(site_name, path=args.path, clients_useragent=wikiget.USER_AGENT) if args.username and args.password: @@ -60,7 +58,7 @@ def query_api(filename, site_name, args): # get info about the target file try: - file = site.images[filename] + image = site.images[filename] except APIError as e: # an API error at this point likely means access is denied, which could happen # with a private wiki @@ -71,23 +69,22 @@ def query_api(filename, site_name, args): logging.debug(i) raise - return file, site + return 
image def prep_download(dl, args): - filename, dest, site_name = get_dest(dl, args) - file = File(filename, dest) - file.object, file.site = query_api(file.name, site_name, args) + file = get_dest(dl, args) + file.image = query_api(file.name, file.site, args) return file def download(f, args): - file = f.object + file = f.image filename = f.name - site = f.site dest = f.dest + site = file.site - if file.imageinfo != {}: + if file.exists: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] file_size = file.imageinfo["size"] diff --git a/src/wikiget/file.py b/src/wikiget/file.py index c1b9ae6..b890e63 100644 --- a/src/wikiget/file.py +++ b/src/wikiget/file.py @@ -15,9 +15,13 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . +from mwclient.image import Image + +from wikiget import DEFAULT_SITE + class File: - def __init__(self, name, dest=""): + def __init__(self, name: str, dest: str = "", site: str = "") -> None: """ Initializes a new file with the specified name and an optional destination name. 
@@ -26,8 +30,10 @@ class File: :param dest: destination of the file, if different from the name; if not specified, defaults to the name :type dest: str, optional + :param site: name of the site hosting the file; if not specified, defaults to + the global default site """ - self.object = None - self.site = None + self.image: Image = None self.name = name self.dest = dest if dest else name + self.site = site if site else DEFAULT_SITE diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py index f5c221d..4e9b195 100644 --- a/src/wikiget/parse.py +++ b/src/wikiget/parse.py @@ -52,4 +52,6 @@ def get_dest(dl, args): dest = args.output or filename - return filename, dest, site_name + file = File(filename, dest, site_name) + + return file diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index af13bc8..90078e1 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -195,15 +195,15 @@ def batch_download(args): def main(): - # setup + # setup our environment parser = construct_parser() args = parser.parse_args() - configure_logging(args) # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") + logging.debug(f"User agent: {wikiget.USER_AGENT}") if args.batch: # batch download mode diff --git a/tests/test_file_class.py b/tests/test_file_class.py index 7ad0b87..dd30207 100644 --- a/tests/test_file_class.py +++ b/tests/test_file_class.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . 
+from wikiget import DEFAULT_SITE from wikiget.file import File @@ -22,10 +23,15 @@ def test_file_with_name_only(): file = File("foobar.jpg") assert file.name == "foobar.jpg" assert file.dest == file.name + assert file.site == DEFAULT_SITE def test_file_with_name_and_dest(): - file = File("foobar.jpg", "bazqux.jpg") - assert file.name == "foobar.jpg" + file = File("foobar.jpg", dest="bazqux.jpg") assert file.dest == "bazqux.jpg" assert file.dest != file.name + + +def test_file_with_name_and_site(): + file = File("foobar.jpg", site="en.wikipedia.org") + assert file.site == "en.wikipedia.org" diff --git a/tests/test_parse.py b/tests/test_parse.py index c47678e..757b361 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -27,10 +27,10 @@ class TestGetDest: def test_get_dest_with_filename(self): args = self.parser.parse_args(["File:Example.jpg"]) - filename, dest, site_name = get_dest(args.FILE, args) - assert filename == "Example.jpg" - assert dest == "Example.jpg" - assert site_name == "commons.wikimedia.org" + file = get_dest(args.FILE, args) + assert file.name == "Example.jpg" + assert file.dest == "Example.jpg" + assert file.site == "commons.wikimedia.org" def test_get_dest_with_url(self): args = self.parser.parse_args( @@ -38,15 +38,15 @@ class TestGetDest: "https://en.wikipedia.org/wiki/File:Example.jpg", ] ) - filename, dest, site_name = get_dest(args.FILE, args) - assert filename == "Example.jpg" - assert dest == "Example.jpg" - assert site_name == "en.wikipedia.org" + file = get_dest(args.FILE, args) + assert file.name == "Example.jpg" + assert file.dest == "Example.jpg" + assert file.site == "en.wikipedia.org" def test_get_dest_with_bad_filename(self): args = self.parser.parse_args(["Example.jpg"]) with pytest.raises(ParseError): - filename, dest, site_name = get_dest(args.FILE, args) + _ = get_dest(args.FILE, args) def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture): args = self.parser.parse_args( @@ -56,5 +56,5 @@ class 
TestGetDest: "commons.wikimedia.org", ] ) - filename, dest, site_name = get_dest(args.FILE, args) + _ = get_dest(args.FILE, args) assert "Target is a URL, ignoring site specified with --site" in caplog.text -- cgit v1.2.3 From b136af078208882ae696b21c0d8aac009e7468d4 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:28:23 -0700 Subject: Move batch_download function to proper file --- src/wikiget/dl.py | 63 +++++++++++++++++++++++++++++++++++++++++++---- src/wikiget/wikiget.py | 67 +++++++++----------------------------------------- 2 files changed, 70 insertions(+), 60 deletions(-) diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 171b017..83aef9f 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -18,12 +18,14 @@ import logging import os import sys +from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget +from wikiget.exceptions import ParseError from wikiget.file import File from wikiget.parse import get_dest from wikiget.validations import verify_hash @@ -78,12 +80,62 @@ def prep_download(dl, args): return file +def batch_download(args): + input_file = args.FILE + dl_list = {} + errors = 0 + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file) + except OSError as e: + logging.error("File could not be read. 
The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # read the file into memory and process each line as we go + for line_num, line in enumerate(fd, start=1): + line_s = line.strip() + # ignore blank lines and lines starting with "#" (for comments) + if line_s and not line_s.startswith("#"): + dl_list[line_num] = line_s + + # TODO: validate file contents before download process starts + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for line_num, line in dl_list.items(): + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Processing '{line}' at line {line_num}") + try: + file = prep_download(line, args) + except ParseError as e: + logging.warning(f"{e} (line {line_num})") + errors += 1 + continue + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + logging.warning( + f"Unable to download '{line}' (line {line_num}) due to an error" + ) + errors += 1 + continue + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + errors += future.result() + return errors + + def download(f, args): file = f.image filename = f.name dest = f.dest site = file.site + errors = 0 + if file.exists: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] @@ -100,6 +152,7 @@ def download(f, args): logging.warning( f"File '{dest}' already exists, skipping download (use -f to force)" ) + errors += 1 else: try: fd = open(dest, "wb") @@ -108,7 +161,7 @@ def download(f, args): "File could not be written. The following error was encountered:" ) logging.error(e) - sys.exit(1) + errors += 1 else: # download the file(s) if args.verbose >= wikiget.STD_VERBOSE: @@ -143,11 +196,11 @@ def download(f, args): logging.info(success_log) else: logging.error("Hash mismatch! 
Downloaded file may be corrupt.") - # TODO: log but don't quit while in batch mode - sys.exit(1) + errors += 1 else: # no file information returned logging.error(f"Target '{filename}' does not appear to be a valid file") - # TODO: log but don't quit while in batch mode - sys.exit(1) + errors += 1 + + return errors diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 90078e1..e9a1147 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,13 +18,12 @@ import argparse import logging import sys -from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError from requests import ConnectionError, HTTPError import wikiget -from wikiget.dl import download, prep_download +from wikiget.dl import batch_download, download, prep_download from wikiget.exceptions import ParseError @@ -145,55 +144,6 @@ def configure_logging(args): else: # log only to console logging.basicConfig(level=loglevel, format=log_format) - - -def batch_download(args): - input_file = args.FILE - dl_list = {} - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file) - except OSError as e: - logging.error("File could not be read. 
The following error was encountered:") - logging.error(e) - sys.exit(1) - else: - with fd: - # read the file into memory and process each line as we go - for line_num, line in enumerate(fd, start=1): - line_s = line.strip() - # ignore blank lines and lines starting with "#" (for comments) - if line_s and not line_s.startswith("#"): - dl_list[line_num] = line_s - - # TODO: validate file contents before download process starts - with ThreadPoolExecutor( - max_workers=args.threads, - thread_name_prefix="download", - ) as executor: - futures = [] - for line_num, line in dl_list.items(): - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{line}' at line {line_num}") - try: - file = prep_download(line, args) - except ParseError as e: - logging.warning(f"{e} (line {line_num})") - continue - except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): - logging.error( - f"Unable to download '{line}' (line {line_num}) due to an error" - ) - continue - future = executor.submit(download, file, args) - futures.append(future) - # wait for downloads to finish - for future in futures: - future.result() - - def main(): # setup our environment parser = construct_parser() @@ -207,9 +157,14 @@ def main(): if args.batch: # batch download mode - # TODO: return non-zero exit code if any errors were encountered, even if some - # downloads completed successfully - batch_download(args) + errors = batch_download(args) + if errors: + # return non-zero exit code if any problems were encountered, even if some + # downloads completed successfully + logging.warning( + f"{errors} problem{'s'[:errors^1]} encountered during batch processing" + ) + sys.exit(1) else: # single download mode try: @@ -219,4 +174,6 @@ def main(): sys.exit(1) except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): sys.exit(1) - download(file, args) + errors = download(file, args) + if errors: + sys.exit(1) -- cgit v1.2.3 From 
3d37cf6f86eb6c48a3a0a094c42ade6d7aed1daf Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:31:56 -0700 Subject: Move logging configuration to new file Also, use a LoggerAdapter to add contextual info (such as filenames) to log messages when downloading, especially useful with threaded batch processing. --- src/wikiget/dl.py | 29 +++++++++++++------------ src/wikiget/logging.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/wikiget/wikiget.py | 31 +-------------------------- 3 files changed, 72 insertions(+), 45 deletions(-) create mode 100644 src/wikiget/logging.py diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 83aef9f..5491378 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -27,6 +27,7 @@ from tqdm import tqdm import wikiget from wikiget.exceptions import ParseError from wikiget.file import File +from wikiget.logging import FileLogAdapter from wikiget.parse import get_dest from wikiget.validations import verify_hash @@ -136,6 +137,9 @@ def download(f, args): errors = 0 + logger = logging.getLogger("") + adapter = FileLogAdapter(logger, {"filename": filename}) + if file.exists: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] @@ -145,22 +149,17 @@ def download(f, args): filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}" if args.output: filename_log += f" to '{dest}'" - logging.info(filename_log) - logging.info(f"{file_url}") + adapter.info(filename_log) + adapter.info(f"{file_url}") if os.path.isfile(dest) and not args.force: - logging.warning( - f"File '{dest}' already exists, skipping download (use -f to force)" - ) + adapter.warning("File already exists, skipping download (use -f to force)") errors += 1 else: try: fd = open(dest, "wb") except OSError as e: - logging.error( - "File could not be written. The following error was encountered:" - ) - logging.error(e) + adapter.error(f"File could not be written. 
{e}") errors += 1 else: # download the file(s) @@ -185,22 +184,22 @@ def download(f, args): # verify file integrity and log details dl_sha1 = verify_hash(dest) - logging.info(f"Remote file SHA1 is {file_sha1}") - logging.info(f"Local file SHA1 is {dl_sha1}") + adapter.info(f"Remote file SHA1 is {file_sha1}") + adapter.info(f"Local file SHA1 is {dl_sha1}") if dl_sha1 == file_sha1: - logging.info("Hashes match!") + adapter.info("Hashes match!") # at this point, we've successfully downloaded the file success_log = f"'{filename}' downloaded" if args.output: success_log += f" to '{dest}'" - logging.info(success_log) + adapter.info(success_log) else: - logging.error("Hash mismatch! Downloaded file may be corrupt.") + adapter.error("Hash mismatch! Downloaded file may be corrupt.") errors += 1 else: # no file information returned - logging.error(f"Target '{filename}' does not appear to be a valid file") + adapter.warning("Target does not appear to be a valid file") errors += 1 return errors diff --git a/src/wikiget/logging.py b/src/wikiget/logging.py new file mode 100644 index 0000000..1536156 --- /dev/null +++ b/src/wikiget/logging.py @@ -0,0 +1,57 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . 
+ +import logging + +import wikiget + + +class FileLogAdapter(logging.LoggerAdapter): + def process(self, msg, kwargs): + return f"[{self.extra['filename']}] {msg}", kwargs + + +def configure_logging(args): + loglevel = logging.WARNING + if args.verbose >= wikiget.VERY_VERBOSE: + # this includes API and library messages + loglevel = logging.DEBUG + elif args.verbose >= wikiget.STD_VERBOSE: + loglevel = logging.INFO + elif args.quiet: + loglevel = logging.ERROR + + # configure logging: + # console log level is set via -v, -vv, and -q options; + # file log level is always debug (TODO: make this user configurable) + base_format = "%(message)s" + log_format = "[%(levelname)s] " + base_format + if args.logfile: + # log to console and file + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s [%(levelname)-7s] " + base_format, + filename=args.logfile, + ) + + console = logging.StreamHandler() + console.setLevel(loglevel) + console.setFormatter(logging.Formatter(log_format)) + logging.getLogger("").addHandler(console) + else: + # log only to console + logging.basicConfig(level=loglevel, format=log_format) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index e9a1147..5b84dac 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -25,6 +25,7 @@ from requests import ConnectionError, HTTPError import wikiget from wikiget.dl import batch_download, download, prep_download from wikiget.exceptions import ParseError +from wikiget.logging import configure_logging def construct_parser(): @@ -114,36 +115,6 @@ def construct_parser(): return parser -def configure_logging(args): - loglevel = logging.WARNING - if args.verbose >= wikiget.VERY_VERBOSE: - # this includes API and library messages - loglevel = logging.DEBUG - elif args.verbose >= wikiget.STD_VERBOSE: - loglevel = logging.INFO - elif args.quiet: - loglevel = logging.ERROR - - # configure logging: - # console log level is set via -v, -vv, and -q options; - # file log level is always 
debug (TODO: make this user configurable) - base_format = "%(threadName)s - %(message)s" - log_format = "[%(levelname)s] " + base_format - if args.logfile: - # log to console and file - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s [%(levelname)-7s] " + base_format, - filename=args.logfile, - ) - - console = logging.StreamHandler() - console.setLevel(loglevel) - console.setFormatter(logging.Formatter(log_format)) - logging.getLogger("").addHandler(console) - else: - # log only to console - logging.basicConfig(level=loglevel, format=log_format) def main(): # setup our environment parser = construct_parser() -- cgit v1.2.3 From c1820026f97eaf671c29ab30f02879de0ac4df89 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:36:14 -0700 Subject: Add type annotations to source files --- src/wikiget/dl.py | 10 ++++++---- src/wikiget/logging.py | 3 ++- src/wikiget/parse.py | 4 +++- src/wikiget/validations.py | 7 ++++--- src/wikiget/wikiget.py | 4 ++-- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 5491378..5b5b43b 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -18,9 +18,11 @@ import logging import os import sys +from argparse import Namespace from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError, Site +from mwclient.image import Image from requests import ConnectionError, HTTPError from tqdm import tqdm @@ -32,7 +34,7 @@ from wikiget.parse import get_dest from wikiget.validations import verify_hash -def query_api(filename, site_name, args): +def query_api(filename: str, site_name: str, args: Namespace) -> Image: # connect to site and identify ourselves logging.info(f"Connecting to {site_name}") try: @@ -75,13 +77,13 @@ def query_api(filename, site_name, args): return image -def prep_download(dl, args): +def prep_download(dl: str, args: Namespace) -> File: file = get_dest(dl, args) file.image = query_api(file.name, 
file.site, args) return file -def batch_download(args): +def batch_download(args: Namespace) -> int: input_file = args.FILE dl_list = {} errors = 0 @@ -129,7 +131,7 @@ def batch_download(args): return errors -def download(f, args): +def download(f: File, args: Namespace) -> int: file = f.image filename = f.name dest = f.dest diff --git a/src/wikiget/logging.py b/src/wikiget/logging.py index 1536156..87b917c 100644 --- a/src/wikiget/logging.py +++ b/src/wikiget/logging.py @@ -16,6 +16,7 @@ # along with Wikiget. If not, see . import logging +from argparse import Namespace import wikiget @@ -25,7 +26,7 @@ class FileLogAdapter(logging.LoggerAdapter): return f"[{self.extra['filename']}] {msg}", kwargs -def configure_logging(args): +def configure_logging(args: Namespace) -> None: loglevel = logging.WARNING if args.verbose >= wikiget.VERY_VERBOSE: # this includes API and library messages diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py index 4e9b195..fe3fe43 100644 --- a/src/wikiget/parse.py +++ b/src/wikiget/parse.py @@ -16,14 +16,16 @@ # along with Wikiget. If not, see . import logging +from argparse import Namespace from urllib.parse import unquote, urlparse import wikiget from wikiget.exceptions import ParseError +from wikiget.file import File from wikiget.validations import valid_file -def get_dest(dl, args): +def get_dest(dl: str, args: Namespace) -> File: url = urlparse(dl) if url.netloc: diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index 2bce34e..c9e7bcf 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -17,11 +17,12 @@ import hashlib import re +from typing import Optional from wikiget import BLOCKSIZE -def valid_file(search_string): +def valid_file(search_string: str) -> Optional[re.Match]: """ Determines if the given string contains a valid file name, defined as a string ending with a '.' 
and at least one character, beginning with 'File:' or 'Image:', @@ -38,7 +39,7 @@ def valid_file(search_string): return file_regex.search(search_string) -def valid_site(search_string): +def valid_site(search_string: str) -> Optional[re.Match]: """ Determines if the given string contains a valid site name, defined as a string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all subdomains of those @@ -54,7 +55,7 @@ def valid_site(search_string): return site_regex.search(search_string) -def verify_hash(filename): +def verify_hash(filename: str) -> str: """ Calculates the SHA1 hash of the given file for comparison with a known value. diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 5b84dac..e64d00e 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -28,7 +28,7 @@ from wikiget.exceptions import ParseError from wikiget.logging import configure_logging -def construct_parser(): +def construct_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description=""" A tool for downloading files from MediaWiki sites using the file name or @@ -115,7 +115,7 @@ def construct_parser(): return parser -def main(): +def main() -> None: # setup our environment parser = construct_parser() args = parser.parse_args() -- cgit v1.2.3