From a1995912ed24b37a990f3fcd5e91dbf7b46669fb Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 15:17:04 -0700 Subject: Reorganize file tree --- src/wikiget/__init__.py | 28 ++++++++ src/wikiget/dl.py | 159 +++++++++++++++++++++++++++++++++++++++++++++ src/wikiget/validations.py | 64 ++++++++++++++++++ src/wikiget/version.py | 1 + src/wikiget/wikiget.py | 157 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 409 insertions(+) create mode 100644 src/wikiget/__init__.py create mode 100644 src/wikiget/dl.py create mode 100644 src/wikiget/validations.py create mode 100644 src/wikiget/version.py create mode 100644 src/wikiget/wikiget.py (limited to 'src') diff --git a/src/wikiget/__init__.py b/src/wikiget/__init__.py new file mode 100644 index 0000000..4adcae3 --- /dev/null +++ b/src/wikiget/__init__.py @@ -0,0 +1,28 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +from mwclient import __version__ as mwclient_version + +from .version import __version__ as wikiget_version + +# set some global constants +BLOCKSIZE = 65536 +CHUNKSIZE = 1024 +DEFAULT_SITE = "commons.wikimedia.org" +DEFAULT_PATH = "/w/" +USER_AGENT = (f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) " + f"mwclient/{mwclient_version}") diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py new file mode 100644 index 0000000..8f32218 --- /dev/null +++ b/src/wikiget/dl.py @@ -0,0 +1,159 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import logging +import os +import sys +from urllib.parse import unquote, urlparse + +from mwclient import APIError, InvalidResponse, LoginError, Site +from requests import ConnectionError, HTTPError +from tqdm import tqdm + +from . 
import CHUNKSIZE, DEFAULT_SITE, USER_AGENT +from .validations import valid_file, verify_hash + + +def download(dl, args): + url = urlparse(dl) + + if url.netloc: + filename = url.path + site_name = url.netloc + if args.site is not DEFAULT_SITE: + # this will work even if the user specifies 'commons.wikimedia.org' + logging.warning("target is a URL, " + "ignoring site specified with --site") + else: + filename = dl + site_name = args.site + + file_match = valid_file(filename) + + # check if this is a valid file + if file_match and file_match.group(1): + # has File:/Image: prefix and extension + filename = file_match.group(2) + else: + # no file extension and/or prefix, probably an article + logging.error(f"Could not parse input '{filename}' as a file.") + sys.exit(1) + + filename = unquote(filename) # remove URL encoding for special characters + + dest = args.output or filename + + logging.debug(f"User agent: {USER_AGENT}") + + # connect to site and identify ourselves + logging.info(f"Site name: {site_name}") + try: + site = Site(site_name, path=args.path, clients_useragent=USER_AGENT) + if args.username and args.password: + site.login(args.username, args.password) + except ConnectionError as e: + # usually this means there is no such site, or there's no network + # connection, though it could be a certificate problem + logging.error("Couldn't connect to specified site.") + logging.debug("Full error message:") + logging.debug(e) + sys.exit(1) + except HTTPError as e: + # most likely a 403 forbidden or 404 not found error for api.php + logging.error("Couldn't find the specified wiki's api.php. " + "Check the value of --path.") + logging.debug("Full error message:") + logging.debug(e) + sys.exit(1) + except (InvalidResponse, LoginError) as e: + # InvalidResponse: site exists, but we couldn't communicate with the + # API endpoint for some reason other than an HTTP error. + # LoginError: missing or invalid credentials + logging.error(e) + sys.exit(1) + + # get info about the target file + try: + file = site.images[filename] + except APIError as e: + # an API error at this point likely means access is denied, + # which could happen with a private wiki + logging.error("Access denied. Try providing credentials with " + "--username and --password.") + logging.debug("Full error message:") + for i in e.args: + logging.debug(i) + sys.exit(1) + + if file.imageinfo != {}: + # file exists either locally or at a common repository, + # like Wikimedia Commons + file_url = file.imageinfo["url"] + file_size = file.imageinfo["size"] + file_sha1 = file.imageinfo["sha1"] + + filename_log = (f"Downloading '{filename}' ({file_size} bytes) " + f"from {site.host}") + if args.output: + filename_log += f" to '{dest}'" + logging.info(filename_log) + logging.info(f"{file_url}") + + if os.path.isfile(dest) and not args.force: + logging.warning(f"File '{dest}' already exists, skipping download " + "(use -f to ignore)") + else: + try: + fd = open(dest, "wb") + except IOError as e: + logging.error("File could not be written. 
" + "The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + # download the file(s) + if args.verbose >= 1: + leave_bars = True + else: + leave_bars = False + with tqdm(leave=leave_bars, total=file_size, + unit="B", unit_scale=True, + unit_divisor=CHUNKSIZE) as progress_bar: + with fd: + res = site.connection.get(file_url, stream=True) + progress_bar.set_postfix(file=dest, refresh=False) + for chunk in res.iter_content(CHUNKSIZE): + fd.write(chunk) + progress_bar.update(len(chunk)) + + # verify file integrity and optionally print details + dl_sha1 = verify_hash(dest) + + logging.info(f"Downloaded file SHA1 is {dl_sha1}") + logging.info(f"Server file SHA1 is {file_sha1}") + if dl_sha1 == file_sha1: + logging.info("Hashes match!") + # at this point, we've successfully downloaded the file + else: + logging.error("Hash mismatch! Downloaded file may be corrupt.") + sys.exit(1) + + else: + # no file information returned + logging.error(f"Target '{filename}' does not appear to be " + "a valid file.") + sys.exit(1) diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py new file mode 100644 index 0000000..bd99570 --- /dev/null +++ b/src/wikiget/validations.py @@ -0,0 +1,64 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018, 2019, 2020 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import hashlib +import re + +from . import BLOCKSIZE + + +def valid_file(search_string): + """ + Determines if the given string contains a valid file name, defined as a + string ending with a '.' and at least one character, beginning with 'File:' + or 'Image:', the standard file prefixes in MediaWiki. + :param search_string: string to validate + :returns: a regex Match object if there's a match or None otherwise + """ + # second group could also restrict to file extensions with three or more + # letters with ([^/\r\n\t\f\v]+\.\w{3,}) + file_regex = re.compile(r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$", re.I) + return file_regex.search(search_string) + + +def valid_site(search_string): + """ + Determines if the given string contains a valid site name, defined as a + string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all + subdomains of those domains. Eventually, it should be possible to support + any MediaWiki site, regardless of domain name. + :param search_string: string to validate + :returns: a regex Match object if there's a match or None otherwise + """ + site_regex = re.compile(r"wiki[mp]edia\.org$", re.I) + return site_regex.search(search_string) + + +def verify_hash(filename): + """ + Calculates the SHA1 hash of the given file for comparison with a known + value. 
+ :param filename: name of the file to calculate a hash for + :return: hash digest + """ + hasher = hashlib.sha1() + with open(filename, "rb") as dl: + buf = dl.read(BLOCKSIZE) + while len(buf) > 0: + hasher.update(buf) + buf = dl.read(BLOCKSIZE) + return hasher.hexdigest() diff --git a/src/wikiget/version.py b/src/wikiget/version.py new file mode 100644 index 0000000..dd9b22c --- /dev/null +++ b/src/wikiget/version.py @@ -0,0 +1 @@ +__version__ = "0.5.1" diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py new file mode 100644 index 0000000..a8679c9 --- /dev/null +++ b/src/wikiget/wikiget.py @@ -0,0 +1,157 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import argparse +import logging +import sys + +from . import DEFAULT_SITE, DEFAULT_PATH, wikiget_version +from .dl import download + + +def main(): + """ + Main entry point for console script. Automatically compiled by setuptools + when installed with `pip install` or `python setup.py install`. + """ + + parser = argparse.ArgumentParser(description=""" + A tool for downloading files from + MediaWiki sites using the file name or + description page URL + """, + epilog=""" + Copyright (C) 2018-2021 Cody Logan + and contributors. + License GPLv3+: GNU GPL version 3 or later + . + This is free software; you are free to + change and redistribute it under certain + conditions. There is NO WARRANTY, to the + extent permitted by law. 
+ """) + parser.add_argument("FILE", help=""" + name of the file to download with the File: + prefix, or the URL of its file description page + """) + parser.add_argument("-V", "--version", action="version", + version=f"%(prog)s {wikiget_version}") + message_options = parser.add_mutually_exclusive_group() + message_options.add_argument("-q", "--quiet", + help="suppress warning messages", + action="store_true") + message_options.add_argument("-v", "--verbose", + help="print detailed information; " + "use -vv for even more detail", + action="count", default=0) + parser.add_argument("-f", "--force", + help="force overwriting existing files", + action="store_true") + parser.add_argument("-s", "--site", default=DEFAULT_SITE, + help="MediaWiki site to download from " + "(default: %(default)s)") + parser.add_argument("-p", "--path", default=DEFAULT_PATH, + help="MediaWiki site path, where api.php is located " + "(default: %(default)s)") + parser.add_argument("--username", default="", + help="MediaWiki site username, for private wikis") + parser.add_argument("--password", default="", + help="MediaWiki site password, for private wikis") + output_options = parser.add_mutually_exclusive_group() + output_options.add_argument("-o", "--output", + help="write download to OUTPUT") + output_options.add_argument("-a", "--batch", + help="treat FILE as a textfile containing " + "multiple files to download, one URL or " + "filename per line", action="store_true") + parser.add_argument("-l", "--logfile", default="", + help="save log output to LOGFILE") + + args = parser.parse_args() + + loglevel = logging.WARNING + if args.verbose >= 2: + # this includes API and library messages + loglevel = logging.DEBUG + elif args.verbose >= 1: + loglevel = logging.INFO + elif args.quiet: + loglevel = logging.ERROR + + # configure logging: + # console log level is set via -v, -vv, and -q options + # file log level is always info (TODO: add debug option) + if args.logfile: + # log to console and file + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)-7s] %(message)s", + filename=args.logfile + ) + + console = logging.StreamHandler() + # TODO: even when loglevel is set to logging.DEBUG, + # debug messages aren't printing to console + console.setLevel(loglevel) + console.setFormatter( + logging.Formatter("[%(levelname)s] %(message)s") + ) + logging.getLogger("").addHandler(console) + else: + # log only to console + logging.basicConfig( + level=loglevel, + format="[%(levelname)s] %(message)s" + ) + + # log events are appended to the file if it already exists, + # so note the start of a new download session + logging.info(f"Starting download session using wikiget {wikiget_version}") + # logging.info(f"Log level is set to {loglevel}") + + if args.batch: + # batch download mode + input_file = args.FILE + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file, "r") + except IOError as e: + logging.error("File could not be read. 
" + "The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # store file contents in memory in case something + # happens to the file while we're downloading + for _, line in enumerate(fd): + dl_list.append(line) + + # TODO: validate file contents before download process starts + for line_num, url in enumerate(dl_list, start=1): + url = url.strip() + # keep track of batch file line numbers for + # debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + download(url, args) + else: + # single download mode + dl = args.FILE + download(dl, args) -- cgit v1.2.3 From 75a79785d851efa319f4216e0d3471d30a02154a Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 15:45:43 -0700 Subject: Style and format fixes --- src/wikiget/__init__.py | 5 +++-- src/wikiget/dl.py | 30 ++++++++++++++++-------------- src/wikiget/wikiget.py | 29 +++++++++++------------------ 3 files changed, 30 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/wikiget/__init__.py b/src/wikiget/__init__.py index 20ea620..5b917cf 100644 --- a/src/wikiget/__init__.py +++ b/src/wikiget/__init__.py @@ -24,8 +24,9 @@ BLOCKSIZE = 65536 CHUNKSIZE = 1024 DEFAULT_SITE = "commons.wikimedia.org" DEFAULT_PATH = "/w/" -USER_AGENT = "wikiget/{} (https://github.com/clpo13/wikiget) mwclient/{}".format( - wikiget_version, mwclient_version +USER_AGENT = ( + f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) " + f"mwclient/{mwclient_version}" ) STD_VERBOSE = 1 VERY_VERBOSE = 2 diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 9850ce8..791db61 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -36,8 +36,7 @@ def download(dl, args): site_name = url.netloc if args.site is not wikiget.DEFAULT_SITE: # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, " - "ignoring site specified with --site") + logging.warning("target is a URL, ignoring site specified with --site") else: filename = dl site_name = args.site @@ -74,8 +73,9 @@ def download(dl, args): sys.exit(1) except HTTPError as e: # most likely a 403 forbidden or 404 not found error for api.php - logging.error("Couldn't find the specified wiki's api.php. " - "Check the value of --path.") + logging.error( + "Couldn't find the specified wiki's api.php. Check the value of --path." + ) logging.debug("Full error message:") logging.debug(e) sys.exit(1) @@ -92,8 +92,10 @@ def download(dl, args): except APIError as e: # an API error at this point likely means access is denied, # which could happen with a private wiki - logging.error("Access denied. Try providing credentials with " - "--username and --password.") + logging.error( + "Access denied. Try providing credentials with " + "--username and --password." 
+ ) logging.debug("Full error message:") for i in e.args: logging.debug(i) @@ -106,22 +108,23 @@ def download(dl, args): file_size = file.imageinfo["size"] file_sha1 = file.imageinfo["sha1"] - filename_log = (f"Downloading '{filename}' ({file_size} bytes) " - f"from {site.host}") + filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}" if args.output: filename_log += f" to '{dest}'" logging.info(filename_log) logging.info(f"{file_url}") if os.path.isfile(dest) and not args.force: - logging.warning(f"File '{dest}' already exists, skipping download " - "(use -f to ignore)") + logging.warning( + f"File '{dest}' already exists, skipping download (use -f to ignore)" + ) else: try: fd = open(dest, "wb") except OSError as e: - logging.error("File could not be written. " - "The following error was encountered:") + logging.error( + "File could not be written. The following error was encountered:" + ) logging.error(e) sys.exit(1) else: @@ -158,6 +161,5 @@ def download(dl, args): else: # no file information returned - logging.error(f"Target '{filename}' does not appear to be " - "a valid file.") + logging.error(f"Target '{filename}' does not appear to be a valid file.") sys.exit(1) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index b9a227f..bc6de38 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -102,10 +102,7 @@ def main(): action="store_true", ) parser.add_argument( - "-l", - "--logfile", - default="", - help="save log output to LOGFILE" + "-l", "--logfile", default="", help="save log output to LOGFILE" ) args = parser.parse_args() @@ -127,23 +124,18 @@ def main(): logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-7s] %(message)s", - filename=args.logfile + filename=args.logfile, ) console = logging.StreamHandler() # TODO: even when loglevel is set to logging.DEBUG, # debug messages aren't printing to console console.setLevel(loglevel) - console.setFormatter( - logging.Formatter("[%(levelname)s] %(message)s") - ) + console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) logging.getLogger("").addHandler(console) else: # log only to console - logging.basicConfig( - level=loglevel, - format="[%(levelname)s] %(message)s" - ) + logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") # log events are appended to the file if it already exists, # so note the start of a new download session @@ -158,10 +150,11 @@ def main(): logging.info(f"Using batch file '{input_file}'.") try: - fd = open(input_file, "r") + fd = open(input_file) except OSError as e: - logging.error("File could not be read. " - "The following error was encountered:") + logging.error( + "File could not be read. 
The following error was encountered:" + ) logging.error(e) sys.exit(1) else: @@ -173,11 +166,11 @@ def main(): # TODO: validate file contents before download process starts for line_num, url in enumerate(dl_list, start=1): - url = url.strip() + s_url = url.strip() # keep track of batch file line numbers for # debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - download(url, args) + logging.info(f"Downloading '{s_url}' at line {line_num}:") + download(s_url, args) else: # single download mode dl = args.FILE -- cgit v1.2.3 From 485df31f095a9b629a1dcc04af13956325856d8c Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:51:58 -0700 Subject: Update README and do some code cleanup --- src/wikiget/__init__.py | 2 +- src/wikiget/dl.py | 17 ++++++++--------- src/wikiget/validations.py | 2 +- src/wikiget/wikiget.py | 47 ++++++++++++++++++++-------------------------- 4 files changed, 30 insertions(+), 38 deletions(-) (limited to 'src') diff --git a/src/wikiget/__init__.py b/src/wikiget/__init__.py index 5b917cf..3946868 100644 --- a/src/wikiget/__init__.py +++ b/src/wikiget/__init__.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 791db61..d32736f 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify @@ -65,8 +65,8 @@ def download(dl, args): if args.username and args.password: site.login(args.username, args.password) except ConnectionError as e: - # usually this means there is no such site, or there's no network - # connection, though it could be a certificate problem + # usually this means there is no such site, or there's no network connection, + # though it could be a certificate problem logging.error("Couldn't connect to specified site.") logging.debug("Full error message:") logging.debug(e) @@ -80,8 +80,8 @@ def download(dl, args): logging.debug(e) sys.exit(1) except (InvalidResponse, LoginError) as e: - # InvalidResponse: site exists, but we couldn't communicate with the - # API endpoint for some reason other than an HTTP error. + # InvalidResponse: site exists, but we couldn't communicate with the API + # endpoint for some reason other than an HTTP error. # LoginError: missing or invalid credentials logging.error(e) sys.exit(1) @@ -90,8 +90,8 @@ def download(dl, args): try: file = site.images[filename] except APIError as e: - # an API error at this point likely means access is denied, - # which could happen with a private wiki + # an API error at this point likely means access is denied, which could happen + # with a private wiki logging.error( "Access denied. Try providing credentials with " "--username and --password." 
@@ -102,8 +102,7 @@ def download(dl, args): sys.exit(1) if file.imageinfo != {}: - # file exists either locally or at a common repository, - # like Wikimedia Commons + # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] file_size = file.imageinfo["size"] file_sha1 = file.imageinfo["sha1"] diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index dc70df4..8ebd996 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018, 2019, 2020 Cody Logan +# Copyright (C) 2018-2020 Cody Logan # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index bc6de38..934107e 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify @@ -25,32 +25,27 @@ from wikiget.dl import download def main(): """ - Main entry point for console script. Automatically compiled by setuptools - when installed with `pip install` or `python setup.py install`. + Main entry point for console script. Automatically compiled by setuptools when + installed with `pip install` or `python setup.py install`. """ parser = argparse.ArgumentParser( description=""" - A tool for downloading files from - MediaWiki sites using the file name or + A tool for downloading files from MediaWiki sites using the file name or description page URL """, epilog=""" - Copyright (C) 2018-2023 Cody Logan - and contributors. - License GPLv3+: GNU GPL version 3 or later - . - This is free software; you are free to - change and redistribute it under certain - conditions. There is NO WARRANTY, to the - extent permitted by law. + Copyright (C) 2018-2023 Cody Logan and contributors. License GPLv3+: GNU GPL + version 3 or later . This is free + software; you are free to change and redistribute it under certain conditions. + There is NO WARRANTY, to the extent permitted by law. 
""", ) parser.add_argument( "FILE", help=""" - name of the file to download with the File: - prefix, or the URL of its file description page + name of the file to download with the File: prefix, or the URL of its file + description page """, ) parser.add_argument( @@ -96,9 +91,8 @@ def main(): output_options.add_argument( "-a", "--batch", - help="treat FILE as a textfile containing " - "multiple files to download, one URL or " - "filename per line", + help="treat FILE as a textfile containing multiple files to download, one URL " + "or filename per line", action="store_true", ) parser.add_argument( @@ -117,7 +111,7 @@ def main(): loglevel = logging.ERROR # configure logging: - # console log level is set via -v, -vv, and -q options + # console log level is set via -v, -vv, and -q options; # file log level is always info (TODO: add debug option) if args.logfile: # log to console and file @@ -128,8 +122,8 @@ def main(): ) console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, - # debug messages aren't printing to console + # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't + # printing to console console.setLevel(loglevel) console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) logging.getLogger("").addHandler(console) @@ -137,8 +131,8 @@ def main(): # log only to console logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") - # log events are appended to the file if it already exists, - # so note the start of a new download session + # log events are appended to the file if it already exists, so note the start of a + # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") # logging.info(f"Log level is set to {loglevel}") @@ -159,16 +153,15 @@ def main(): sys.exit(1) else: with fd: - # store file contents in memory in case something - # happens to the file while we're downloading + # store file contents in memory in case something happens to the file + # while we're downloading for _, line in enumerate(fd): dl_list.append(line) # TODO: validate file contents before download process starts for line_num, url in enumerate(dl_list, start=1): s_url = url.strip() - # keep track of batch file line numbers for - # debugging/logging purposes + # keep track of batch file line numbers for debugging/logging purposes logging.info(f"Downloading '{s_url}' at line {line_num}:") download(s_url, args) else: -- cgit v1.2.3 From e18222daecca1656390652cbd1c7f6985080241a Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:58:37 -0700 Subject: Add short user and pass options Swapped path short option from -p to -P and added -u for username and -p for password --- src/wikiget/wikiget.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 934107e..f482280 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -75,16 +75,22 @@ def main(): help="MediaWiki site to download from (default: %(default)s)", ) parser.add_argument( - "-p", + "-P", "--path", default=wikiget.DEFAULT_PATH, help="MediaWiki site path, where api.php is located (default: %(default)s)", ) parser.add_argument( - "--username", default="", help="MediaWiki site username, for private wikis" + "-u", + "--username", + default="", + help="MediaWiki site username, for private wikis" ) parser.add_argument( - "--password", default="", help="MediaWiki site password, for private wikis" + "-p", + 
"--password", + default="", + help="MediaWiki site password, for private wikis" ) output_options = parser.add_mutually_exclusive_group() output_options.add_argument("-o", "--output", help="write download to OUTPUT") -- cgit v1.2.3 From 865088207b39427b6b932de4f312d82bd5e05a53 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:26:09 -0700 Subject: Refactor for better code organization --- src/wikiget/dl.py | 23 ++++++++++++++++++++++- src/wikiget/file.py | 27 +++++++++++++++++++++++++++ src/wikiget/wikiget.py | 38 +++++++++++++++++++++++--------------- 3 files changed, 72 insertions(+), 16 deletions(-) create mode 100644 src/wikiget/file.py (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index d32736f..2b2befa 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -25,10 +25,11 @@ from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget +from wikiget.file import File from wikiget.validations import valid_file, verify_hash -def download(dl, args): +def get_dest(dl, args): url = urlparse(dl) if url.netloc: @@ -56,6 +57,10 @@ def download(dl, args): dest = args.output or filename + return filename, dest, site_name + + +def query_api(filename, site_name, args): logging.debug(f"User agent: {wikiget.USER_AGENT}") # connect to site and identify ourselves @@ -101,6 +106,22 @@ def download(dl, args): logging.debug(i) sys.exit(1) + return file, site + + +def prep_download(dl, args): + filename, dest, site_name = get_dest(dl, args) + file = File(filename, dest) + file.object, file.site = query_api(file.name, site_name, args) + return file + + +def download(f, args): + file = f.object + filename = f.name + site = f.site + dest = f.dest + if file.imageinfo != {}: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] diff --git a/src/wikiget/file.py b/src/wikiget/file.py new file mode 100644 index 0000000..60a71e0 --- /dev/null +++ b/src/wikiget/file.py @@ -0,0 +1,27 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + + +class File: + def __init__(self, name, dest=None): + self.object = None + self.site = None + self.name = name + if dest is None: + self.dest = name + else: + self.dest = dest diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index f482280..80d5057 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -20,15 +20,10 @@ import logging import sys import wikiget -from wikiget.dl import download +from wikiget.dl import download, prep_download -def main(): - """ - Main entry point for console script. Automatically compiled by setuptools when - installed with `pip install` or `python setup.py install`. 
- """ - +def construct_parser(): parser = argparse.ArgumentParser( description=""" A tool for downloading files from MediaWiki sites using the file name or @@ -84,13 +79,13 @@ def main(): "-u", "--username", default="", - help="MediaWiki site username, for private wikis" + help="MediaWiki site username, for private wikis", ) parser.add_argument( "-p", "--password", default="", - help="MediaWiki site password, for private wikis" + help="MediaWiki site password, for private wikis", ) output_options = parser.add_mutually_exclusive_group() output_options.add_argument("-o", "--output", help="write download to OUTPUT") @@ -104,7 +99,19 @@ def main(): parser.add_argument( "-l", "--logfile", default="", help="save log output to LOGFILE" ) + parser.add_argument( + "-j", + "--threads", + default=1, + help="Number of parallel downloads to attempt in batch mode", + type=int, + ) + return parser + + +def main(): + parser = construct_parser() args = parser.parse_args() loglevel = logging.WARNING @@ -165,12 +172,13 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, url in enumerate(dl_list, start=1): - s_url = url.strip() + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{s_url}' at line {line_num}:") - download(s_url, args) + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + download(file, args) else: # single download mode - dl = args.FILE - download(dl, args) + file = prep_download(args.FILE, args) + download(file, args) -- cgit v1.2.3 From 93e879e30ec2776c5d347e72be32f3ef30bd1410 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:28:23 -0700 Subject: Add parallel download option in batch mode Number of download threads can be set with new -j option. Unfortunately, it's not that much faster than downloading in serial, since the API calls made before the downloads actually start are not (and ideally should not be) parallelized. Still, for large batches, it saves a bit of time. Known issue: due to the download threads writing to the log asynchronously, the messages get jumbled up. This will be fixed eventually. 
--- src/wikiget/wikiget.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 80d5057..c16d3f6 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,6 +18,7 @@ import argparse import logging import sys +from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download @@ -172,12 +173,18 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - download(file, args) + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3 From 43c1fc258499f54977a1b7b594b295c2dae03114 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 16:07:54 -0700 Subject: Reduce repeated code in log configuration --- src/wikiget/wikiget.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index c16d3f6..51c870a 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -127,11 +127,13 @@ def main(): # configure logging: # console log level is set via -v, -vv, and -q options; # file log level is always info (TODO: add debug option) + base_format = "%(threadName)s - %(message)s" + log_format = "[%(levelname)s] " + base_format if args.logfile: # log to console and file logging.basicConfig( level=logging.INFO, - format="%(asctime)s [%(levelname)-7s] %(message)s", + format="%(asctime)s [%(levelname)-7s] " + base_format, filename=args.logfile, ) @@ -139,11 +141,11 @@ def main(): # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't # printing to console console.setLevel(loglevel) - console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) + console.setFormatter(logging.Formatter(log_format)) logging.getLogger("").addHandler(console) else: # log only to console - logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") + logging.basicConfig(level=loglevel, format=log_format) # log events are appended to the file if it already exists, so note the start of a # new download session @@ -173,7 +175,10 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - with ThreadPoolExecutor(max_workers=args.threads) as executor: + with ThreadPoolExecutor( + max_workers=args.threads, + thread_name_prefix="download", + ) as executor: futures = [] for line_num, line in enumerate(dl_list, start=1): url = line.strip() -- cgit v1.2.3 From 206f0fe0b97610fc371ad0acdd5146ac12eacfe7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 9 Oct 2023 13:50:30 -0700 Subject: Style cleanup --- src/wikiget/wikiget.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/wikiget/wikiget.py 
b/src/wikiget/wikiget.py index 51c870a..8c067e0 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -104,7 +104,7 @@ def construct_parser(): "-j", "--threads", default=1, - help="Number of parallel downloads to attempt in batch mode", + help="number of parallel downloads to attempt in batch mode", type=int, ) -- cgit v1.2.3 From 8b70abecb543099528ecc8c3b1edfe0330d3d223 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 10:11:20 -0700 Subject: Refactor code and improve docstrings --- src/wikiget/file.py | 16 ++++++--- src/wikiget/validations.py | 26 +++++++++----- src/wikiget/wikiget.py | 87 +++++++++++++++++++++++++--------------------- 3 files changed, 75 insertions(+), 54 deletions(-) (limited to 'src') diff --git a/src/wikiget/file.py b/src/wikiget/file.py index 60a71e0..c1b9ae6 100644 --- a/src/wikiget/file.py +++ b/src/wikiget/file.py @@ -17,11 +17,17 @@ class File: - def __init__(self, name, dest=None): + def __init__(self, name, dest=""): + """ + Initializes a new file with the specified name and an optional destination name. + + :param name: name of the file + :type name: str + :param dest: destination of the file, if different from the name; if not + specified, defaults to the name + :type dest: str, optional + """ self.object = None self.site = None self.name = name - if dest is None: - self.dest = name - else: - self.dest = dest + self.dest = dest if dest else name diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index 8ebd996..1610417 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -23,11 +23,14 @@ from wikiget import BLOCKSIZE def valid_file(search_string): """ - Determines if the given string contains a valid file name, defined as a - string ending with a '.' and at least one character, beginning with 'File:' - or 'Image:', the standard file prefixes in MediaWiki. + Determines if the given string contains a valid file name, defined as a string + ending with a '.' and at least one character, beginning with 'File:' or 'Image:', + the standard file prefixes in MediaWiki. + :param search_string: string to validate + :type search_string: str :returns: a regex Match object if there's a match or None otherwise + :rtype: re.Match """ # second group could also restrict to file extensions with three or more # letters with ([^/\r\n\t\f\v]+\.\w{3,}) @@ -37,12 +40,15 @@ def valid_file(search_string): def valid_site(search_string): """ - Determines if the given string contains a valid site name, defined as a - string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all - subdomains of those domains. Eventually, it should be possible to support - any MediaWiki site, regardless of domain name. + Determines if the given string contains a valid site name, defined as a string + ending with 'wikipedia.org' or 'wikimedia.org'. This covers all subdomains of those + domains. Eventually, it should be possible to support any MediaWiki site, regardless + of domain name. + :param search_string: string to validate + :type search_string: str :returns: a regex Match object if there's a match or None otherwise + :rtype: re.Match """ site_regex = re.compile(r"wiki[mp]edia\.org$", re.I) return site_regex.search(search_string) @@ -50,10 +56,12 @@ def valid_site(search_string): def verify_hash(filename): """ - Calculates the SHA1 hash of the given file for comparison with a known - value. + Calculates the SHA1 hash of the given file for comparison with a known value. 
+ :param filename: name of the file to calculate a hash for + :type filename: str :return: hash digest + :rtype: str """ hasher = hashlib.sha1() # noqa: S324 with open(filename, "rb") as dl: diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 8c067e0..c470b46 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -111,10 +111,7 @@ def construct_parser(): return parser -def main(): - parser = construct_parser() - args = parser.parse_args() - +def configure_logging(args): loglevel = logging.WARNING if args.verbose >= wikiget.VERY_VERBOSE: # this includes API and library messages @@ -147,6 +144,51 @@ def main(): # log only to console logging.basicConfig(level=loglevel, format=log_format) + +def batch_download(args): + input_file = args.FILE + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file) + except OSError as e: + logging.error("File could not be read. The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # store file contents in memory in case something happens to the file + # while we're downloading + for _, line in enumerate(fd): + dl_list.append(line) + + # TODO: validate file contents before download process starts + with ThreadPoolExecutor( + max_workers=args.threads, + thread_name_prefix="download", + ) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() + + +def main(): + # setup + parser = construct_parser() + args = parser.parse_args() + + configure_logging(args) + # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") @@ -154,42 +196,7 @@ def main(): if args.batch: # batch download mode - input_file = args.FILE - dl_list = [] - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file) - except OSError as e: - logging.error( - "File could not be read. 
The following error was encountered:" - ) - logging.error(e) - sys.exit(1) - else: - with fd: - # store file contents in memory in case something happens to the file - # while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) - - # TODO: validate file contents before download process starts - with ThreadPoolExecutor( - max_workers=args.threads, - thread_name_prefix="download", - ) as executor: - futures = [] - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - future = executor.submit(download, file, args) - futures.append(future) - # wait for downloads to finish - for future in futures: - future.result() + batch_download(args) else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3 From 226b7cb84070c6d073e153ad410fca7798c8e334 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 11:13:04 -0700 Subject: Change logfile log level to debug --- src/wikiget/wikiget.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index c470b46..5b36ce5 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -123,20 +123,18 @@ def configure_logging(args): # configure logging: # console log level is set via -v, -vv, and -q options; - # file log level is always info (TODO: add debug option) + # file log level is always debug (TODO: make this user configurable) base_format = "%(threadName)s - %(message)s" log_format = "[%(levelname)s] " + base_format if args.logfile: # log to console and file logging.basicConfig( - level=logging.INFO, + level=logging.DEBUG, format="%(asctime)s [%(levelname)-7s] " + base_format, filename=args.logfile, ) console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't - # printing to console console.setLevel(loglevel) console.setFormatter(logging.Formatter(log_format)) logging.getLogger("").addHandler(console) @@ -192,7 +190,6 @@ def main(): # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") - # logging.info(f"Log level is set to {loglevel}") if args.batch: # batch download mode -- cgit v1.2.3 From 87052196874cc1bf82f70a6f5aa8e6df59bc1537 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 11:13:31 -0700 Subject: Revise batch file parsing to ignore blank and commented lines Previously, blank lines would cause an error and lines prepended with "#" would be downloaded like any other, assuming they were valid. Now, "#" can be used to mark ignored files or comments. 
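For illustration, a batch file along these lines (the file names are only
placeholders) now downloads just the two uncommented entries; the comment
line, the commented-out entry, and the blank line are skipped, and the
original line numbers are still used in log messages:

    # images for the island article
    File:Example_map.png

    # File:Old_map.png is commented out and therefore ignored
    https://commons.wikimedia.org/wiki/File:Example_chart.svg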
--- src/wikiget/wikiget.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 5b36ce5..fba9509 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -145,7 +145,7 @@ def configure_logging(args): def batch_download(args): input_file = args.FILE - dl_list = [] + dl_list = {} logging.info(f"Using batch file '{input_file}'.") @@ -157,10 +157,12 @@ def batch_download(args): sys.exit(1) else: with fd: - # store file contents in memory in case something happens to the file - # while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) + # read the file into memory and process each line as we go + for line_num, line in enumerate(fd, start=1): + line_s = line.strip() + # ignore blank lines and lines starting with "#" (for comments) + if line_s and not line_s.startswith("#"): + dl_list[line_num] = line_s # TODO: validate file contents before download process starts with ThreadPoolExecutor( @@ -168,11 +170,10 @@ def batch_download(args): thread_name_prefix="download", ) as executor: futures = [] - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() + for line_num, line in dl_list.items(): # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) + logging.info(f"Downloading '{line}' at line {line_num}") + file = prep_download(line, args) future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish -- cgit v1.2.3 From 875748228e509e244c8f444114387f1a03cbb393 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 12:19:41 -0700 Subject: Update copyright year --- src/wikiget/validations.py | 2 +- src/wikiget/version.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index 1610417..2bce34e 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2020 Cody Logan +# Copyright (C) 2018-2023 Cody Logan # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify diff --git a/src/wikiget/version.py b/src/wikiget/version.py index dd9b22c..34dabb7 100644 --- a/src/wikiget/version.py +++ b/src/wikiget/version.py @@ -1 +1,18 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2023 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . 
+ __version__ = "0.5.1" -- cgit v1.2.3 From 630541499a58f98c55d5cc372d21e745c106d250 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 12:24:13 -0700 Subject: Refactor parsing logic and revise exception handling --- src/wikiget/dl.py | 58 +++++++++++++++-------------------------------- src/wikiget/exceptions.py | 20 ++++++++++++++++ src/wikiget/parse.py | 54 +++++++++++++++++++++++++++++++++++++++++++ src/wikiget/wikiget.py | 12 ++++++++-- 4 files changed, 102 insertions(+), 42 deletions(-) create mode 100644 src/wikiget/exceptions.py create mode 100644 src/wikiget/parse.py (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 2b2befa..50b7460 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -18,46 +18,16 @@ import logging import os import sys -from urllib.parse import unquote, urlparse from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget +from wikiget.exceptions import ParseError from wikiget.file import File -from wikiget.validations import valid_file, verify_hash - - -def get_dest(dl, args): - url = urlparse(dl) - - if url.netloc: - filename = url.path - site_name = url.netloc - if args.site is not wikiget.DEFAULT_SITE: - # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, ignoring site specified with --site") - else: - filename = dl - site_name = args.site - - file_match = valid_file(filename) - - # check if this is a valid file - if file_match and file_match.group(1): - # has File:/Image: prefix and extension - filename = file_match.group(2) - else: - # no file extension and/or prefix, probably an article - logging.error(f"Could not parse input '{filename}' as a file.") - sys.exit(1) - - filename = unquote(filename) # remove URL encoding for special characters - - dest = args.output or filename - - return filename, dest, site_name +from wikiget.parse import get_dest +from wikiget.validations import verify_hash def query_api(filename, site_name, args): @@ -98,8 +68,7 @@ def query_api(filename, site_name, args): # an API error at this point likely means access is denied, which could happen # with a private wiki logging.error( - "Access denied. Try providing credentials with " - "--username and --password." + "Access denied. Try providing credentials with --username and --password." 
) logging.debug("Full error message:") for i in e.args: @@ -110,7 +79,10 @@ def query_api(filename, site_name, args): def prep_download(dl, args): - filename, dest, site_name = get_dest(dl, args) + try: + filename, dest, site_name = get_dest(dl, args) + except ParseError: + raise file = File(filename, dest) file.object, file.site = query_api(file.name, site_name, args) return file @@ -136,7 +108,7 @@ def download(f, args): if os.path.isfile(dest) and not args.force: logging.warning( - f"File '{dest}' already exists, skipping download (use -f to ignore)" + f"File '{dest}' already exists, skipping download (use -f to force)" ) else: try: @@ -167,19 +139,25 @@ def download(f, args): fd.write(chunk) progress_bar.update(len(chunk)) - # verify file integrity and optionally print details + # verify file integrity and log details dl_sha1 = verify_hash(dest) - logging.info(f"Downloaded file SHA1 is {dl_sha1}") - logging.info(f"Server file SHA1 is {file_sha1}") + logging.info(f"Remote file SHA1 is {file_sha1}") + logging.info(f"Local file SHA1 is {dl_sha1}") if dl_sha1 == file_sha1: logging.info("Hashes match!") # at this point, we've successfully downloaded the file + success_log = f"'{filename}' downloaded" + if args.output: + success_log += f" to '{dest}'" + logging.info(success_log) else: logging.error("Hash mismatch! Downloaded file may be corrupt.") + # TODO: log but don't quit while in batch mode sys.exit(1) else: # no file information returned logging.error(f"Target '{filename}' does not appear to be a valid file.") + # TODO: log but don't quit while in batch mode sys.exit(1) diff --git a/src/wikiget/exceptions.py b/src/wikiget/exceptions.py new file mode 100644 index 0000000..94ed6b2 --- /dev/null +++ b/src/wikiget/exceptions.py @@ -0,0 +1,20 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + + +class ParseError(Exception): + pass diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py new file mode 100644 index 0000000..09c0767 --- /dev/null +++ b/src/wikiget/parse.py @@ -0,0 +1,54 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . 
+ +import logging +from urllib.parse import unquote, urlparse + +import wikiget +from wikiget.exceptions import ParseError +from wikiget.validations import valid_file + + +def get_dest(dl, args): + url = urlparse(dl) + + if url.netloc: + filename = url.path + site_name = url.netloc + if args.site is not wikiget.DEFAULT_SITE: + # this will work even if the user specifies 'commons.wikimedia.org' + logging.warning("target is a URL, ignoring site specified with --site") + else: + filename = dl + site_name = args.site + + file_match = valid_file(filename) + + # check if this is a valid file + if file_match and file_match.group(1): + # has File:/Image: prefix and extension + filename = file_match.group(2) + else: + # no file extension and/or prefix, probably an article + msg = f"Could not parse input '{filename}' as a file" + raise ParseError(msg) + + filename = unquote(filename) # remove URL encoding for special characters + + dest = args.output or filename + + return filename, dest, site_name diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index fba9509..68e0233 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download +from wikiget.exceptions import ParseError def construct_parser(): @@ -173,7 +174,10 @@ def batch_download(args): for line_num, line in dl_list.items(): # keep track of batch file line numbers for debugging/logging purposes logging.info(f"Downloading '{line}' at line {line_num}") - file = prep_download(line, args) + try: + file = prep_download(line, args) + except ParseError as e: + logging.warning(f"{e} (line {line_num})") future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish @@ -197,5 +201,9 @@ def main(): batch_download(args) else: # single download mode - file = prep_download(args.FILE, args) + try: + file = prep_download(args.FILE, args) + except ParseError as e: + logging.error(e) + sys.exit(1) download(file, args) -- cgit v1.2.3 From 06335ba0176cabd84f5b548995f465ac1c09bc8e Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 17 Oct 2023 14:00:14 -0700 Subject: Clean up exception handling and error messages --- src/wikiget/dl.py | 23 ++++++++--------------- src/wikiget/parse.py | 5 +++-- src/wikiget/wikiget.py | 11 +++++++++++ 3 files changed, 22 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 50b7460..4521b72 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -24,7 +24,6 @@ from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget -from wikiget.exceptions import ParseError from wikiget.file import File from wikiget.parse import get_dest from wikiget.validations import verify_hash @@ -42,24 +41,22 @@ def query_api(filename, site_name, args): except ConnectionError as e: # usually this means there is no such site, or there's no network connection, # though it could be a certificate problem - logging.error("Couldn't connect to specified site.") - logging.debug("Full error message:") + logging.error("Could not connect to specified site") logging.debug(e) - sys.exit(1) + raise except HTTPError as e: # most likely a 403 forbidden or 404 not found error for api.php logging.error( - "Couldn't find the specified wiki's api.php. Check the value of --path." + "Could not find the specified wiki's api.php. Check the value of --path." 
) - logging.debug("Full error message:") logging.debug(e) - sys.exit(1) + raise except (InvalidResponse, LoginError) as e: # InvalidResponse: site exists, but we couldn't communicate with the API # endpoint for some reason other than an HTTP error. # LoginError: missing or invalid credentials logging.error(e) - sys.exit(1) + raise # get info about the target file try: @@ -70,19 +67,15 @@ def query_api(filename, site_name, args): logging.error( "Access denied. Try providing credentials with --username and --password." ) - logging.debug("Full error message:") for i in e.args: logging.debug(i) - sys.exit(1) + raise return file, site def prep_download(dl, args): - try: - filename, dest, site_name = get_dest(dl, args) - except ParseError: - raise + filename, dest, site_name = get_dest(dl, args) file = File(filename, dest) file.object, file.site = query_api(file.name, site_name, args) return file @@ -158,6 +151,6 @@ def download(f, args): else: # no file information returned - logging.error(f"Target '{filename}' does not appear to be a valid file.") + logging.error(f"Target '{filename}' does not appear to be a valid file") # TODO: log but don't quit while in batch mode sys.exit(1) diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py index 09c0767..f5c221d 100644 --- a/src/wikiget/parse.py +++ b/src/wikiget/parse.py @@ -30,8 +30,9 @@ def get_dest(dl, args): filename = url.path site_name = url.netloc if args.site is not wikiget.DEFAULT_SITE: - # this will work even if the user specifies 'commons.wikimedia.org' - logging.warning("target is a URL, ignoring site specified with --site") + # this will work even if the user specifies 'commons.wikimedia.org' since + # we're comparing objects instead of values (is not vs. !=) + logging.warning("Target is a URL, ignoring site specified with --site") else: filename = dl site_name = args.site diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 68e0233..4446f96 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -20,6 +20,9 @@ import logging import sys from concurrent.futures import ThreadPoolExecutor +from mwclient import APIError, InvalidResponse, LoginError +from requests import ConnectionError, HTTPError + import wikiget from wikiget.dl import download, prep_download from wikiget.exceptions import ParseError @@ -178,6 +181,10 @@ def batch_download(args): file = prep_download(line, args) except ParseError as e: logging.warning(f"{e} (line {line_num})") + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + logging.error( + f"Unable to download '{line}' (line {line_num}) due to an error" + ) future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish @@ -198,6 +205,8 @@ def main(): if args.batch: # batch download mode + # TODO: return non-zero exit code if any errors were encountered, even if some + # downloads completed successfully batch_download(args) else: # single download mode @@ -206,4 +215,6 @@ def main(): except ParseError as e: logging.error(e) sys.exit(1) + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + sys.exit(1) download(file, args) -- cgit v1.2.3 From ba1f10666554316c262efd2ee6950560560317c7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 12:59:08 -0700 Subject: Fix bug in batch downloading An invalid line in the batch file would cause the last valid file to be downloaded twice. 
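
A standalone sketch of the failure mode, using illustrative names rather than wikiget's own code: inside the batch loop, `file` keeps whatever the prep step assigned on the last successful iteration, so when a parse or connection error is caught and execution falls through, the stale object gets submitted to the executor again. The hunk below fixes this by adding `continue` to each handler.

    import logging
    from concurrent.futures import ThreadPoolExecutor

    def prep(line):
        # stand-in for prep_download(): reject anything without a File: prefix
        if not line.startswith("File:"):
            raise ValueError(f"could not parse {line!r} as a file")
        return line

    def run_batch(lines):
        with ThreadPoolExecutor(max_workers=2) as executor:
            for line_num, line in enumerate(lines, start=1):
                try:
                    file = prep(line)
                except ValueError as e:
                    logging.warning(f"{e} (line {line_num})")
                    continue  # without this, the previous `file` is submitted again
                executor.submit(print, f"downloading {file}")

    run_batch(["File:A.jpg", "not a file", "File:B.png"])
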
--- src/wikiget/wikiget.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 4446f96..af13bc8 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -181,10 +181,12 @@ def batch_download(args): file = prep_download(line, args) except ParseError as e: logging.warning(f"{e} (line {line_num})") + continue except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): logging.error( f"Unable to download '{line}' (line {line_num}) due to an error" ) + continue future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish -- cgit v1.2.3 From 05457af0d73ff3a820c0b465e6607fc5832a6e74 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:23:28 -0700 Subject: Reorganize File class --- src/wikiget/dl.py | 19 ++++++++----------- src/wikiget/file.py | 12 +++++++++--- src/wikiget/parse.py | 4 +++- src/wikiget/wikiget.py | 4 ++-- 4 files changed, 22 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 4521b72..171b017 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -30,10 +30,8 @@ from wikiget.validations import verify_hash def query_api(filename, site_name, args): - logging.debug(f"User agent: {wikiget.USER_AGENT}") - # connect to site and identify ourselves - logging.info(f"Site name: {site_name}") + logging.info(f"Connecting to {site_name}") try: site = Site(site_name, path=args.path, clients_useragent=wikiget.USER_AGENT) if args.username and args.password: @@ -60,7 +58,7 @@ def query_api(filename, site_name, args): # get info about the target file try: - file = site.images[filename] + image = site.images[filename] except APIError as e: # an API error at this point likely means access is denied, which could happen # with a private wiki @@ -71,23 +69,22 @@ def query_api(filename, site_name, args): logging.debug(i) raise - return file, site + return image def prep_download(dl, args): - filename, dest, site_name = get_dest(dl, args) - file = File(filename, dest) - file.object, file.site = query_api(file.name, site_name, args) + file = get_dest(dl, args) + file.image = query_api(file.name, file.site, args) return file def download(f, args): - file = f.object + file = f.image filename = f.name - site = f.site dest = f.dest + site = file.site - if file.imageinfo != {}: + if file.exists: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] file_size = file.imageinfo["size"] diff --git a/src/wikiget/file.py b/src/wikiget/file.py index c1b9ae6..b890e63 100644 --- a/src/wikiget/file.py +++ b/src/wikiget/file.py @@ -15,9 +15,13 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . +from mwclient.image import Image + +from wikiget import DEFAULT_SITE + class File: - def __init__(self, name, dest=""): + def __init__(self, name: str, dest: str = "", site: str = "") -> None: """ Initializes a new file with the specified name and an optional destination name. 
@@ -26,8 +30,10 @@ class File: :param dest: destination of the file, if different from the name; if not specified, defaults to the name :type dest: str, optional + :param site: name of the site hosting the file; if not specified, defaults to + the global default site """ - self.object = None - self.site = None + self.image: Image = None self.name = name self.dest = dest if dest else name + self.site = site if site else DEFAULT_SITE diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py index f5c221d..4e9b195 100644 --- a/src/wikiget/parse.py +++ b/src/wikiget/parse.py @@ -52,4 +52,6 @@ def get_dest(dl, args): dest = args.output or filename - return filename, dest, site_name + file = File(filename, dest, site_name) + + return file diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index af13bc8..90078e1 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -195,15 +195,15 @@ def batch_download(args): def main(): - # setup + # setup our environment parser = construct_parser() args = parser.parse_args() - configure_logging(args) # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") + logging.debug(f"User agent: {wikiget.USER_AGENT}") if args.batch: # batch download mode -- cgit v1.2.3 From b136af078208882ae696b21c0d8aac009e7468d4 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:28:23 -0700 Subject: Move batch_download function to proper file --- src/wikiget/dl.py | 63 +++++++++++++++++++++++++++++++++++++++++++---- src/wikiget/wikiget.py | 67 +++++++++----------------------------------------- 2 files changed, 70 insertions(+), 60 deletions(-) (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 171b017..83aef9f 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -18,12 +18,14 @@ import logging import os import sys +from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError from tqdm import tqdm import wikiget +from wikiget.exceptions import ParseError from wikiget.file import File from wikiget.parse import get_dest from wikiget.validations import verify_hash @@ -78,12 +80,62 @@ def prep_download(dl, args): return file +def batch_download(args): + input_file = args.FILE + dl_list = {} + errors = 0 + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file) + except OSError as e: + logging.error("File could not be read. 
The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # read the file into memory and process each line as we go + for line_num, line in enumerate(fd, start=1): + line_s = line.strip() + # ignore blank lines and lines starting with "#" (for comments) + if line_s and not line_s.startswith("#"): + dl_list[line_num] = line_s + + # TODO: validate file contents before download process starts + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for line_num, line in dl_list.items(): + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Processing '{line}' at line {line_num}") + try: + file = prep_download(line, args) + except ParseError as e: + logging.warning(f"{e} (line {line_num})") + errors += 1 + continue + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + logging.warning( + f"Unable to download '{line}' (line {line_num}) due to an error" + ) + errors += 1 + continue + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + errors += future.result() + return errors + + def download(f, args): file = f.image filename = f.name dest = f.dest site = file.site + errors = 0 + if file.exists: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] @@ -100,6 +152,7 @@ def download(f, args): logging.warning( f"File '{dest}' already exists, skipping download (use -f to force)" ) + errors += 1 else: try: fd = open(dest, "wb") @@ -108,7 +161,7 @@ def download(f, args): "File could not be written. The following error was encountered:" ) logging.error(e) - sys.exit(1) + errors += 1 else: # download the file(s) if args.verbose >= wikiget.STD_VERBOSE: @@ -143,11 +196,11 @@ def download(f, args): logging.info(success_log) else: logging.error("Hash mismatch! Downloaded file may be corrupt.") - # TODO: log but don't quit while in batch mode - sys.exit(1) + errors += 1 else: # no file information returned logging.error(f"Target '{filename}' does not appear to be a valid file") - # TODO: log but don't quit while in batch mode - sys.exit(1) + errors += 1 + + return errors diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 90078e1..e9a1147 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,13 +18,12 @@ import argparse import logging import sys -from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError from requests import ConnectionError, HTTPError import wikiget -from wikiget.dl import download, prep_download +from wikiget.dl import batch_download, download, prep_download from wikiget.exceptions import ParseError @@ -145,55 +144,6 @@ def configure_logging(args): else: # log only to console logging.basicConfig(level=loglevel, format=log_format) - - -def batch_download(args): - input_file = args.FILE - dl_list = {} - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file) - except OSError as e: - logging.error("File could not be read. 
The following error was encountered:") - logging.error(e) - sys.exit(1) - else: - with fd: - # read the file into memory and process each line as we go - for line_num, line in enumerate(fd, start=1): - line_s = line.strip() - # ignore blank lines and lines starting with "#" (for comments) - if line_s and not line_s.startswith("#"): - dl_list[line_num] = line_s - - # TODO: validate file contents before download process starts - with ThreadPoolExecutor( - max_workers=args.threads, - thread_name_prefix="download", - ) as executor: - futures = [] - for line_num, line in dl_list.items(): - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{line}' at line {line_num}") - try: - file = prep_download(line, args) - except ParseError as e: - logging.warning(f"{e} (line {line_num})") - continue - except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): - logging.error( - f"Unable to download '{line}' (line {line_num}) due to an error" - ) - continue - future = executor.submit(download, file, args) - futures.append(future) - # wait for downloads to finish - for future in futures: - future.result() - - def main(): # setup our environment parser = construct_parser() @@ -207,9 +157,14 @@ def main(): if args.batch: # batch download mode - # TODO: return non-zero exit code if any errors were encountered, even if some - # downloads completed successfully - batch_download(args) + errors = batch_download(args) + if errors: + # return non-zero exit code if any problems were encountered, even if some + # downloads completed successfully + logging.warning( + f"{errors} problem{'s'[:errors^1]} encountered during batch processing" + ) + sys.exit(1) else: # single download mode try: @@ -219,4 +174,6 @@ def main(): sys.exit(1) except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): sys.exit(1) - download(file, args) + errors = download(file, args) + if errors: + sys.exit(1) -- cgit v1.2.3 From 3d37cf6f86eb6c48a3a0a094c42ade6d7aed1daf Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:31:56 -0700 Subject: Move logging configuration to new file Also, use a LoggerAdapter to add contextual info (such as filenames) to log messages when downloading, especially useful with threaded batch processing. 
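
The adapter in question overrides LoggerAdapter.process() to prepend the current filename to every message, so threaded batch downloads stay readable even when their log lines interleave. A minimal self-contained sketch of the same pattern as the FileLogAdapter added below (the basicConfig call and 'Example.jpg' are only demo scaffolding):

    import logging

    class FileLogAdapter(logging.LoggerAdapter):
        # prefix each message with the filename supplied via `extra`
        def process(self, msg, kwargs):
            return f"[{self.extra['filename']}] {msg}", kwargs

    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
    adapter = FileLogAdapter(logging.getLogger(""), {"filename": "Example.jpg"})
    adapter.info("Hashes match!")   # prints: [INFO] [Example.jpg] Hashes match!
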
--- src/wikiget/dl.py | 29 +++++++++++++------------ src/wikiget/logging.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/wikiget/wikiget.py | 31 +-------------------------- 3 files changed, 72 insertions(+), 45 deletions(-) create mode 100644 src/wikiget/logging.py (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 83aef9f..5491378 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -27,6 +27,7 @@ from tqdm import tqdm import wikiget from wikiget.exceptions import ParseError from wikiget.file import File +from wikiget.logging import FileLogAdapter from wikiget.parse import get_dest from wikiget.validations import verify_hash @@ -136,6 +137,9 @@ def download(f, args): errors = 0 + logger = logging.getLogger("") + adapter = FileLogAdapter(logger, {"filename": filename}) + if file.exists: # file exists either locally or at a common repository, like Wikimedia Commons file_url = file.imageinfo["url"] @@ -145,22 +149,17 @@ def download(f, args): filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}" if args.output: filename_log += f" to '{dest}'" - logging.info(filename_log) - logging.info(f"{file_url}") + adapter.info(filename_log) + adapter.info(f"{file_url}") if os.path.isfile(dest) and not args.force: - logging.warning( - f"File '{dest}' already exists, skipping download (use -f to force)" - ) + adapter.warning("File already exists, skipping download (use -f to force)") errors += 1 else: try: fd = open(dest, "wb") except OSError as e: - logging.error( - "File could not be written. The following error was encountered:" - ) - logging.error(e) + adapter.error(f"File could not be written. {e}") errors += 1 else: # download the file(s) @@ -185,22 +184,22 @@ def download(f, args): # verify file integrity and log details dl_sha1 = verify_hash(dest) - logging.info(f"Remote file SHA1 is {file_sha1}") - logging.info(f"Local file SHA1 is {dl_sha1}") + adapter.info(f"Remote file SHA1 is {file_sha1}") + adapter.info(f"Local file SHA1 is {dl_sha1}") if dl_sha1 == file_sha1: - logging.info("Hashes match!") + adapter.info("Hashes match!") # at this point, we've successfully downloaded the file success_log = f"'{filename}' downloaded" if args.output: success_log += f" to '{dest}'" - logging.info(success_log) + adapter.info(success_log) else: - logging.error("Hash mismatch! Downloaded file may be corrupt.") + adapter.error("Hash mismatch! Downloaded file may be corrupt.") errors += 1 else: # no file information returned - logging.error(f"Target '{filename}' does not appear to be a valid file") + adapter.warning("Target does not appear to be a valid file") errors += 1 return errors diff --git a/src/wikiget/logging.py b/src/wikiget/logging.py new file mode 100644 index 0000000..1536156 --- /dev/null +++ b/src/wikiget/logging.py @@ -0,0 +1,57 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import logging + +import wikiget + + +class FileLogAdapter(logging.LoggerAdapter): + def process(self, msg, kwargs): + return f"[{self.extra['filename']}] {msg}", kwargs + + +def configure_logging(args): + loglevel = logging.WARNING + if args.verbose >= wikiget.VERY_VERBOSE: + # this includes API and library messages + loglevel = logging.DEBUG + elif args.verbose >= wikiget.STD_VERBOSE: + loglevel = logging.INFO + elif args.quiet: + loglevel = logging.ERROR + + # configure logging: + # console log level is set via -v, -vv, and -q options; + # file log level is always debug (TODO: make this user configurable) + base_format = "%(message)s" + log_format = "[%(levelname)s] " + base_format + if args.logfile: + # log to console and file + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s [%(levelname)-7s] " + base_format, + filename=args.logfile, + ) + + console = logging.StreamHandler() + console.setLevel(loglevel) + console.setFormatter(logging.Formatter(log_format)) + logging.getLogger("").addHandler(console) + else: + # log only to console + logging.basicConfig(level=loglevel, format=log_format) diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index e9a1147..5b84dac 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -25,6 +25,7 @@ from requests import ConnectionError, HTTPError import wikiget from wikiget.dl import batch_download, download, prep_download from wikiget.exceptions import ParseError +from wikiget.logging import configure_logging def construct_parser(): @@ -114,36 +115,6 @@ def construct_parser(): return parser -def configure_logging(args): - loglevel = logging.WARNING - if args.verbose >= wikiget.VERY_VERBOSE: - # this includes API and library messages - loglevel = logging.DEBUG - elif args.verbose >= wikiget.STD_VERBOSE: - loglevel = logging.INFO - elif args.quiet: - loglevel = logging.ERROR - - # configure logging: - # console log level is set via -v, -vv, and -q options; - # file log level is always debug (TODO: make this user configurable) - base_format = "%(threadName)s - %(message)s" - log_format = "[%(levelname)s] " + base_format - if args.logfile: - # log to console and file - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s [%(levelname)-7s] " + base_format, - filename=args.logfile, - ) - - console = logging.StreamHandler() - console.setLevel(loglevel) - console.setFormatter(logging.Formatter(log_format)) - logging.getLogger("").addHandler(console) - else: - # log only to console - logging.basicConfig(level=loglevel, format=log_format) def main(): # setup our environment parser = construct_parser() -- cgit v1.2.3 From c1820026f97eaf671c29ab30f02879de0ac4df89 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:36:14 -0700 Subject: Add type annotations to source files --- src/wikiget/dl.py | 10 ++++++---- src/wikiget/logging.py | 3 ++- src/wikiget/parse.py | 4 +++- src/wikiget/validations.py | 7 ++++--- src/wikiget/wikiget.py | 4 ++-- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 5491378..5b5b43b 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -18,9 +18,11 @@ import logging import os import sys +from argparse import Namespace from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError, Site +from mwclient.image import Image from requests import 
ConnectionError, HTTPError from tqdm import tqdm @@ -32,7 +34,7 @@ from wikiget.parse import get_dest from wikiget.validations import verify_hash -def query_api(filename, site_name, args): +def query_api(filename: str, site_name: str, args: Namespace) -> Image: # connect to site and identify ourselves logging.info(f"Connecting to {site_name}") try: @@ -75,13 +77,13 @@ def query_api(filename, site_name, args): return image -def prep_download(dl, args): +def prep_download(dl: str, args: Namespace) -> File: file = get_dest(dl, args) file.image = query_api(file.name, file.site, args) return file -def batch_download(args): +def batch_download(args: Namespace) -> int: input_file = args.FILE dl_list = {} errors = 0 @@ -129,7 +131,7 @@ def batch_download(args): return errors -def download(f, args): +def download(f: File, args: Namespace) -> int: file = f.image filename = f.name dest = f.dest diff --git a/src/wikiget/logging.py b/src/wikiget/logging.py index 1536156..87b917c 100644 --- a/src/wikiget/logging.py +++ b/src/wikiget/logging.py @@ -16,6 +16,7 @@ # along with Wikiget. If not, see . import logging +from argparse import Namespace import wikiget @@ -25,7 +26,7 @@ class FileLogAdapter(logging.LoggerAdapter): return f"[{self.extra['filename']}] {msg}", kwargs -def configure_logging(args): +def configure_logging(args: Namespace) -> None: loglevel = logging.WARNING if args.verbose >= wikiget.VERY_VERBOSE: # this includes API and library messages diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py index 4e9b195..fe3fe43 100644 --- a/src/wikiget/parse.py +++ b/src/wikiget/parse.py @@ -16,14 +16,16 @@ # along with Wikiget. If not, see . import logging +from argparse import Namespace from urllib.parse import unquote, urlparse import wikiget from wikiget.exceptions import ParseError +from wikiget.file import File from wikiget.validations import valid_file -def get_dest(dl, args): +def get_dest(dl: str, args: Namespace) -> File: url = urlparse(dl) if url.netloc: diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py index 2bce34e..c9e7bcf 100644 --- a/src/wikiget/validations.py +++ b/src/wikiget/validations.py @@ -17,11 +17,12 @@ import hashlib import re +from typing import Optional from wikiget import BLOCKSIZE -def valid_file(search_string): +def valid_file(search_string: str) -> Optional[re.Match]: """ Determines if the given string contains a valid file name, defined as a string ending with a '.' and at least one character, beginning with 'File:' or 'Image:', @@ -38,7 +39,7 @@ def valid_file(search_string): return file_regex.search(search_string) -def valid_site(search_string): +def valid_site(search_string: str) -> Optional[re.Match]: """ Determines if the given string contains a valid site name, defined as a string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all subdomains of those @@ -54,7 +55,7 @@ def valid_site(search_string): return site_regex.search(search_string) -def verify_hash(filename): +def verify_hash(filename: str) -> str: """ Calculates the SHA1 hash of the given file for comparison with a known value. 
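
The body of verify_hash() is not part of these hunks, but the new str return annotation, the docstring, and the existing `from wikiget import BLOCKSIZE` import point to chunked hashing. A plausible sketch under those assumptions (sha1_of_file is a hypothetical name, not wikiget's function):

    import hashlib

    BLOCKSIZE = 65536  # same value wikiget defines in its package __init__

    def sha1_of_file(filename: str) -> str:
        # hash the file in BLOCKSIZE pieces so large downloads are never
        # read into memory all at once
        hasher = hashlib.sha1()
        with open(filename, "rb") as f:
            block = f.read(BLOCKSIZE)
            while block:
                hasher.update(block)
                block = f.read(BLOCKSIZE)
        return hasher.hexdigest()
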
diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 5b84dac..e64d00e 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -28,7 +28,7 @@ from wikiget.exceptions import ParseError from wikiget.logging import configure_logging -def construct_parser(): +def construct_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description=""" A tool for downloading files from MediaWiki sites using the file name or @@ -115,7 +115,7 @@ def construct_parser(): return parser -def main(): +def main() -> None: # setup our environment parser = construct_parser() args = parser.parse_args() -- cgit v1.2.3
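
One practical effect of the Optional[re.Match] annotations on valid_file() and valid_site() is that callers must check for None before touching the match groups, which is exactly what get_dest() does with its `if file_match and file_match.group(1)` test. A small illustration of that guard, using a generic prefix regex rather than wikiget's actual pattern:

    import re
    from typing import Optional

    def find_prefix(text: str) -> Optional[re.Match]:
        # any regex will do; the point is that search() may return None
        return re.search(r"^(File:|Image:)", text)

    for candidate in ["File:Example.jpg", "Example article"]:
        match = find_prefix(candidate)
        if match:  # guard required because the return type is Optional
            print(f"{candidate} -> prefix {match.group(1)}")
        else:
            print(f"{candidate} -> no prefix, probably an article")
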