From a1995912ed24b37a990f3fcd5e91dbf7b46669fb Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 15:17:04 -0700 Subject: Reorganize file tree --- src/wikiget/wikiget.py | 157 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 src/wikiget/wikiget.py (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py new file mode 100644 index 0000000..a8679c9 --- /dev/null +++ b/src/wikiget/wikiget.py @@ -0,0 +1,157 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2018-2021 Cody Logan and contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + +import argparse +import logging +import sys + +from . import DEFAULT_SITE, DEFAULT_PATH, wikiget_version +from .dl import download + + +def main(): + """ + Main entry point for console script. Automatically compiled by setuptools + when installed with `pip install` or `python setup.py install`. + """ + + parser = argparse.ArgumentParser(description=""" + A tool for downloading files from + MediaWiki sites using the file name or + description page URL + """, + epilog=""" + Copyright (C) 2018-2021 Cody Logan + and contributors. + License GPLv3+: GNU GPL version 3 or later + . + This is free software; you are free to + change and redistribute it under certain + conditions. There is NO WARRANTY, to the + extent permitted by law. + """) + parser.add_argument("FILE", help=""" + name of the file to download with the File: + prefix, or the URL of its file description page + """) + parser.add_argument("-V", "--version", action="version", + version=f"%(prog)s {wikiget_version}") + message_options = parser.add_mutually_exclusive_group() + message_options.add_argument("-q", "--quiet", + help="suppress warning messages", + action="store_true") + message_options.add_argument("-v", "--verbose", + help="print detailed information; " + "use -vv for even more detail", + action="count", default=0) + parser.add_argument("-f", "--force", + help="force overwriting existing files", + action="store_true") + parser.add_argument("-s", "--site", default=DEFAULT_SITE, + help="MediaWiki site to download from " + "(default: %(default)s)") + parser.add_argument("-p", "--path", default=DEFAULT_PATH, + help="MediaWiki site path, where api.php is located " + "(default: %(default)s)") + parser.add_argument("--username", default="", + help="MediaWiki site username, for private wikis") + parser.add_argument("--password", default="", + help="MediaWiki site password, for private wikis") + output_options = parser.add_mutually_exclusive_group() + output_options.add_argument("-o", "--output", + help="write download to OUTPUT") + output_options.add_argument("-a", "--batch", + help="treat FILE as a textfile containing " + "multiple files to download, one URL or " + "filename per line", action="store_true") + parser.add_argument("-l", "--logfile", default="", + help="save log output to LOGFILE") + + args = parser.parse_args() + + loglevel = logging.WARNING + if args.verbose >= 2: + # this includes API and library messages + loglevel = logging.DEBUG + elif args.verbose >= 1: + loglevel = logging.INFO + elif args.quiet: + loglevel = logging.ERROR + + # configure logging: + # console log level is set via -v, -vv, and -q options + # file log level is always info (TODO: add debug option) + if args.logfile: + # log to console and file + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)-7s] %(message)s", + filename=args.logfile + ) + + console = logging.StreamHandler() + # TODO: even when loglevel is set to logging.DEBUG, + # debug messages aren't printing to console + console.setLevel(loglevel) + console.setFormatter( + logging.Formatter("[%(levelname)s] %(message)s") + ) + logging.getLogger("").addHandler(console) + else: + # log only to console + logging.basicConfig( + level=loglevel, + format="[%(levelname)s] %(message)s" + ) + + # log events are appended to the file if it already exists, + # so note the start of a new download session + logging.info(f"Starting download session using wikiget {wikiget_version}") + # logging.info(f"Log level is set to {loglevel}") + + if args.batch: + # batch download mode + input_file = args.FILE + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file, "r") + except IOError as e: + logging.error("File could not be read. " + "The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # store file contents in memory in case something + # happens to the file while we're downloading + for _, line in enumerate(fd): + dl_list.append(line) + + # TODO: validate file contents before download process starts + for line_num, url in enumerate(dl_list, start=1): + url = url.strip() + # keep track of batch file line numbers for + # debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + download(url, args) + else: + # single download mode + dl = args.FILE + download(dl, args) -- cgit v1.2.3 From 75a79785d851efa319f4216e0d3471d30a02154a Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 26 Sep 2023 15:45:43 -0700 Subject: Style and format fixes --- src/wikiget/wikiget.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index b9a227f..bc6de38 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -102,10 +102,7 @@ def main(): action="store_true", ) parser.add_argument( - "-l", - "--logfile", - default="", - help="save log output to LOGFILE" + "-l", "--logfile", default="", help="save log output to LOGFILE" ) args = parser.parse_args() @@ -127,23 +124,18 @@ def main(): logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)-7s] %(message)s", - filename=args.logfile + filename=args.logfile, ) console = logging.StreamHandler() # TODO: even when loglevel is set to logging.DEBUG, # debug messages aren't printing to console console.setLevel(loglevel) - console.setFormatter( - logging.Formatter("[%(levelname)s] %(message)s") - ) + console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) logging.getLogger("").addHandler(console) else: # log only to console - logging.basicConfig( - level=loglevel, - format="[%(levelname)s] %(message)s" - ) + logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") # log events are appended to the file if it already exists, # so note the start of a new download session @@ -158,10 +150,11 @@ def main(): logging.info(f"Using batch file '{input_file}'.") try: - fd = open(input_file, "r") + fd = open(input_file) except OSError as e: - logging.error("File could not be read. " - "The following error was encountered:") + logging.error( + "File could not be read. The following error was encountered:" + ) logging.error(e) sys.exit(1) else: @@ -173,11 +166,11 @@ def main(): # TODO: validate file contents before download process starts for line_num, url in enumerate(dl_list, start=1): - url = url.strip() + s_url = url.strip() # keep track of batch file line numbers for # debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - download(url, args) + logging.info(f"Downloading '{s_url}' at line {line_num}:") + download(s_url, args) else: # single download mode dl = args.FILE -- cgit v1.2.3 From 485df31f095a9b629a1dcc04af13956325856d8c Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:51:58 -0700 Subject: Update README and do some code cleanup --- src/wikiget/wikiget.py | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index bc6de38..934107e 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -1,5 +1,5 @@ # wikiget - CLI tool for downloading files from Wikimedia sites -# Copyright (C) 2018-2021 Cody Logan and contributors +# Copyright (C) 2018-2023 Cody Logan and contributors # SPDX-License-Identifier: GPL-3.0-or-later # # Wikiget is free software: you can redistribute it and/or modify @@ -25,32 +25,27 @@ from wikiget.dl import download def main(): """ - Main entry point for console script. Automatically compiled by setuptools - when installed with `pip install` or `python setup.py install`. + Main entry point for console script. Automatically compiled by setuptools when + installed with `pip install` or `python setup.py install`. """ parser = argparse.ArgumentParser( description=""" - A tool for downloading files from - MediaWiki sites using the file name or + A tool for downloading files from MediaWiki sites using the file name or description page URL """, epilog=""" - Copyright (C) 2018-2023 Cody Logan - and contributors. - License GPLv3+: GNU GPL version 3 or later - . - This is free software; you are free to - change and redistribute it under certain - conditions. There is NO WARRANTY, to the - extent permitted by law. + Copyright (C) 2018-2023 Cody Logan and contributors. License GPLv3+: GNU GPL + version 3 or later . This is free + software; you are free to change and redistribute it under certain conditions. + There is NO WARRANTY, to the extent permitted by law. """, ) parser.add_argument( "FILE", help=""" - name of the file to download with the File: - prefix, or the URL of its file description page + name of the file to download with the File: prefix, or the URL of its file + description page """, ) parser.add_argument( @@ -96,9 +91,8 @@ def main(): output_options.add_argument( "-a", "--batch", - help="treat FILE as a textfile containing " - "multiple files to download, one URL or " - "filename per line", + help="treat FILE as a textfile containing multiple files to download, one URL " + "or filename per line", action="store_true", ) parser.add_argument( @@ -117,7 +111,7 @@ def main(): loglevel = logging.ERROR # configure logging: - # console log level is set via -v, -vv, and -q options + # console log level is set via -v, -vv, and -q options; # file log level is always info (TODO: add debug option) if args.logfile: # log to console and file @@ -128,8 +122,8 @@ def main(): ) console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, - # debug messages aren't printing to console + # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't + # printing to console console.setLevel(loglevel) console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) logging.getLogger("").addHandler(console) @@ -137,8 +131,8 @@ def main(): # log only to console logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") - # log events are appended to the file if it already exists, - # so note the start of a new download session + # log events are appended to the file if it already exists, so note the start of a + # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") # logging.info(f"Log level is set to {loglevel}") @@ -159,16 +153,15 @@ def main(): sys.exit(1) else: with fd: - # store file contents in memory in case something - # happens to the file while we're downloading + # store file contents in memory in case something happens to the file + # while we're downloading for _, line in enumerate(fd): dl_list.append(line) # TODO: validate file contents before download process starts for line_num, url in enumerate(dl_list, start=1): s_url = url.strip() - # keep track of batch file line numbers for - # debugging/logging purposes + # keep track of batch file line numbers for debugging/logging purposes logging.info(f"Downloading '{s_url}' at line {line_num}:") download(s_url, args) else: -- cgit v1.2.3 From e18222daecca1656390652cbd1c7f6985080241a Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 09:58:37 -0700 Subject: Add short user and pass options Swapped path short option from -p to -P and added -u for username and -p for password --- src/wikiget/wikiget.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 934107e..f482280 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -75,16 +75,22 @@ def main(): help="MediaWiki site to download from (default: %(default)s)", ) parser.add_argument( - "-p", + "-P", "--path", default=wikiget.DEFAULT_PATH, help="MediaWiki site path, where api.php is located (default: %(default)s)", ) parser.add_argument( - "--username", default="", help="MediaWiki site username, for private wikis" + "-u", + "--username", + default="", + help="MediaWiki site username, for private wikis" ) parser.add_argument( - "--password", default="", help="MediaWiki site password, for private wikis" + "-p", + "--password", + default="", + help="MediaWiki site password, for private wikis" ) output_options = parser.add_mutually_exclusive_group() output_options.add_argument("-o", "--output", help="write download to OUTPUT") -- cgit v1.2.3 From 865088207b39427b6b932de4f312d82bd5e05a53 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:26:09 -0700 Subject: Refactor for better code organization --- src/wikiget/wikiget.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index f482280..80d5057 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -20,15 +20,10 @@ import logging import sys import wikiget -from wikiget.dl import download +from wikiget.dl import download, prep_download -def main(): - """ - Main entry point for console script. Automatically compiled by setuptools when - installed with `pip install` or `python setup.py install`. - """ - +def construct_parser(): parser = argparse.ArgumentParser( description=""" A tool for downloading files from MediaWiki sites using the file name or @@ -84,13 +79,13 @@ def main(): "-u", "--username", default="", - help="MediaWiki site username, for private wikis" + help="MediaWiki site username, for private wikis", ) parser.add_argument( "-p", "--password", default="", - help="MediaWiki site password, for private wikis" + help="MediaWiki site password, for private wikis", ) output_options = parser.add_mutually_exclusive_group() output_options.add_argument("-o", "--output", help="write download to OUTPUT") @@ -104,7 +99,19 @@ def main(): parser.add_argument( "-l", "--logfile", default="", help="save log output to LOGFILE" ) + parser.add_argument( + "-j", + "--threads", + default=1, + help="Number of parallel downloads to attempt in batch mode", + type=int, + ) + return parser + + +def main(): + parser = construct_parser() args = parser.parse_args() loglevel = logging.WARNING @@ -165,12 +172,13 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, url in enumerate(dl_list, start=1): - s_url = url.strip() + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{s_url}' at line {line_num}:") - download(s_url, args) + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + download(file, args) else: # single download mode - dl = args.FILE - download(dl, args) + file = prep_download(args.FILE, args) + download(file, args) -- cgit v1.2.3 From 93e879e30ec2776c5d347e72be32f3ef30bd1410 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:28:23 -0700 Subject: Add parallel download option in batch mode Number of download threads can be set with new -j option. Unfortunately, it's not that much faster than downloading in serial, since the API calls made before the downloads actually start are not (and ideally should not be) parallelized. Still, for large batches, it saves a bit of time. Known issue: due to the download threads writing to the log asynchronously, the messages get jumbled up. This will be fixed eventually. --- src/wikiget/wikiget.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 80d5057..c16d3f6 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,6 +18,7 @@ import argparse import logging import sys +from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download @@ -172,12 +173,18 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - download(file, args) + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3 From 43c1fc258499f54977a1b7b594b295c2dae03114 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 16:07:54 -0700 Subject: Reduce repeated code in log configuration --- src/wikiget/wikiget.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index c16d3f6..51c870a 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -127,11 +127,13 @@ def main(): # configure logging: # console log level is set via -v, -vv, and -q options; # file log level is always info (TODO: add debug option) + base_format = "%(threadName)s - %(message)s" + log_format = "[%(levelname)s] " + base_format if args.logfile: # log to console and file logging.basicConfig( level=logging.INFO, - format="%(asctime)s [%(levelname)-7s] %(message)s", + format="%(asctime)s [%(levelname)-7s] " + base_format, filename=args.logfile, ) @@ -139,11 +141,11 @@ def main(): # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't # printing to console console.setLevel(loglevel) - console.setFormatter(logging.Formatter("[%(levelname)s] %(message)s")) + console.setFormatter(logging.Formatter(log_format)) logging.getLogger("").addHandler(console) else: # log only to console - logging.basicConfig(level=loglevel, format="[%(levelname)s] %(message)s") + logging.basicConfig(level=loglevel, format=log_format) # log events are appended to the file if it already exists, so note the start of a # new download session @@ -173,7 +175,10 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - with ThreadPoolExecutor(max_workers=args.threads) as executor: + with ThreadPoolExecutor( + max_workers=args.threads, + thread_name_prefix="download", + ) as executor: futures = [] for line_num, line in enumerate(dl_list, start=1): url = line.strip() -- cgit v1.2.3 From 206f0fe0b97610fc371ad0acdd5146ac12eacfe7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 9 Oct 2023 13:50:30 -0700 Subject: Style cleanup --- src/wikiget/wikiget.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 51c870a..8c067e0 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -104,7 +104,7 @@ def construct_parser(): "-j", "--threads", default=1, - help="Number of parallel downloads to attempt in batch mode", + help="number of parallel downloads to attempt in batch mode", type=int, ) -- cgit v1.2.3 From 8b70abecb543099528ecc8c3b1edfe0330d3d223 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 10:11:20 -0700 Subject: Refactor code and improve docstrings --- src/wikiget/wikiget.py | 87 +++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 40 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 8c067e0..c470b46 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -111,10 +111,7 @@ def construct_parser(): return parser -def main(): - parser = construct_parser() - args = parser.parse_args() - +def configure_logging(args): loglevel = logging.WARNING if args.verbose >= wikiget.VERY_VERBOSE: # this includes API and library messages @@ -147,6 +144,51 @@ def main(): # log only to console logging.basicConfig(level=loglevel, format=log_format) + +def batch_download(args): + input_file = args.FILE + dl_list = [] + + logging.info(f"Using batch file '{input_file}'.") + + try: + fd = open(input_file) + except OSError as e: + logging.error("File could not be read. The following error was encountered:") + logging.error(e) + sys.exit(1) + else: + with fd: + # store file contents in memory in case something happens to the file + # while we're downloading + for _, line in enumerate(fd): + dl_list.append(line) + + # TODO: validate file contents before download process starts + with ThreadPoolExecutor( + max_workers=args.threads, + thread_name_prefix="download", + ) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() + + +def main(): + # setup + parser = construct_parser() + args = parser.parse_args() + + configure_logging(args) + # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") @@ -154,42 +196,7 @@ def main(): if args.batch: # batch download mode - input_file = args.FILE - dl_list = [] - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file) - except OSError as e: - logging.error( - "File could not be read. The following error was encountered:" - ) - logging.error(e) - sys.exit(1) - else: - with fd: - # store file contents in memory in case something happens to the file - # while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) - - # TODO: validate file contents before download process starts - with ThreadPoolExecutor( - max_workers=args.threads, - thread_name_prefix="download", - ) as executor: - futures = [] - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - future = executor.submit(download, file, args) - futures.append(future) - # wait for downloads to finish - for future in futures: - future.result() + batch_download(args) else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3 From 226b7cb84070c6d073e153ad410fca7798c8e334 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 11:13:04 -0700 Subject: Change logfile log level to debug --- src/wikiget/wikiget.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index c470b46..5b36ce5 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -123,20 +123,18 @@ def configure_logging(args): # configure logging: # console log level is set via -v, -vv, and -q options; - # file log level is always info (TODO: add debug option) + # file log level is always debug (TODO: make this user configurable) base_format = "%(threadName)s - %(message)s" log_format = "[%(levelname)s] " + base_format if args.logfile: # log to console and file logging.basicConfig( - level=logging.INFO, + level=logging.DEBUG, format="%(asctime)s [%(levelname)-7s] " + base_format, filename=args.logfile, ) console = logging.StreamHandler() - # TODO: even when loglevel is set to logging.DEBUG, debug messages aren't - # printing to console console.setLevel(loglevel) console.setFormatter(logging.Formatter(log_format)) logging.getLogger("").addHandler(console) @@ -192,7 +190,6 @@ def main(): # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") - # logging.info(f"Log level is set to {loglevel}") if args.batch: # batch download mode -- cgit v1.2.3 From 87052196874cc1bf82f70a6f5aa8e6df59bc1537 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 11:13:31 -0700 Subject: Revise batch file parsing to ignore blank and commented lines Previously, blank lines would cause an error and lines prepended with "#" would be downloaded like any other, assuming they were valid. Now, "#" can be used to mark ignored files or comments. --- src/wikiget/wikiget.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 5b36ce5..fba9509 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -145,7 +145,7 @@ def configure_logging(args): def batch_download(args): input_file = args.FILE - dl_list = [] + dl_list = {} logging.info(f"Using batch file '{input_file}'.") @@ -157,10 +157,12 @@ def batch_download(args): sys.exit(1) else: with fd: - # store file contents in memory in case something happens to the file - # while we're downloading - for _, line in enumerate(fd): - dl_list.append(line) + # read the file into memory and process each line as we go + for line_num, line in enumerate(fd, start=1): + line_s = line.strip() + # ignore blank lines and lines starting with "#" (for comments) + if line_s and not line_s.startswith("#"): + dl_list[line_num] = line_s # TODO: validate file contents before download process starts with ThreadPoolExecutor( @@ -168,11 +170,10 @@ def batch_download(args): thread_name_prefix="download", ) as executor: futures = [] - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() + for line_num, line in dl_list.items(): # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) + logging.info(f"Downloading '{line}' at line {line_num}") + file = prep_download(line, args) future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish -- cgit v1.2.3 From 630541499a58f98c55d5cc372d21e745c106d250 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 13 Oct 2023 12:24:13 -0700 Subject: Refactor parsing logic and revise exception handling --- src/wikiget/wikiget.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index fba9509..68e0233 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download +from wikiget.exceptions import ParseError def construct_parser(): @@ -173,7 +174,10 @@ def batch_download(args): for line_num, line in dl_list.items(): # keep track of batch file line numbers for debugging/logging purposes logging.info(f"Downloading '{line}' at line {line_num}") - file = prep_download(line, args) + try: + file = prep_download(line, args) + except ParseError as e: + logging.warning(f"{e} (line {line_num})") future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish @@ -197,5 +201,9 @@ def main(): batch_download(args) else: # single download mode - file = prep_download(args.FILE, args) + try: + file = prep_download(args.FILE, args) + except ParseError as e: + logging.error(e) + sys.exit(1) download(file, args) -- cgit v1.2.3 From 06335ba0176cabd84f5b548995f465ac1c09bc8e Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 17 Oct 2023 14:00:14 -0700 Subject: Clean up exception handling and error messages --- src/wikiget/wikiget.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 68e0233..4446f96 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -20,6 +20,9 @@ import logging import sys from concurrent.futures import ThreadPoolExecutor +from mwclient import APIError, InvalidResponse, LoginError +from requests import ConnectionError, HTTPError + import wikiget from wikiget.dl import download, prep_download from wikiget.exceptions import ParseError @@ -178,6 +181,10 @@ def batch_download(args): file = prep_download(line, args) except ParseError as e: logging.warning(f"{e} (line {line_num})") + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + logging.error( + f"Unable to download '{line}' (line {line_num}) due to an error" + ) future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish @@ -198,6 +205,8 @@ def main(): if args.batch: # batch download mode + # TODO: return non-zero exit code if any errors were encountered, even if some + # downloads completed successfully batch_download(args) else: # single download mode @@ -206,4 +215,6 @@ def main(): except ParseError as e: logging.error(e) sys.exit(1) + except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): + sys.exit(1) download(file, args) -- cgit v1.2.3 From ba1f10666554316c262efd2ee6950560560317c7 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 12:59:08 -0700 Subject: Fix bug in batch downloading An invalid line in the batch file would cause the last valid file to be downloaded twice. --- src/wikiget/wikiget.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 4446f96..af13bc8 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -181,10 +181,12 @@ def batch_download(args): file = prep_download(line, args) except ParseError as e: logging.warning(f"{e} (line {line_num})") + continue except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): logging.error( f"Unable to download '{line}' (line {line_num}) due to an error" ) + continue future = executor.submit(download, file, args) futures.append(future) # wait for downloads to finish -- cgit v1.2.3 From 05457af0d73ff3a820c0b465e6607fc5832a6e74 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:23:28 -0700 Subject: Reorganize File class --- src/wikiget/wikiget.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index af13bc8..90078e1 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -195,15 +195,15 @@ def batch_download(args): def main(): - # setup + # setup our environment parser = construct_parser() args = parser.parse_args() - configure_logging(args) # log events are appended to the file if it already exists, so note the start of a # new download session logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}") + logging.debug(f"User agent: {wikiget.USER_AGENT}") if args.batch: # batch download mode -- cgit v1.2.3 From b136af078208882ae696b21c0d8aac009e7468d4 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:28:23 -0700 Subject: Move batch_download function to proper file --- src/wikiget/wikiget.py | 67 +++++++++----------------------------------------- 1 file changed, 12 insertions(+), 55 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 90078e1..e9a1147 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,13 +18,12 @@ import argparse import logging import sys -from concurrent.futures import ThreadPoolExecutor from mwclient import APIError, InvalidResponse, LoginError from requests import ConnectionError, HTTPError import wikiget -from wikiget.dl import download, prep_download +from wikiget.dl import batch_download, download, prep_download from wikiget.exceptions import ParseError @@ -145,55 +144,6 @@ def configure_logging(args): else: # log only to console logging.basicConfig(level=loglevel, format=log_format) - - -def batch_download(args): - input_file = args.FILE - dl_list = {} - - logging.info(f"Using batch file '{input_file}'.") - - try: - fd = open(input_file) - except OSError as e: - logging.error("File could not be read. The following error was encountered:") - logging.error(e) - sys.exit(1) - else: - with fd: - # read the file into memory and process each line as we go - for line_num, line in enumerate(fd, start=1): - line_s = line.strip() - # ignore blank lines and lines starting with "#" (for comments) - if line_s and not line_s.startswith("#"): - dl_list[line_num] = line_s - - # TODO: validate file contents before download process starts - with ThreadPoolExecutor( - max_workers=args.threads, - thread_name_prefix="download", - ) as executor: - futures = [] - for line_num, line in dl_list.items(): - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{line}' at line {line_num}") - try: - file = prep_download(line, args) - except ParseError as e: - logging.warning(f"{e} (line {line_num})") - continue - except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): - logging.error( - f"Unable to download '{line}' (line {line_num}) due to an error" - ) - continue - future = executor.submit(download, file, args) - futures.append(future) - # wait for downloads to finish - for future in futures: - future.result() - - def main(): # setup our environment parser = construct_parser() @@ -207,9 +157,14 @@ def main(): if args.batch: # batch download mode - # TODO: return non-zero exit code if any errors were encountered, even if some - # downloads completed successfully - batch_download(args) + errors = batch_download(args) + if errors: + # return non-zero exit code if any problems were encountered, even if some + # downloads completed successfully + logging.warning( + f"{errors} problem{'s'[:errors^1]} encountered during batch processing" + ) + sys.exit(1) else: # single download mode try: @@ -219,4 +174,6 @@ def main(): sys.exit(1) except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError): sys.exit(1) - download(file, args) + errors = download(file, args) + if errors: + sys.exit(1) -- cgit v1.2.3 From 3d37cf6f86eb6c48a3a0a094c42ade6d7aed1daf Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:31:56 -0700 Subject: Move logging configuration to new file Also, use a LoggerAdapter to add contextual info (such as filenames) to log messages when downloading, especially useful with threaded batch processing. --- src/wikiget/wikiget.py | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index e9a1147..5b84dac 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -25,6 +25,7 @@ from requests import ConnectionError, HTTPError import wikiget from wikiget.dl import batch_download, download, prep_download from wikiget.exceptions import ParseError +from wikiget.logging import configure_logging def construct_parser(): @@ -114,36 +115,6 @@ def construct_parser(): return parser -def configure_logging(args): - loglevel = logging.WARNING - if args.verbose >= wikiget.VERY_VERBOSE: - # this includes API and library messages - loglevel = logging.DEBUG - elif args.verbose >= wikiget.STD_VERBOSE: - loglevel = logging.INFO - elif args.quiet: - loglevel = logging.ERROR - - # configure logging: - # console log level is set via -v, -vv, and -q options; - # file log level is always debug (TODO: make this user configurable) - base_format = "%(threadName)s - %(message)s" - log_format = "[%(levelname)s] " + base_format - if args.logfile: - # log to console and file - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s [%(levelname)-7s] " + base_format, - filename=args.logfile, - ) - - console = logging.StreamHandler() - console.setLevel(loglevel) - console.setFormatter(logging.Formatter(log_format)) - logging.getLogger("").addHandler(console) - else: - # log only to console - logging.basicConfig(level=loglevel, format=log_format) def main(): # setup our environment parser = construct_parser() -- cgit v1.2.3 From c1820026f97eaf671c29ab30f02879de0ac4df89 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 20 Oct 2023 16:36:14 -0700 Subject: Add type annotations to source files --- src/wikiget/wikiget.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/wikiget/wikiget.py') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 5b84dac..e64d00e 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -28,7 +28,7 @@ from wikiget.exceptions import ParseError from wikiget.logging import configure_logging -def construct_parser(): +def construct_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description=""" A tool for downloading files from MediaWiki sites using the file name or @@ -115,7 +115,7 @@ def construct_parser(): return parser -def main(): +def main() -> None: # setup our environment parser = construct_parser() args = parser.parse_args() -- cgit v1.2.3