From b136af078208882ae696b21c0d8aac009e7468d4 Mon Sep 17 00:00:00 2001
From: Cody Logan
Date: Fri, 20 Oct 2023 16:28:23 -0700
Subject: Move batch_download function to proper file

---
 src/wikiget/dl.py      | 63 +++++++++++++++++++++++++++++++++++++++++++----
 src/wikiget/wikiget.py | 67 +++++++++-----------------------------------------
 2 files changed, 70 insertions(+), 60 deletions(-)

(limited to 'src/wikiget')

diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py
index 171b017..83aef9f 100644
--- a/src/wikiget/dl.py
+++ b/src/wikiget/dl.py
@@ -18,12 +18,14 @@
 import logging
 import os
 import sys
+from concurrent.futures import ThreadPoolExecutor
 
 from mwclient import APIError, InvalidResponse, LoginError, Site
 from requests import ConnectionError, HTTPError
 from tqdm import tqdm
 
 import wikiget
+from wikiget.exceptions import ParseError
 from wikiget.file import File
 from wikiget.parse import get_dest
 from wikiget.validations import verify_hash
@@ -78,12 +80,62 @@ def prep_download(dl, args):
     return file
 
 
+def batch_download(args):
+    input_file = args.FILE
+    dl_list = {}
+    errors = 0
+
+    logging.info(f"Using batch file '{input_file}'.")
+
+    try:
+        fd = open(input_file)
+    except OSError as e:
+        logging.error("File could not be read. The following error was encountered:")
+        logging.error(e)
+        sys.exit(1)
+    else:
+        with fd:
+            # read the file into memory and process each line as we go
+            for line_num, line in enumerate(fd, start=1):
+                line_s = line.strip()
+                # ignore blank lines and lines starting with "#" (for comments)
+                if line_s and not line_s.startswith("#"):
+                    dl_list[line_num] = line_s
+
+    # TODO: validate file contents before download process starts
+    with ThreadPoolExecutor(max_workers=args.threads) as executor:
+        futures = []
+        for line_num, line in dl_list.items():
+            # keep track of batch file line numbers for debugging/logging purposes
+            logging.info(f"Processing '{line}' at line {line_num}")
+            try:
+                file = prep_download(line, args)
+            except ParseError as e:
+                logging.warning(f"{e} (line {line_num})")
+                errors += 1
+                continue
+            except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError):
+                logging.warning(
+                    f"Unable to download '{line}' (line {line_num}) due to an error"
+                )
+                errors += 1
+                continue
+            future = executor.submit(download, file, args)
+            futures.append(future)
+        # wait for downloads to finish
+        for future in futures:
+            errors += future.result()
+    return errors
+
+
 def download(f, args):
     file = f.image
     filename = f.name
     dest = f.dest
     site = file.site
 
+    errors = 0
+
     if file.exists:
         # file exists either locally or at a common repository, like Wikimedia Commons
         file_url = file.imageinfo["url"]
@@ -100,6 +152,7 @@ def download(f, args):
             logging.warning(
                 f"File '{dest}' already exists, skipping download (use -f to force)"
             )
+            errors += 1
         else:
             try:
                 fd = open(dest, "wb")
@@ -108,7 +161,7 @@ def download(f, args):
                     "File could not be written. The following error was encountered:"
                 )
                 logging.error(e)
-                sys.exit(1)
+                errors += 1
             else:
                 # download the file(s)
                 if args.verbose >= wikiget.STD_VERBOSE:
@@ -143,11 +196,11 @@ def download(f, args):
                 logging.info(success_log)
             else:
                 logging.error("Hash mismatch! Downloaded file may be corrupt.")
-                # TODO: log but don't quit while in batch mode
-                sys.exit(1)
+                errors += 1
 
     else:
         # no file information returned
         logging.error(f"Target '{filename}' does not appear to be a valid file")
-        # TODO: log but don't quit while in batch mode
-        sys.exit(1)
+        errors += 1
+
+    return errors
diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py
index 90078e1..e9a1147 100644
--- a/src/wikiget/wikiget.py
+++ b/src/wikiget/wikiget.py
@@ -18,13 +18,12 @@
 import argparse
 import logging
 import sys
-from concurrent.futures import ThreadPoolExecutor
 
 from mwclient import APIError, InvalidResponse, LoginError
 from requests import ConnectionError, HTTPError
 
 import wikiget
-from wikiget.dl import download, prep_download
+from wikiget.dl import batch_download, download, prep_download
 from wikiget.exceptions import ParseError
 
 
@@ -145,55 +144,6 @@ def configure_logging(args):
     else:
         # log only to console
         logging.basicConfig(level=loglevel, format=log_format)
-
-
-def batch_download(args):
-    input_file = args.FILE
-    dl_list = {}
-
-    logging.info(f"Using batch file '{input_file}'.")
-
-    try:
-        fd = open(input_file)
-    except OSError as e:
-        logging.error("File could not be read. The following error was encountered:")
-        logging.error(e)
-        sys.exit(1)
-    else:
-        with fd:
-            # read the file into memory and process each line as we go
-            for line_num, line in enumerate(fd, start=1):
-                line_s = line.strip()
-                # ignore blank lines and lines starting with "#" (for comments)
-                if line_s and not line_s.startswith("#"):
-                    dl_list[line_num] = line_s
-
-    # TODO: validate file contents before download process starts
-    with ThreadPoolExecutor(
-        max_workers=args.threads,
-        thread_name_prefix="download",
-    ) as executor:
-        futures = []
-        for line_num, line in dl_list.items():
-            # keep track of batch file line numbers for debugging/logging purposes
-            logging.info(f"Downloading '{line}' at line {line_num}")
-            try:
-                file = prep_download(line, args)
-            except ParseError as e:
-                logging.warning(f"{e} (line {line_num})")
-                continue
-            except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError):
-                logging.error(
-                    f"Unable to download '{line}' (line {line_num}) due to an error"
-                )
-                continue
-            future = executor.submit(download, file, args)
-            futures.append(future)
-        # wait for downloads to finish
-        for future in futures:
-            future.result()
-
-
 def main():
     # setup our environment
     parser = construct_parser()
@@ -207,9 +157,14 @@ def main():
 
     if args.batch:
         # batch download mode
-        # TODO: return non-zero exit code if any errors were encountered, even if some
-        # downloads completed successfully
-        batch_download(args)
+        errors = batch_download(args)
+        if errors:
+            # return non-zero exit code if any problems were encountered, even if some
+            # downloads completed successfully
+            logging.warning(
+                f"{errors} problem{'s'[:errors^1]} encountered during batch processing"
+            )
+            sys.exit(1)
     else:
         # single download mode
         try:
@@ -219,4 +174,6 @@ def main():
             sys.exit(1)
         except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError):
             sys.exit(1)
-        download(file, args)
+        errors = download(file, args)
+        if errors:
+            sys.exit(1)
-- 
cgit v1.2.3