From 93e879e30ec2776c5d347e72be32f3ef30bd1410 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Tue, 3 Oct 2023 13:28:23 -0700 Subject: Add parallel download option in batch mode. The number of download threads can be set with the new -j option. Unfortunately, it's not that much faster than downloading serially, since the API calls made before the downloads actually start are not (and ideally should not be) parallelized. Still, for large batches, it saves a bit of time. Known issue: because the download threads write to the log asynchronously, the messages get jumbled up. This will be fixed eventually. --- src/wikiget/wikiget.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'src/wikiget') diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py index 80d5057..c16d3f6 100644 --- a/src/wikiget/wikiget.py +++ b/src/wikiget/wikiget.py @@ -18,6 +18,7 @@ import argparse import logging import sys +from concurrent.futures import ThreadPoolExecutor import wikiget from wikiget.dl import download, prep_download @@ -172,12 +173,18 @@ def main(): dl_list.append(line) # TODO: validate file contents before download process starts - for line_num, line in enumerate(dl_list, start=1): - url = line.strip() - # keep track of batch file line numbers for debugging/logging purposes - logging.info(f"Downloading '{url}' at line {line_num}:") - file = prep_download(url, args) - download(file, args) + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for line_num, line in enumerate(dl_list, start=1): + url = line.strip() + # keep track of batch file line numbers for debugging/logging purposes + logging.info(f"Downloading '{url}' at line {line_num}:") + file = prep_download(url, args) + future = executor.submit(download, file, args) + futures.append(future) + # wait for downloads to finish + for future in futures: + future.result() else: # single download mode file = prep_download(args.FILE, args) -- cgit v1.2.3