diff options
| author | Cody Logan <cody@lokken.dev> | 2023-11-16 10:23:49 -0800 |
|---|---|---|
| committer | Cody Logan <cody@lokken.dev> | 2023-11-16 10:23:49 -0800 |
| commit | 2f3074e1b2a62cbd5e32778abc0ff82027c1ce3b (patch) | |
| tree | c8e7fc5662cf1a34a5b2ecbd8b2bbbbaa630b19c /src/wikiget/dl.py | |
| parent | ce58a03caa6f4d9e3cb01898b4b73716031b24dd (diff) | |
| download | wikiget-2f3074e1b2a62cbd5e32778abc0ff82027c1ce3b.tar.gz wikiget-2f3074e1b2a62cbd5e32778abc0ff82027c1ce3b.zip | |
Reuse existing Site object when possible in batch downloads
Previously, every file downloaded in a batch would create a new Site
object. Now, the Site object created by the first file will be reused
by subsequent files if it matches the file's requested host, which will
significantly speed up the download process, assuming all files are from
the same site. This is a quick-and-dirty fix that could be improved to
better handle situations where there is a mix of files from different
sites.
Diffstat (limited to 'src/wikiget/dl.py')
| -rw-r--r-- | src/wikiget/dl.py | 19 |
1 file changed, 12 insertions, 7 deletions
diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 60405d0..c9b2ed5 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -20,7 +20,7 @@ import sys from argparse import Namespace from concurrent.futures import ThreadPoolExecutor -from mwclient import APIError, InvalidResponse, LoginError +from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError from tqdm import tqdm @@ -38,10 +38,6 @@ logger = logging.getLogger(__name__) def prep_download(dl: str, args: Namespace) -> File: """Prepare to download a file by parsing the filename or URL and CLI arguments. - First, the target is parsed for a valid name, destination, and site. If there are no - problems creating a File with this information, we connect to the site hosting it - and fetch the relevant Image object, which is added as an attribute to the File. - :param dl: a string representing the file or URL to download :type dl: str :param args: command-line arguments and their values @@ -57,8 +53,6 @@ def prep_download(dl: str, args: Namespace) -> File: msg = f"[{file.dest}] File already exists; skipping download (use -f to force)" raise FileExistsError(msg) - site = connect_to_site(file.site, args) - file.image = query_api(file.name, site) return file @@ -96,6 +90,8 @@ def process_download(args: Namespace) -> int: # single download mode try: file = prep_download(args.FILE, args) + site = connect_to_site(file.site, args) + file.image = query_api(file.name, site) except ParseError as e: logger.error(e) exit_code = 1 @@ -135,11 +131,20 @@ def batch_download(args: Namespace) -> int: # TODO: validate file contents before download process starts with ThreadPoolExecutor(max_workers=args.threads) as executor: futures = [] + site: Site = None for line_num, line in dl_dict.items(): # keep track of batch file line numbers for debugging/logging purposes logger.info("Processing '%s' at line %i", line, line_num) try: file = prep_download(line, args) + # if there's 
already a Site object matching the desired host, reuse it + # to reduce the number of API calls made per file + if not site or site.host != file.site: + logger.debug("Made a new site connection") + site = connect_to_site(file.site, args) + else: + logger.debug("Reused an existing site connection") + file.image = query_api(file.name, site) except ParseError as e: logger.warning("%s (line %i)", str(e), line_num) errors += 1 |
