diff options
| author | Cody Logan <cody@lokken.dev> | 2023-11-20 10:05:27 -0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-11-20 10:05:27 -0800 |
| commit | 2c6efe630af507f7f1b76048100f6500244151a3 (patch) | |
| tree | 8ad0ab091956a665a2d2a5471b36b1cf0f856990 /src/wikiget | |
| parent | d78f25717567870d4bb991e9bca7451f925f29ac (diff) | |
| parent | 7c5e87695f56ea3de7a04082b357998fc825e625 (diff) | |
| download | wikiget-2c6efe630af507f7f1b76048100f6500244151a3.tar.gz wikiget-2c6efe630af507f7f1b76048100f6500244151a3.zip | |
Merge pull request #15 from clpo13/reduce-api-calls
Speed up batch downloads by reducing the number of API calls
Diffstat (limited to 'src/wikiget')
| -rw-r--r-- | src/wikiget/client.py | 10 |
| -rw-r--r-- | src/wikiget/dl.py | 39 |
| -rw-r--r-- | src/wikiget/file.py | 8 |
3 files changed, 43 insertions, 14 deletions
```diff
diff --git a/src/wikiget/client.py b/src/wikiget/client.py
index 3a6e40f..2729144 100644
--- a/src/wikiget/client.py
+++ b/src/wikiget/client.py
@@ -17,15 +17,21 @@
 """Handle API calls (via mwclient) for site and image information."""
 
+from __future__ import annotations
+
 import logging
-from argparse import Namespace
+from typing import TYPE_CHECKING
 
 from mwclient import APIError, InvalidResponse, LoginError, Site
-from mwclient.image import Image
 from requests import ConnectionError, HTTPError
 
 import wikiget
 
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from mwclient.image import Image
+
 logger = logging.getLogger(__name__)
diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py
index 5e2a6cb..85c6685 100644
--- a/src/wikiget/dl.py
+++ b/src/wikiget/dl.py
@@ -17,33 +17,35 @@
 """Prepare and process file downloads."""
 
+from __future__ import annotations
+
 import logging
 import sys
-from argparse import Namespace
 from concurrent.futures import ThreadPoolExecutor
+from typing import TYPE_CHECKING
 
-from mwclient import APIError, InvalidResponse, LoginError
+from mwclient import APIError, InvalidResponse, LoginError, Site
 from requests import ConnectionError, HTTPError
 from tqdm import tqdm
 
 import wikiget
 from wikiget.client import connect_to_site, query_api
 from wikiget.exceptions import ParseError
-from wikiget.file import File
 from wikiget.logging import FileLogAdapter
 from wikiget.parse import get_dest, read_batch_file
 from wikiget.validations import verify_hash
 
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from wikiget.file import File
+
 logger = logging.getLogger(__name__)
 
 
 def prep_download(dl: str, args: Namespace) -> File:
     """Prepare to download a file by parsing the filename or URL and CLI arguments.
 
-    First, the target is parsed for a valid name, destination, and site. If there are no
-    problems creating a File with this information, we connect to the site hosting it
-    and fetch the relevant Image object, which is added as an attribute to the File.
-
     :param dl: a string representing the file or URL to download
     :type dl: str
     :param args: command-line arguments and their values
@@ -59,8 +61,6 @@ def prep_download(dl: str, args: Namespace) -> File:
         msg = f"[{file.dest}] File already exists; skipping download (use -f to force)"
         raise FileExistsError(msg)
 
-    site = connect_to_site(file.site, args)
-    file.image = query_api(file.name, site)
     return file
@@ -98,6 +98,8 @@ def process_download(args: Namespace) -> int:
     # single download mode
     try:
         file = prep_download(args.FILE, args)
+        site = connect_to_site(file.site, args)
+        file.image = query_api(file.name, site)
     except ParseError as e:
         logger.error(e)
         exit_code = 1
@@ -134,14 +136,31 @@ def batch_download(args: Namespace) -> int:
         logger.error("File could not be read: %s", str(e))
         sys.exit(1)
 
-    # TODO: validate file contents before download process starts
     with ThreadPoolExecutor(max_workers=args.threads) as executor:
         futures = []
+        sites: list[Site] = []
         for line_num, line in dl_dict.items():
             # keep track of batch file line numbers for debugging/logging purposes
             logger.info("Processing '%s' at line %i", line, line_num)
             try:
                 file = prep_download(line, args)
+                site = next(
+                    filter(
+                        lambda site: site.host == file.site,
+                        sites,
+                    ),
+                    None,
+                )
+                # if there's already a Site object matching the desired host, reuse it
+                # to reduce the number of API calls made per file
+                if site:
+                    logger.debug("Reusing the existing connection to %s", site.host)
+                else:
+                    logger.debug("Making a new connection to %s", file.site)
+                    site = connect_to_site(file.site, args)
+                    # cache the new Site for reuse
+                    sites.append(site)
+                file.image = query_api(file.name, site)
             except ParseError as e:
                 logger.warning("%s (line %i)", str(e), line_num)
                 errors += 1
diff --git a/src/wikiget/file.py b/src/wikiget/file.py
index 36ce892..0b0c1e0 100644
--- a/src/wikiget/file.py
+++ b/src/wikiget/file.py
@@ -17,12 +17,16 @@
 """Define a File class for representing individual files to be downloaded."""
 
-from pathlib import Path
+from __future__ import annotations
 
-from mwclient.image import Image
+from pathlib import Path
+from typing import TYPE_CHECKING
 
 from wikiget import DEFAULT_SITE
 
+if TYPE_CHECKING:
+    from mwclient.image import Image
+
 
 class File:
     """A file object."""
```
