diff options
| author | Cody Logan <cody@lokken.dev> | 2023-11-20 10:05:27 -0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-11-20 10:05:27 -0800 |
| commit | 2c6efe630af507f7f1b76048100f6500244151a3 (patch) | |
| tree | 8ad0ab091956a665a2d2a5471b36b1cf0f856990 /src/wikiget | |
| parent | d78f25717567870d4bb991e9bca7451f925f29ac (diff) | |
| parent | 7c5e87695f56ea3de7a04082b357998fc825e625 (diff) | |
| download | wikiget-2c6efe630af507f7f1b76048100f6500244151a3.tar.gz wikiget-2c6efe630af507f7f1b76048100f6500244151a3.zip | |
Merge pull request #15 from clpo13/reduce-api-calls
Speed up batch downloads by reducing the number of API calls
Diffstat (limited to 'src/wikiget')
| -rw-r--r-- | src/wikiget/client.py | 10 |
| -rw-r--r-- | src/wikiget/dl.py | 39 |
| -rw-r--r-- | src/wikiget/file.py | 8 |
3 files changed, 43 insertions, 14 deletions
```diff
diff --git a/src/wikiget/client.py b/src/wikiget/client.py
index 3a6e40f..2729144 100644
--- a/src/wikiget/client.py
+++ b/src/wikiget/client.py
@@ -17,15 +17,21 @@
 """Handle API calls (via mwclient) for site and image information."""
 
+from __future__ import annotations
+
 import logging
-from argparse import Namespace
+from typing import TYPE_CHECKING
 
 from mwclient import APIError, InvalidResponse, LoginError, Site
-from mwclient.image import Image
 from requests import ConnectionError, HTTPError
 
 import wikiget
 
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from mwclient.image import Image
+
 logger = logging.getLogger(__name__)
diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py
index 5e2a6cb..85c6685 100644
--- a/src/wikiget/dl.py
+++ b/src/wikiget/dl.py
@@ -17,33 +17,35 @@
 """Prepare and process file downloads."""
 
+from __future__ import annotations
+
 import logging
 import sys
-from argparse import Namespace
 from concurrent.futures import ThreadPoolExecutor
+from typing import TYPE_CHECKING
 
-from mwclient import APIError, InvalidResponse, LoginError
+from mwclient import APIError, InvalidResponse, LoginError, Site
 from requests import ConnectionError, HTTPError
 from tqdm import tqdm
 
 import wikiget
 from wikiget.client import connect_to_site, query_api
 from wikiget.exceptions import ParseError
-from wikiget.file import File
 from wikiget.logging import FileLogAdapter
 from wikiget.parse import get_dest, read_batch_file
 from wikiget.validations import verify_hash
 
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from wikiget.file import File
+
 logger = logging.getLogger(__name__)
 
 
 def prep_download(dl: str, args: Namespace) -> File:
     """Prepare to download a file by parsing the filename or URL and CLI arguments.
 
-    First, the target is parsed for a valid name, destination, and site. If there are no
-    problems creating a File with this information, we connect to the site hosting it
-    and fetch the relevant Image object, which is added as an attribute to the File.
-
     :param dl: a string representing the file or URL to download
     :type dl: str
     :param args: command-line arguments and their values
@@ -59,8 +61,6 @@ def prep_download(dl: str, args: Namespace) -> File:
         msg = f"[{file.dest}] File already exists; skipping download (use -f to force)"
         raise FileExistsError(msg)
 
-    site = connect_to_site(file.site, args)
-    file.image = query_api(file.name, site)
     return file
@@ -98,6 +98,8 @@ def process_download(args: Namespace) -> int:
     # single download mode
     try:
         file = prep_download(args.FILE, args)
+        site = connect_to_site(file.site, args)
+        file.image = query_api(file.name, site)
     except ParseError as e:
         logger.error(e)
         exit_code = 1
@@ -134,14 +136,31 @@ def batch_download(args: Namespace) -> int:
         logger.error("File could not be read: %s", str(e))
         sys.exit(1)
 
-    # TODO: validate file contents before download process starts
     with ThreadPoolExecutor(max_workers=args.threads) as executor:
         futures = []
+        sites: list[Site] = []
         for line_num, line in dl_dict.items():
             # keep track of batch file line numbers for debugging/logging purposes
             logger.info("Processing '%s' at line %i", line, line_num)
             try:
                 file = prep_download(line, args)
+                site = next(
+                    filter(
+                        lambda site: site.host == file.site,
+                        sites,
+                    ),
+                    None,
+                )
+                # if there's already a Site object matching the desired host, reuse it
+                # to reduce the number of API calls made per file
+                if site:
+                    logger.debug("Reusing the existing connection to %s", site.host)
+                else:
+                    logger.debug("Making a new connection to %s", file.site)
+                    site = connect_to_site(file.site, args)
+                    # cache the new Site for reuse
+                    sites.append(site)
+                file.image = query_api(file.name, site)
             except ParseError as e:
                 logger.warning("%s (line %i)", str(e), line_num)
                 errors += 1
diff --git a/src/wikiget/file.py b/src/wikiget/file.py
index 36ce892..0b0c1e0 100644
--- a/src/wikiget/file.py
+++ b/src/wikiget/file.py
@@ -17,12 +17,16 @@
 """Define a File class for representing individual files to be downloaded."""
 
-from pathlib import Path
+from __future__ import annotations
 
-from mwclient.image import Image
+from pathlib import Path
+from typing import TYPE_CHECKING
 
 from wikiget import DEFAULT_SITE
 
+if TYPE_CHECKING:
+    from mwclient.image import Image
+
 
 class File:
     """A file object."""
```
