From 2f3074e1b2a62cbd5e32778abc0ff82027c1ce3b Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Thu, 16 Nov 2023 10:23:49 -0800 Subject: Reuse existing Site object when possible in batch downloads Previously, every file downloaded in a batch would create a new Site object. Now, the Site object created by the first file will be reused by subsequent files if it matches the file's requested host, which will significantly speed up the download process, assuming all files are from the same site. This is a quick and dirty fix which could be improved to better handle situations where there is a mix of files from different sites. --- src/wikiget/dl.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'src/wikiget') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 60405d0..c9b2ed5 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -20,7 +20,7 @@ import sys from argparse import Namespace from concurrent.futures import ThreadPoolExecutor -from mwclient import APIError, InvalidResponse, LoginError +from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError from tqdm import tqdm @@ -38,10 +38,6 @@ logger = logging.getLogger(__name__) def prep_download(dl: str, args: Namespace) -> File: """Prepare to download a file by parsing the filename or URL and CLI arguments. - First, the target is parsed for a valid name, destination, and site. If there are no - problems creating a File with this information, we connect to the site hosting it - and fetch the relevant Image object, which is added as an attribute to the File. 
- :param dl: a string representing the file or URL to download :type dl: str :param args: command-line arguments and their values @@ -57,8 +53,6 @@ def prep_download(dl: str, args: Namespace) -> File: msg = f"[{file.dest}] File already exists; skipping download (use -f to force)" raise FileExistsError(msg) - site = connect_to_site(file.site, args) - file.image = query_api(file.name, site) return file @@ -96,6 +90,8 @@ def process_download(args: Namespace) -> int: # single download mode try: file = prep_download(args.FILE, args) + site = connect_to_site(file.site, args) + file.image = query_api(file.name, site) except ParseError as e: logger.error(e) exit_code = 1 @@ -135,11 +131,20 @@ def batch_download(args: Namespace) -> int: # TODO: validate file contents before download process starts with ThreadPoolExecutor(max_workers=args.threads) as executor: futures = [] + site: Site = None for line_num, line in dl_dict.items(): # keep track of batch file line numbers for debugging/logging purposes logger.info("Processing '%s' at line %i", line, line_num) try: file = prep_download(line, args) + # if there's already a Site object matching the desired host, reuse it + # to reduce the number of API calls made per file + if not site or site.host != file.site: + logger.debug("Made a new site connection") + site = connect_to_site(file.site, args) + else: + logger.debug("Reused an existing site connection") + file.image = query_api(file.name, site) except ParseError as e: logger.warning("%s (line %i)", str(e), line_num) errors += 1 -- cgit v1.2.3 From 6178c170d88434937d28026fe592629bd967681e Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Thu, 16 Nov 2023 12:01:17 -0800 Subject: Code cleanup; reorganize some tests --- src/wikiget/dl.py | 10 ++++++++-- src/wikiget/version.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'src/wikiget') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 9b53d66..c84935b 100644 --- a/src/wikiget/dl.py +++ 
b/src/wikiget/dl.py @@ -15,10 +15,12 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . +from __future__ import annotations + import logging import sys -from argparse import Namespace from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING from mwclient import APIError, InvalidResponse, LoginError, Site from requests import ConnectionError, HTTPError @@ -27,11 +29,15 @@ from tqdm import tqdm import wikiget from wikiget.client import connect_to_site, query_api from wikiget.exceptions import ParseError -from wikiget.file import File from wikiget.logging import FileLogAdapter from wikiget.parse import get_dest, read_batch_file from wikiget.validations import verify_hash +if TYPE_CHECKING: + from argparse import Namespace + + from wikiget.file import File + logger = logging.getLogger(__name__) diff --git a/src/wikiget/version.py b/src/wikiget/version.py index 804c60f..479f200 100644 --- a/src/wikiget/version.py +++ b/src/wikiget/version.py @@ -15,4 +15,4 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . -__version__ = "0.8.0" +__version__ = "0.8.0.dev0" -- cgit v1.2.3 From 06dfda7b5430bfc895a39defad50f184d41281f1 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Thu, 16 Nov 2023 12:29:36 -0800 Subject: Additional type checking import blocks --- src/wikiget/client.py | 10 ++++++++-- src/wikiget/file.py | 8 ++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'src/wikiget') diff --git a/src/wikiget/client.py b/src/wikiget/client.py index 2fc4a6c..64c14d9 100644 --- a/src/wikiget/client.py +++ b/src/wikiget/client.py @@ -15,15 +15,21 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . 
+from __future__ import annotations + import logging -from argparse import Namespace +from typing import TYPE_CHECKING from mwclient import APIError, InvalidResponse, LoginError, Site -from mwclient.image import Image from requests import ConnectionError, HTTPError import wikiget +if TYPE_CHECKING: + from argparse import Namespace + + from mwclient.image import Image + logger = logging.getLogger(__name__) diff --git a/src/wikiget/file.py b/src/wikiget/file.py index a362aff..6fc03a1 100644 --- a/src/wikiget/file.py +++ b/src/wikiget/file.py @@ -15,12 +15,16 @@ # You should have received a copy of the GNU General Public License # along with Wikiget. If not, see . -from pathlib import Path +from __future__ import annotations -from mwclient.image import Image +from pathlib import Path +from typing import TYPE_CHECKING from wikiget import DEFAULT_SITE +if TYPE_CHECKING: + from mwclient.image import Image + class File: """A file object.""" -- cgit v1.2.3 From 682b7b24b84c9d8614cf898a06f67681db222deb Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 17 Nov 2023 16:42:10 -0800 Subject: Cache site connections for reuse in batch downloads --- src/wikiget/dl.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'src/wikiget') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index d3b42fd..2160743 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -139,19 +139,28 @@ def batch_download(args: Namespace) -> int: # TODO: validate file contents before download process starts with ThreadPoolExecutor(max_workers=args.threads) as executor: futures = [] - site: Site = None + sites: list[Site] = [] for line_num, line in dl_dict.items(): # keep track of batch file line numbers for debugging/logging purposes logger.info("Processing '%s' at line %i", line, line_num) try: file = prep_download(line, args) + site = next( + filter( + lambda site: site.host == file.site, + sites, + ), + None, + ) # if there's already a Site object matching the 
desired host, reuse it # to reduce the number of API calls made per file - if not site or site.host != file.site: - logger.debug("Made a new site connection") - site = connect_to_site(file.site, args) + if site: + logger.debug("Reusing the existing connection to %s", site.host) else: - logger.debug("Reused an existing site connection") + logger.debug("Making a new connection to %s", file.site) + site = connect_to_site(file.site, args) + # cache the new Site for reuse + sites.append(site) file.image = query_api(file.name, site) except ParseError as e: logger.warning("%s (line %i)", str(e), line_num) -- cgit v1.2.3 From 7c5e87695f56ea3de7a04082b357998fc825e625 Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Fri, 17 Nov 2023 16:55:47 -0800 Subject: Remove obsolete TODO comment --- src/wikiget/dl.py | 1 - 1 file changed, 1 deletion(-) (limited to 'src/wikiget') diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py index 2160743..85c6685 100644 --- a/src/wikiget/dl.py +++ b/src/wikiget/dl.py @@ -136,7 +136,6 @@ def batch_download(args: Namespace) -> int: logger.error("File could not be read: %s", str(e)) sys.exit(1) - # TODO: validate file contents before download process starts with ThreadPoolExecutor(max_workers=args.threads) as executor: futures = [] sites: list[Site] = [] -- cgit v1.2.3