diff options
| author | Cody Logan <cody@lokken.dev> | 2023-10-13 10:11:20 -0700 |
|---|---|---|
| committer | Cody Logan <cody@lokken.dev> | 2023-10-13 10:11:20 -0700 |
| commit | 8b70abecb543099528ecc8c3b1edfe0330d3d223 (patch) | |
| tree | 939de9ab71d283489406838af4d14ef10ec1528d /src | |
| parent | 5dc9b79bd68d2f7cf0dcf1adfaffd8e07b27c6ba (diff) | |
| download | wikiget-8b70abecb543099528ecc8c3b1edfe0330d3d223.tar.gz wikiget-8b70abecb543099528ecc8c3b1edfe0330d3d223.zip | |
Refactor code and improve docstrings
Diffstat (limited to 'src')
| Mode | File | Changes |
|---|---|---|
| -rw-r--r-- | src/wikiget/file.py | 16 |
| -rw-r--r-- | src/wikiget/validations.py | 26 |
| -rw-r--r-- | src/wikiget/wikiget.py | 87 |
3 files changed, 75 insertions, 54 deletions
```diff
diff --git a/src/wikiget/file.py b/src/wikiget/file.py
index 60a71e0..c1b9ae6 100644
--- a/src/wikiget/file.py
+++ b/src/wikiget/file.py
@@ -17,11 +17,17 @@
 
 
 class File:
-    def __init__(self, name, dest=None):
+    def __init__(self, name, dest=""):
+        """
+        Initializes a new file with the specified name and an optional destination name.
+
+        :param name: name of the file
+        :type name: str
+        :param dest: destination of the file, if different from the name; if not
+            specified, defaults to the name
+        :type dest: str, optional
+        """
         self.object = None
         self.site = None
         self.name = name
-        if dest is None:
-            self.dest = name
-        else:
-            self.dest = dest
+        self.dest = dest if dest else name
diff --git a/src/wikiget/validations.py b/src/wikiget/validations.py
index 8ebd996..1610417 100644
--- a/src/wikiget/validations.py
+++ b/src/wikiget/validations.py
@@ -23,11 +23,14 @@ from wikiget import BLOCKSIZE
 
 def valid_file(search_string):
     """
-    Determines if the given string contains a valid file name, defined as a
-    string ending with a '.' and at least one character, beginning with 'File:'
-    or 'Image:', the standard file prefixes in MediaWiki.
+    Determines if the given string contains a valid file name, defined as a string
+    ending with a '.' and at least one character, beginning with 'File:' or 'Image:',
+    the standard file prefixes in MediaWiki.
 
+    :param search_string: string to validate
+    :type search_string: str
     :returns: a regex Match object if there's a match or None otherwise
+    :rtype: re.Match
     """
     # second group could also restrict to file extensions with three or more
     # letters with ([^/\r\n\t\f\v]+\.\w{3,})
@@ -37,12 +40,15 @@ def valid_file(search_string):
 
 def valid_site(search_string):
     """
-    Determines if the given string contains a valid site name, defined as a
-    string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
-    subdomains of those domains. Eventually, it should be possible to support
-    any MediaWiki site, regardless of domain name.
+    Determines if the given string contains a valid site name, defined as a string
+    ending with 'wikipedia.org' or 'wikimedia.org'. This covers all subdomains of those
+    domains. Eventually, it should be possible to support any MediaWiki site, regardless
+    of domain name.
 
+    :param search_string: string to validate
+    :type search_string: str
     :returns: a regex Match object if there's a match or None otherwise
+    :rtype: re.Match
     """
     site_regex = re.compile(r"wiki[mp]edia\.org$", re.I)
     return site_regex.search(search_string)
@@ -50,10 +56,12 @@ def valid_site(search_string):
 
 def verify_hash(filename):
     """
-    Calculates the SHA1 hash of the given file for comparison with a known
-    value.
+    Calculates the SHA1 hash of the given file for comparison with a known value.
 
+    :param filename: name of the file to calculate a hash for
+    :type filename: str
     :return: hash digest
+    :rtype: str
     """
     hasher = hashlib.sha1()  # noqa: S324
     with open(filename, "rb") as dl:
diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py
index 8c067e0..c470b46 100644
--- a/src/wikiget/wikiget.py
+++ b/src/wikiget/wikiget.py
@@ -111,10 +111,7 @@ def construct_parser():
     return parser
 
 
-def main():
-    parser = construct_parser()
-    args = parser.parse_args()
-
+def configure_logging(args):
     loglevel = logging.WARNING
     if args.verbose >= wikiget.VERY_VERBOSE:
         # this includes API and library messages
@@ -147,6 +144,51 @@ def main():
         # log only to console
         logging.basicConfig(level=loglevel, format=log_format)
 
+
+def batch_download(args):
+    input_file = args.FILE
+    dl_list = []
+
+    logging.info(f"Using batch file '{input_file}'.")
+
+    try:
+        fd = open(input_file)
+    except OSError as e:
+        logging.error("File could not be read. The following error was encountered:")
+        logging.error(e)
+        sys.exit(1)
+    else:
+        with fd:
+            # store file contents in memory in case something happens to the file
+            # while we're downloading
+            for _, line in enumerate(fd):
+                dl_list.append(line)
+
+    # TODO: validate file contents before download process starts
+    with ThreadPoolExecutor(
+        max_workers=args.threads,
+        thread_name_prefix="download",
+    ) as executor:
+        futures = []
+        for line_num, line in enumerate(dl_list, start=1):
+            url = line.strip()
+            # keep track of batch file line numbers for debugging/logging purposes
+            logging.info(f"Downloading '{url}' at line {line_num}:")
+            file = prep_download(url, args)
+            future = executor.submit(download, file, args)
+            futures.append(future)
+        # wait for downloads to finish
+        for future in futures:
+            future.result()
+
+
+def main():
+    # setup
+    parser = construct_parser()
+    args = parser.parse_args()
+
+    configure_logging(args)
+
     # log events are appended to the file if it already exists, so note the start of a
     # new download session
     logging.info(f"Starting download session using wikiget {wikiget.wikiget_version}")
@@ -154,42 +196,7 @@ def main():
 
     if args.batch:
         # batch download mode
-        input_file = args.FILE
-        dl_list = []
-
-        logging.info(f"Using batch file '{input_file}'.")
-
-        try:
-            fd = open(input_file)
-        except OSError as e:
-            logging.error(
-                "File could not be read. The following error was encountered:"
-            )
-            logging.error(e)
-            sys.exit(1)
-        else:
-            with fd:
-                # store file contents in memory in case something happens to the file
-                # while we're downloading
-                for _, line in enumerate(fd):
-                    dl_list.append(line)
-
-        # TODO: validate file contents before download process starts
-        with ThreadPoolExecutor(
-            max_workers=args.threads,
-            thread_name_prefix="download",
-        ) as executor:
-            futures = []
-            for line_num, line in enumerate(dl_list, start=1):
-                url = line.strip()
-                # keep track of batch file line numbers for debugging/logging purposes
-                logging.info(f"Downloading '{url}' at line {line_num}:")
-                file = prep_download(url, args)
-                future = executor.submit(download, file, args)
-                futures.append(future)
-            # wait for downloads to finish
-            for future in futures:
-                future.result()
+        batch_download(args)
     else:
         # single download mode
         file = prep_download(args.FILE, args)
```
