From 630541499a58f98c55d5cc372d21e745c106d250 Mon Sep 17 00:00:00 2001
From: Cody Logan
Date: Fri, 13 Oct 2023 12:24:13 -0700
Subject: Refactor parsing logic and revise exception handling

---
 src/wikiget/dl.py         | 58 +++++++++++++--------------------
 src/wikiget/exceptions.py | 20 ++++++++++++++
 src/wikiget/parse.py      | 54 +++++++++++++++++++++++++++++++++++++
 src/wikiget/wikiget.py    | 12 +++++++--
 tests/test_dl.py          | 69 +++++++++++++----------------------------------
 tests/test_parse.py       | 60 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 180 insertions(+), 93 deletions(-)
 create mode 100644 src/wikiget/exceptions.py
 create mode 100644 src/wikiget/parse.py
 create mode 100644 tests/test_parse.py

diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py
index 2b2befa..50b7460 100644
--- a/src/wikiget/dl.py
+++ b/src/wikiget/dl.py
@@ -18,46 +18,16 @@
 import logging
 import os
 import sys
-from urllib.parse import unquote, urlparse
 
 from mwclient import APIError, InvalidResponse, LoginError, Site
 from requests import ConnectionError, HTTPError
 from tqdm import tqdm
 
 import wikiget
+from wikiget.exceptions import ParseError
 from wikiget.file import File
-from wikiget.validations import valid_file, verify_hash
-
-
-def get_dest(dl, args):
-    url = urlparse(dl)
-
-    if url.netloc:
-        filename = url.path
-        site_name = url.netloc
-        if args.site is not wikiget.DEFAULT_SITE:
-            # this will work even if the user specifies 'commons.wikimedia.org'
-            logging.warning("target is a URL, ignoring site specified with --site")
-    else:
-        filename = dl
-        site_name = args.site
-
-    file_match = valid_file(filename)
-
-    # check if this is a valid file
-    if file_match and file_match.group(1):
-        # has File:/Image: prefix and extension
-        filename = file_match.group(2)
-    else:
-        # no file extension and/or prefix, probably an article
-        logging.error(f"Could not parse input '{filename}' as a file.")
-        sys.exit(1)
-
-    filename = unquote(filename)  # remove URL encoding for special characters
-
-    dest = args.output or filename
-
-    return filename, dest, site_name
+from wikiget.parse import get_dest
+from wikiget.validations import verify_hash
 
 
 def query_api(filename, site_name, args):
@@ -98,8 +68,7 @@ def query_api(filename, site_name, args):
         # an API error at this point likely means access is denied, which could happen
         # with a private wiki
         logging.error(
-            "Access denied. Try providing credentials with "
-            "--username and --password."
+            "Access denied. Try providing credentials with --username and --password."
) logging.debug("Full error message:") for i in e.args: @@ -110,7 +79,10 @@ def query_api(filename, site_name, args): def prep_download(dl, args): - filename, dest, site_name = get_dest(dl, args) + try: + filename, dest, site_name = get_dest(dl, args) + except ParseError: + raise file = File(filename, dest) file.object, file.site = query_api(file.name, site_name, args) return file @@ -136,7 +108,7 @@ def download(f, args): if os.path.isfile(dest) and not args.force: logging.warning( - f"File '{dest}' already exists, skipping download (use -f to ignore)" + f"File '{dest}' already exists, skipping download (use -f to force)" ) else: try: @@ -167,19 +139,25 @@ def download(f, args): fd.write(chunk) progress_bar.update(len(chunk)) - # verify file integrity and optionally print details + # verify file integrity and log details dl_sha1 = verify_hash(dest) - logging.info(f"Downloaded file SHA1 is {dl_sha1}") - logging.info(f"Server file SHA1 is {file_sha1}") + logging.info(f"Remote file SHA1 is {file_sha1}") + logging.info(f"Local file SHA1 is {dl_sha1}") if dl_sha1 == file_sha1: logging.info("Hashes match!") # at this point, we've successfully downloaded the file + success_log = f"'{filename}' downloaded" + if args.output: + success_log += f" to '{dest}'" + logging.info(success_log) else: logging.error("Hash mismatch! Downloaded file may be corrupt.") + # TODO: log but don't quit while in batch mode sys.exit(1) else: # no file information returned logging.error(f"Target '{filename}' does not appear to be a valid file.") + # TODO: log but don't quit while in batch mode sys.exit(1) diff --git a/src/wikiget/exceptions.py b/src/wikiget/exceptions.py new file mode 100644 index 0000000..94ed6b2 --- /dev/null +++ b/src/wikiget/exceptions.py @@ -0,0 +1,20 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + + +class ParseError(Exception): + pass diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py new file mode 100644 index 0000000..09c0767 --- /dev/null +++ b/src/wikiget/parse.py @@ -0,0 +1,54 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . 
+
+import logging
+from urllib.parse import unquote, urlparse
+
+import wikiget
+from wikiget.exceptions import ParseError
+from wikiget.validations import valid_file
+
+
+def get_dest(dl, args):
+    url = urlparse(dl)
+
+    if url.netloc:
+        filename = url.path
+        site_name = url.netloc
+        if args.site is not wikiget.DEFAULT_SITE:
+            # this will work even if the user specifies 'commons.wikimedia.org'
+            logging.warning("target is a URL, ignoring site specified with --site")
+    else:
+        filename = dl
+        site_name = args.site
+
+    file_match = valid_file(filename)
+
+    # check if this is a valid file
+    if file_match and file_match.group(1):
+        # has File:/Image: prefix and extension
+        filename = file_match.group(2)
+    else:
+        # no file extension and/or prefix, probably an article
+        msg = f"Could not parse input '{filename}' as a file"
+        raise ParseError(msg)
+
+    filename = unquote(filename)  # remove URL encoding for special characters
+
+    dest = args.output or filename
+
+    return filename, dest, site_name
diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py
index fba9509..68e0233 100644
--- a/src/wikiget/wikiget.py
+++ b/src/wikiget/wikiget.py
@@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 import wikiget
 from wikiget.dl import download, prep_download
+from wikiget.exceptions import ParseError
 
 
 def construct_parser():
@@ -173,7 +174,10 @@ def batch_download(args):
     for line_num, line in dl_list.items():
         # keep track of batch file line numbers for debugging/logging purposes
         logging.info(f"Downloading '{line}' at line {line_num}")
-        file = prep_download(line, args)
+        try:
+            file = prep_download(line, args)
+        except ParseError as e:
+            logging.warning(f"{e} (line {line_num})")
         future = executor.submit(download, file, args)
         futures.append(future)
     # wait for downloads to finish
@@ -197,5 +201,9 @@ def main():
         batch_download(args)
     else:
         # single download mode
-        file = prep_download(args.FILE, args)
+        try:
+            file = prep_download(args.FILE, args)
+        except ParseError as e:
+            logging.error(e)
+            sys.exit(1)
         download(file, args)
diff --git a/tests/test_dl.py b/tests/test_dl.py
index abf8763..fc68733 100644
--- a/tests/test_dl.py
+++ b/tests/test_dl.py
@@ -15,65 +15,32 @@
 # You should have received a copy of the GNU General Public License
 # along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
 
+import logging
+
 import pytest
 
-from wikiget.dl import get_dest
+from wikiget import USER_AGENT
+from wikiget.dl import prep_download, query_api
 from wikiget.wikiget import construct_parser
 
 
-class TestGetDest:
+# TODO: don't hit the actual API when doing tests
+@pytest.mark.skip
+class TestQueryApi:
     parser = construct_parser()
 
-    def test_get_dest_with_filename(self):
+    def test_query_api(self, caplog):
+        caplog.set_level(logging.DEBUG)
         args = self.parser.parse_args(["File:Example.jpg"])
-        filename, dest, site_name = get_dest(args.FILE, args)
-        assert filename == "Example.jpg"
-        assert dest == "Example.jpg"
-        assert site_name == "commons.wikimedia.org"
-
-    def test_get_dest_with_url(self):
-        args = self.parser.parse_args(
-            [
-                "https://en.wikipedia.org/wiki/File:Example.jpg",
-            ]
-        )
-        filename, dest, site_name = get_dest(args.FILE, args)
-        assert filename == "Example.jpg"
-        assert dest == "Example.jpg"
-        assert site_name == "en.wikipedia.org"
+        file, site = query_api("Example.jpg", "commons.wikimedia.org", args)
+        assert USER_AGENT in caplog.text
 
-    def test_get_dest_with_bad_filename(self):
-        args = self.parser.parse_args(["Example.jpg"])
-        with pytest.raises(SystemExit):
-            filename, dest, site_name = get_dest(args.FILE, args)
-
-    def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture):
-        args = self.parser.parse_args(
-            [
-                "https://commons.wikimedia.org/wiki/File:Example.jpg",
-                "--site",
-                "commons.wikimedia.org",
-            ]
-        )
-        filename, dest, site_name = get_dest(args.FILE, args)
-        assert "target is a URL, ignoring site specified with --site" in caplog.text
 
+@pytest.mark.skip
+class TestPrepDownload:
+    parser = construct_parser()
 
-# TODO: don't hit the actual API when doing tests
-# class TestQueryApi:
-#     parser = construct_parser()
-#
-#     def test_query_api(self, caplog):
-#         caplog.set_level(logging.DEBUG)
-#         args = self.parser.parse_args(["File:Example.jpg"])
-#         file, site = query_api("Example.jpg", "commons.wikimedia.org", args)
-#         assert USER_AGENT in caplog.text
-#
-#
-# class TestPrepDownload():
-#     parser = construct_parser()
-#
-#     def test_prep_download(self):
-#         args = self.parser.parse_args(["File:Example.jpg"])
-#         file = prep_download(args.FILE, args)
-#         assert file is not None
+    def test_prep_download(self):
+        args = self.parser.parse_args(["File:Example.jpg"])
+        file = prep_download(args.FILE, args)
+        assert file is not None
diff --git a/tests/test_parse.py b/tests/test_parse.py
new file mode 100644
index 0000000..064b85c
--- /dev/null
+++ b/tests/test_parse.py
@@ -0,0 +1,60 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2023 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from wikiget.exceptions import ParseError
+from wikiget.parse import get_dest
+from wikiget.wikiget import construct_parser
+
+
+class TestGetDest:
+    parser = construct_parser()
+
+    def test_get_dest_with_filename(self):
+        args = self.parser.parse_args(["File:Example.jpg"])
+        filename, dest, site_name = get_dest(args.FILE, args)
+        assert filename == "Example.jpg"
+        assert dest == "Example.jpg"
+        assert site_name == "commons.wikimedia.org"
+
+    def test_get_dest_with_url(self):
+        args = self.parser.parse_args(
+            [
+                "https://en.wikipedia.org/wiki/File:Example.jpg",
+            ]
+        )
+        filename, dest, site_name = get_dest(args.FILE, args)
+        assert filename == "Example.jpg"
+        assert dest == "Example.jpg"
+        assert site_name == "en.wikipedia.org"
+
+    def test_get_dest_with_bad_filename(self):
+        args = self.parser.parse_args(["Example.jpg"])
+        with pytest.raises(ParseError):
+            filename, dest, site_name = get_dest(args.FILE, args)
+
+    def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture):
+        args = self.parser.parse_args(
+            [
+                "https://commons.wikimedia.org/wiki/File:Example.jpg",
+                "--site",
+                "commons.wikimedia.org",
+            ]
+        )
+        filename, dest, site_name = get_dest(args.FILE, args)
+        assert "target is a URL, ignoring site specified with --site" in caplog.text
-- 
cgit v1.2.3
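
The following is a minimal sketch of how the pieces introduced by this patch fit together; it simply mirrors the new test_parse.py tests, and the File:Example.jpg target and the printed values are illustrative only:

    from wikiget.exceptions import ParseError
    from wikiget.parse import get_dest
    from wikiget.wikiget import construct_parser

    # build an argparse namespace the same way the wikiget CLI does
    parser = construct_parser()
    args = parser.parse_args(["File:Example.jpg"])

    try:
        # get_dest() now raises ParseError instead of calling sys.exit(1),
        # so callers (batch vs. single download mode) choose how to fail
        filename, dest, site_name = get_dest(args.FILE, args)
    except ParseError as e:
        print(f"not a valid file target: {e}")
    else:
        # expected: Example.jpg Example.jpg commons.wikimedia.org
        print(filename, dest, site_name)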