From 630541499a58f98c55d5cc372d21e745c106d250 Mon Sep 17 00:00:00 2001
From: Cody Logan
Date: Fri, 13 Oct 2023 12:24:13 -0700
Subject: Refactor parsing logic and revise exception handling

---
 src/wikiget/dl.py         | 58 +++++++++++++--------------------
 src/wikiget/exceptions.py | 20 ++++++++++++++
 src/wikiget/parse.py      | 54 +++++++++++++++++++++++++++++++++++++
 src/wikiget/wikiget.py    | 12 +++++++--
 tests/test_dl.py          | 69 +++++++++++++----------------------------------
 tests/test_parse.py       | 60 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 180 insertions(+), 93 deletions(-)
 create mode 100644 src/wikiget/exceptions.py
 create mode 100644 src/wikiget/parse.py
 create mode 100644 tests/test_parse.py

diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py
index 2b2befa..50b7460 100644
--- a/src/wikiget/dl.py
+++ b/src/wikiget/dl.py
@@ -18,46 +18,16 @@
 import logging
 import os
 import sys
-from urllib.parse import unquote, urlparse
 
 from mwclient import APIError, InvalidResponse, LoginError, Site
 from requests import ConnectionError, HTTPError
 from tqdm import tqdm
 
 import wikiget
+from wikiget.exceptions import ParseError
 from wikiget.file import File
-from wikiget.validations import valid_file, verify_hash
-
-
-def get_dest(dl, args):
-    url = urlparse(dl)
-
-    if url.netloc:
-        filename = url.path
-        site_name = url.netloc
-        if args.site is not wikiget.DEFAULT_SITE:
-            # this will work even if the user specifies 'commons.wikimedia.org'
-            logging.warning("target is a URL, ignoring site specified with --site")
-    else:
-        filename = dl
-        site_name = args.site
-
-    file_match = valid_file(filename)
-
-    # check if this is a valid file
-    if file_match and file_match.group(1):
-        # has File:/Image: prefix and extension
-        filename = file_match.group(2)
-    else:
-        # no file extension and/or prefix, probably an article
-        logging.error(f"Could not parse input '{filename}' as a file.")
-        sys.exit(1)
-
-    filename = unquote(filename)  # remove URL encoding for special characters
-
-    dest = args.output or filename
-
-    return filename, dest, site_name
+from wikiget.parse import get_dest
+from wikiget.validations import verify_hash
 
 
 def query_api(filename, site_name, args):
@@ -98,8 +68,7 @@ def query_api(filename, site_name, args):
         # an API error at this point likely means access is denied, which could happen
         # with a private wiki
         logging.error(
-            "Access denied. Try providing credentials with "
-            "--username and --password."
+            "Access denied. Try providing credentials with --username and --password."
) logging.debug("Full error message:") for i in e.args: @@ -110,7 +79,10 @@ def query_api(filename, site_name, args): def prep_download(dl, args): - filename, dest, site_name = get_dest(dl, args) + try: + filename, dest, site_name = get_dest(dl, args) + except ParseError: + raise file = File(filename, dest) file.object, file.site = query_api(file.name, site_name, args) return file @@ -136,7 +108,7 @@ def download(f, args): if os.path.isfile(dest) and not args.force: logging.warning( - f"File '{dest}' already exists, skipping download (use -f to ignore)" + f"File '{dest}' already exists, skipping download (use -f to force)" ) else: try: @@ -167,19 +139,25 @@ def download(f, args): fd.write(chunk) progress_bar.update(len(chunk)) - # verify file integrity and optionally print details + # verify file integrity and log details dl_sha1 = verify_hash(dest) - logging.info(f"Downloaded file SHA1 is {dl_sha1}") - logging.info(f"Server file SHA1 is {file_sha1}") + logging.info(f"Remote file SHA1 is {file_sha1}") + logging.info(f"Local file SHA1 is {dl_sha1}") if dl_sha1 == file_sha1: logging.info("Hashes match!") # at this point, we've successfully downloaded the file + success_log = f"'{filename}' downloaded" + if args.output: + success_log += f" to '{dest}'" + logging.info(success_log) else: logging.error("Hash mismatch! Downloaded file may be corrupt.") + # TODO: log but don't quit while in batch mode sys.exit(1) else: # no file information returned logging.error(f"Target '{filename}' does not appear to be a valid file.") + # TODO: log but don't quit while in batch mode sys.exit(1) diff --git a/src/wikiget/exceptions.py b/src/wikiget/exceptions.py new file mode 100644 index 0000000..94ed6b2 --- /dev/null +++ b/src/wikiget/exceptions.py @@ -0,0 +1,20 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . + + +class ParseError(Exception): + pass diff --git a/src/wikiget/parse.py b/src/wikiget/parse.py new file mode 100644 index 0000000..09c0767 --- /dev/null +++ b/src/wikiget/parse.py @@ -0,0 +1,54 @@ +# wikiget - CLI tool for downloading files from Wikimedia sites +# Copyright (C) 2023 Cody Logan +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Wikiget is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Wikiget is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Wikiget. If not, see . 
+
+import logging
+from urllib.parse import unquote, urlparse
+
+import wikiget
+from wikiget.exceptions import ParseError
+from wikiget.validations import valid_file
+
+
+def get_dest(dl, args):
+    url = urlparse(dl)
+
+    if url.netloc:
+        filename = url.path
+        site_name = url.netloc
+        if args.site is not wikiget.DEFAULT_SITE:
+            # this will work even if the user specifies 'commons.wikimedia.org'
+            logging.warning("target is a URL, ignoring site specified with --site")
+    else:
+        filename = dl
+        site_name = args.site
+
+    file_match = valid_file(filename)
+
+    # check if this is a valid file
+    if file_match and file_match.group(1):
+        # has File:/Image: prefix and extension
+        filename = file_match.group(2)
+    else:
+        # no file extension and/or prefix, probably an article
+        msg = f"Could not parse input '{filename}' as a file"
+        raise ParseError(msg)
+
+    filename = unquote(filename)  # remove URL encoding for special characters
+
+    dest = args.output or filename
+
+    return filename, dest, site_name
diff --git a/src/wikiget/wikiget.py b/src/wikiget/wikiget.py
index fba9509..68e0233 100644
--- a/src/wikiget/wikiget.py
+++ b/src/wikiget/wikiget.py
@@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 import wikiget
 from wikiget.dl import download, prep_download
+from wikiget.exceptions import ParseError
 
 
 def construct_parser():
@@ -173,7 +174,10 @@ def batch_download(args):
     for line_num, line in dl_list.items():
         # keep track of batch file line numbers for debugging/logging purposes
         logging.info(f"Downloading '{line}' at line {line_num}")
-        file = prep_download(line, args)
+        try:
+            file = prep_download(line, args)
+        except ParseError as e:
+            logging.warning(f"{e} (line {line_num})")
         future = executor.submit(download, file, args)
         futures.append(future)
     # wait for downloads to finish
@@ -197,5 +201,9 @@ def main():
         batch_download(args)
     else:
         # single download mode
-        file = prep_download(args.FILE, args)
+        try:
+            file = prep_download(args.FILE, args)
+        except ParseError as e:
+            logging.error(e)
+            sys.exit(1)
         download(file, args)
diff --git a/tests/test_dl.py b/tests/test_dl.py
index abf8763..fc68733 100644
--- a/tests/test_dl.py
+++ b/tests/test_dl.py
@@ -15,65 +15,32 @@
 # You should have received a copy of the GNU General Public License
 # along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
 
+import logging
+
 import pytest
 
-from wikiget.dl import get_dest
+from wikiget import USER_AGENT
+from wikiget.dl import prep_download, query_api
 from wikiget.wikiget import construct_parser
 
 
-class TestGetDest:
+# TODO: don't hit the actual API when doing tests
+@pytest.mark.skip
+class TestQueryApi:
     parser = construct_parser()
 
-    def test_get_dest_with_filename(self):
+    def test_query_api(self, caplog):
+        caplog.set_level(logging.DEBUG)
         args = self.parser.parse_args(["File:Example.jpg"])
-        filename, dest, site_name = get_dest(args.FILE, args)
-        assert filename == "Example.jpg"
-        assert dest == "Example.jpg"
-        assert site_name == "commons.wikimedia.org"
-
-    def test_get_dest_with_url(self):
-        args = self.parser.parse_args(
-            [
-                "https://en.wikipedia.org/wiki/File:Example.jpg",
-            ]
-        )
-        filename, dest, site_name = get_dest(args.FILE, args)
-        assert filename == "Example.jpg"
-        assert dest == "Example.jpg"
-        assert site_name == "en.wikipedia.org"
+        file, site = query_api("Example.jpg", "commons.wikimedia.org", args)
+        assert USER_AGENT in caplog.text
 
-    def test_get_dest_with_bad_filename(self):
-        args = self.parser.parse_args(["Example.jpg"])
-        with pytest.raises(SystemExit):
-            filename, dest, site_name = get_dest(args.FILE, args)
-
-    def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture):
-        args = self.parser.parse_args(
-            [
-                "https://commons.wikimedia.org/wiki/File:Example.jpg",
-                "--site",
-                "commons.wikimedia.org",
-            ]
-        )
-        filename, dest, site_name = get_dest(args.FILE, args)
-        assert "target is a URL, ignoring site specified with --site" in caplog.text
 
+@pytest.mark.skip
+class TestPrepDownload:
+    parser = construct_parser()
 
-# TODO: don't hit the actual API when doing tests
-# class TestQueryApi:
-#     parser = construct_parser()
-#
-#     def test_query_api(self, caplog):
-#         caplog.set_level(logging.DEBUG)
-#         args = self.parser.parse_args(["File:Example.jpg"])
-#         file, site = query_api("Example.jpg", "commons.wikimedia.org", args)
-#         assert USER_AGENT in caplog.text
-#
-#
-# class TestPrepDownload():
-#     parser = construct_parser()
-#
-#     def test_prep_download(self):
-#         args = self.parser.parse_args(["File:Example.jpg"])
-#         file = prep_download(args.FILE, args)
-#         assert file is not None
+    def test_prep_download(self):
+        args = self.parser.parse_args(["File:Example.jpg"])
+        file = prep_download(args.FILE, args)
+        assert file is not None
diff --git a/tests/test_parse.py b/tests/test_parse.py
new file mode 100644
index 0000000..064b85c
--- /dev/null
+++ b/tests/test_parse.py
@@ -0,0 +1,60 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2023 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from wikiget.exceptions import ParseError
+from wikiget.parse import get_dest
+from wikiget.wikiget import construct_parser
+
+
+class TestGetDest:
+    parser = construct_parser()
+
+    def test_get_dest_with_filename(self):
+        args = self.parser.parse_args(["File:Example.jpg"])
+        filename, dest, site_name = get_dest(args.FILE, args)
+        assert filename == "Example.jpg"
+        assert dest == "Example.jpg"
+        assert site_name == "commons.wikimedia.org"
+
+    def test_get_dest_with_url(self):
+        args = self.parser.parse_args(
+            [
+                "https://en.wikipedia.org/wiki/File:Example.jpg",
+            ]
+        )
+        filename, dest, site_name = get_dest(args.FILE, args)
+        assert filename == "Example.jpg"
+        assert dest == "Example.jpg"
+        assert site_name == "en.wikipedia.org"
+
+    def test_get_dest_with_bad_filename(self):
+        args = self.parser.parse_args(["Example.jpg"])
+        with pytest.raises(ParseError):
+            filename, dest, site_name = get_dest(args.FILE, args)
+
+    def test_get_dest_with_different_site(self, caplog: pytest.LogCaptureFixture):
+        args = self.parser.parse_args(
+            [
+                "https://commons.wikimedia.org/wiki/File:Example.jpg",
+                "--site",
+                "commons.wikimedia.org",
+            ]
+        )
+        filename, dest, site_name = get_dest(args.FILE, args)
+        assert "target is a URL, ignoring site specified with --site" in caplog.text
-- 
cgit v1.2.3
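
The following is a minimal sketch of how the pieces introduced by this patch fit together; it simply mirrors the new test_parse.py tests, and the File:Example.jpg target and the printed values are illustrative only:

    from wikiget.exceptions import ParseError
    from wikiget.parse import get_dest
    from wikiget.wikiget import construct_parser

    # build an argparse namespace the same way the wikiget CLI does
    parser = construct_parser()
    args = parser.parse_args(["File:Example.jpg"])

    try:
        # get_dest() now raises ParseError instead of calling sys.exit(1),
        # so callers (batch vs. single download mode) choose how to fail
        filename, dest, site_name = get_dest(args.FILE, args)
    except ParseError as e:
        print(f"not a valid file target: {e}")
    else:
        # expected: Example.jpg Example.jpg commons.wikimedia.org
        print(filename, dest, site_name)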