Merge pull request #8 from clpo13/dev

Merge dev branch changes into master
author: clpo13 <cody@lokken.dev> 2023-10-20 16:57:32 -0700
committer: GitHub <noreply@github.com> 2023-10-20 16:57:32 -0700
commit: 8583862e2d16144f184db2e31dbc37dbe3464fed (patch)
tree: 4a0d9edb5301b26d9dbd22ceb307a7e3b1db4820 /src/wikiget/dl.py
parent: e274ccea56219c7d07c0e677d44c8122a699dcaf (diff)
parent: c1820026f97eaf671c29ab30f02879de0ac4df89 (diff)
download: wikiget-8583862e2d16144f184db2e31dbc37dbe3464fed.tar.gz wikiget-8583862e2d16144f184db2e31dbc37dbe3464fed.zip
1 files changed, 126 insertions, 93 deletions
diff --git a/src/wikiget/dl.py b/src/wikiget/dl.py
index 949f09e..5b5b43b 100644
--- a/src/wikiget/dl.py
+++ b/src/wikiget/dl.py
@@ -1,5 +1,5 @@
 # wikiget - CLI tool for downloading files from Wikimedia sites
-# Copyright (C) 2018-2021 Cody Logan and contributors
+# Copyright (C) 2018-2023 Cody Logan and contributors
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # Wikiget is free software: you can redistribute it and/or modify
@@ -15,125 +15,154 @@
 # You should have received a copy of the GNU General Public License
 # along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
 
+import logging
 import os
 import sys
-from urllib.parse import unquote, urlparse
+from argparse import Namespace
+from concurrent.futures import ThreadPoolExecutor
 
 from mwclient import APIError, InvalidResponse, LoginError, Site
+from mwclient.image import Image
 from requests import ConnectionError, HTTPError
 from tqdm import tqdm
 
 import wikiget
-from wikiget.validations import valid_file, verify_hash
+from wikiget.exceptions import ParseError
+from wikiget.file import File
+from wikiget.logging import FileLogAdapter
+from wikiget.parse import get_dest
+from wikiget.validations import verify_hash
 
 
-def download(dl, args):
-    url = urlparse(dl)
-
-    if url.netloc:
-        filename = url.path
-        site_name = url.netloc
-        if args.site is not wikiget.DEFAULT_SITE and not args.quiet:
-            # this will work even if the user specifies 'commons.wikimedia.org'
-            print("Warning: target is a URL, ignoring site specified with --site")
-    else:
-        filename = dl
-        site_name = args.site
-
-    file_match = valid_file(filename)
-
-    # check if this is a valid file
-    if file_match and file_match.group(1):
-        # has File:/Image: prefix and extension
-        filename = file_match.group(2)
-    else:
-        # no file extension and/or prefix, probably an article
-        print(f"Could not parse input '{filename}' as a file. ")
-        sys.exit(1)
-
-    filename = unquote(filename)  # remove URL encoding for special characters
-
-    dest = args.output or filename
-
-    if args.verbose >= wikiget.VERY_VERBOSE:
-        print(f"User agent: {wikiget.USER_AGENT}")
-
+def query_api(filename: str, site_name: str, args: Namespace) -> Image:
     # connect to site and identify ourselves
-    if args.verbose >= wikiget.STD_VERBOSE:
-        print(f"Site name: {site_name}")
+    logging.info(f"Connecting to {site_name}")
     try:
         site = Site(site_name, path=args.path, clients_useragent=wikiget.USER_AGENT)
         if args.username and args.password:
             site.login(args.username, args.password)
     except ConnectionError as e:
-        # usually this means there is no such site, or there's no network
-        # connection, though it could be a certificate problem
-        print("Error: couldn't connect to specified site.")
-        if args.verbose >= wikiget.VERY_VERBOSE:
-            print("Full error message:")
-            print(e)
-        sys.exit(1)
+        # usually this means there is no such site, or there's no network connection,
+        # though it could be a certificate problem
+        logging.error("Could not connect to specified site")
+        logging.debug(e)
+        raise
     except HTTPError as e:
         # most likely a 403 forbidden or 404 not found error for api.php
-        print(
-            "Error: couldn't find the specified wiki's api.php. "
-            "Check the value of --path."
+        logging.error(
+            "Could not find the specified wiki's api.php. Check the value of --path."
         )
-        if args.verbose >= wikiget.VERY_VERBOSE:
-            print("Full error message:")
-            print(e)
-        sys.exit(1)
+        logging.debug(e)
+        raise
     except (InvalidResponse, LoginError) as e:
-        # InvalidResponse: site exists, but we couldn't communicate with the
-        # API endpoint for some reason other than an HTTP error.
+        # InvalidResponse: site exists, but we couldn't communicate with the API
+        # endpoint for some reason other than an HTTP error.
         # LoginError: missing or invalid credentials
-        print(e)
-        sys.exit(1)
+        logging.error(e)
+        raise
 
     # get info about the target file
     try:
-        file = site.images[filename]
+        image = site.images[filename]
     except APIError as e:
-        # an API error at this point likely means access is denied,
-        # which could happen with a private wiki
-        print(
-            "Error: access denied. Try providing credentials with "
-            "--username and --password."
+        # an API error at this point likely means access is denied, which could happen
+        # with a private wiki
+        logging.error(
+            "Access denied. Try providing credentials with --username and --password."
         )
-        if args.verbose >= wikiget.VERY_VERBOSE:
-            print("Full error message:")
-            for i in e.args:
-                print(i)
-        sys.exit(1)
+        for i in e.args:
+            logging.debug(i)
+        raise
 
-    if file.imageinfo != {}:
-        # file exists either locally or at a common repository,
-        # like Wikimedia Commons
+    return image
+
+
+def prep_download(dl: str, args: Namespace) -> File:
+    file = get_dest(dl, args)
+    file.image = query_api(file.name, file.site, args)
+    return file
+
+
+def batch_download(args: Namespace) -> int:
+    input_file = args.FILE
+    dl_list = {}
+    errors = 0
+
+    logging.info(f"Using batch file '{input_file}'.")
+
+    try:
+        fd = open(input_file)
+    except OSError as e:
+        logging.error("File could not be read. The following error was encountered:")
+        logging.error(e)
+        sys.exit(1)
+    else:
+        with fd:
+            # read the file into memory and process each line as we go
+            for line_num, line in enumerate(fd, start=1):
+                line_s = line.strip()
+                # ignore blank lines and lines starting with "#" (for comments)
+                if line_s and not line_s.startswith("#"):
+                    dl_list[line_num] = line_s
+
+    # TODO: validate file contents before download process starts
+    with ThreadPoolExecutor(max_workers=args.threads) as executor:
+        futures = []
+        for line_num, line in dl_list.items():
+            # keep track of batch file line numbers for debugging/logging purposes
+            logging.info(f"Processing '{line}' at line {line_num}")
+            try:
+                file = prep_download(line, args)
+            except ParseError as e:
+                logging.warning(f"{e} (line {line_num})")
+                errors += 1
+                continue
+            except (ConnectionError, HTTPError, InvalidResponse, LoginError, APIError):
+                logging.warning(
+                    f"Unable to download '{line}' (line {line_num}) due to an error"
+                )
+                errors += 1
+                continue
+            future = executor.submit(download, file, args)
+            futures.append(future)
+        # wait for downloads to finish
+        for future in futures:
+            errors += future.result()
+    return errors
+
+
+def download(f: File, args: Namespace) -> int:
+    file = f.image
+    filename = f.name
+    dest = f.dest
+    site = file.site
+
+    errors = 0
+
+    logger = logging.getLogger("")
+    adapter = FileLogAdapter(logger, {"filename": filename})
+
+    if file.exists:
+        # file exists either locally or at a common repository, like Wikimedia Commons
         file_url = file.imageinfo["url"]
         file_size = file.imageinfo["size"]
         file_sha1 = file.imageinfo["sha1"]
 
-        if args.verbose >= wikiget.STD_VERBOSE:
-            print(
-                f"Info: downloading '{filename}' "
-                f"({file_size} bytes) from {site.host}",
-                end="",
-            )
-            if args.output:
-                print(f" to '{dest}'")
-            else:
-                print("\n", end="")
-            print(f"Info: {file_url}")
+        filename_log = f"Downloading '{filename}' ({file_size} bytes) from {site.host}"
+        if args.output:
+            filename_log += f" to '{dest}'"
+        adapter.info(filename_log)
+        adapter.info(f"{file_url}")
 
         if os.path.isfile(dest) and not args.force:
-            print(f"File '{dest}' already exists, skipping download (use -f to ignore)")
+            adapter.warning("File already exists, skipping download (use -f to force)")
+            errors += 1
         else:
             try:
                 fd = open(dest, "wb")
             except OSError as e:
-                print("File could not be written. The following error was encountered:")
-                print(e)
-                sys.exit(1)
+                adapter.error(f"File could not be written. {e}")
+                errors += 1
             else:
                 # download the file(s)
                 if args.verbose >= wikiget.STD_VERBOSE:
@@ -154,21 +183,25 @@ def download(dl, args):
                             fd.write(chunk)
                             progress_bar.update(len(chunk))
 
-            # verify file integrity and optionally print details
+            # verify file integrity and log details
             dl_sha1 = verify_hash(dest)
 
-            if args.verbose >= wikiget.STD_VERBOSE:
-                print(f"Info: downloaded file SHA1 is {dl_sha1}")
-                print(f"Info: server file SHA1 is {file_sha1}")
+            adapter.info(f"Remote file SHA1 is {file_sha1}")
+            adapter.info(f"Local file SHA1 is {dl_sha1}")
             if dl_sha1 == file_sha1:
-                if args.verbose >= wikiget.STD_VERBOSE:
-                    print("Info: hashes match!")
+                adapter.info("Hashes match!")
                 # at this point, we've successfully downloaded the file
+                success_log = f"'{filename}' downloaded"
+                if args.output:
+                    success_log += f" to '{dest}'"
+                adapter.info(success_log)
             else:
-                print("Error: hash mismatch! Downloaded file may be corrupt.")
-                sys.exit(1)
+                adapter.error("Hash mismatch! Downloaded file may be corrupt.")
+                errors += 1
 
     else:
         # no file information returned
-        print(f"Target '{filename}' does not appear to be a valid file.")
-        sys.exit(1)
+        adapter.warning("Target does not appear to be a valid file")
+        errors += 1
+
+    return errors
author	clpo13 <cody@lokken.dev>	2023-10-20 16:57:32 -0700
committer	GitHub <noreply@github.com>	2023-10-20 16:57:32 -0700
commit	8583862e2d16144f184db2e31dbc37dbe3464fed (patch)
tree	4a0d9edb5301b26d9dbd22ceb307a7e3b1db4820 /src/wikiget/dl.py
parent	e274ccea56219c7d07c0e677d44c8122a699dcaf (diff)
parent	c1820026f97eaf671c29ab30f02879de0ac4df89 (diff)
download	wikiget-8583862e2d16144f184db2e31dbc37dbe3464fed.tar.gz wikiget-8583862e2d16144f184db2e31dbc37dbe3464fed.zip