From cbf64a55ecbbc38304bc2def8c9b96d62332ba62 Mon Sep 17 00:00:00 2001
From: Cody Logan <clpo13@gmail.com>
Date: Thu, 26 Sep 2019 16:03:44 -0700
Subject: Add batch file functionality for downloading multiple files

---
 wikiget/wikiget.py | 60 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 16 deletions(-)

(limited to 'wikiget/wikiget.py')

diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 9aaacf4..04d6f3e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -27,6 +27,9 @@ from tqdm import tqdm
 from wikiget.version import __version__
 
 BLOCKSIZE = 65536
+DEFAULT_SITE = "en.wikipedia.org"
+USER_AGENT = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
+             "mwclient/{}".format(__version__, mwclient_version)
 
 
 def main():
@@ -34,9 +37,6 @@ def main():
     Main entry point for console script. Automatically compiled by setuptools
     when installed with `pip install` or `python setup.py install`.
     """
-    default_site = "en.wikipedia.org"
-    user_agent = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
-                 "mwclient/{}".format(__version__, mwclient_version)
 
     parser = argparse.ArgumentParser(description="""
                                      A tool for downloading files from MediaWiki sites
@@ -63,9 +63,12 @@ def main():
                                 action="count", default=0)
     parser.add_argument("-f", "--force", help="force overwriting existing files",
                         action="store_true")
-    parser.add_argument("-s", "--site", default=default_site,
+    parser.add_argument("-s", "--site", default=DEFAULT_SITE,
                         help="MediaWiki site to download from (default: %(default)s)")
     parser.add_argument("-o", "--output", help="write download to OUTPUT")
+    parser.add_argument("-a", "--batch", help="treat FILE as a textfile containing multiple files to download, one URL or filename per line",
+                        action="store_true")
+
     args = parser.parse_args()
 
     # print API and debug messages in verbose mode
@@ -74,16 +77,39 @@ def main():
     elif args.verbose >= 1:
         logging.basicConfig(level=logging.WARNING)
 
-    url = urlparse(args.FILE)
+    if args.batch:
+        # batch download mode
+        input_file = args.FILE
+        if args.verbose >= 1:
+            print("Info: using batch file '{}'".format(input_file))
+        try:
+            fd = open(input_file, "r")
+        except IOError as e:
+            print("File could not be read. The following error was encountered:")
+            print(e)
+            sys.exit(1)
+        else:
+            with fd:
+                for _, line in enumerate(fd):
+                    line = line.strip()
+                    download(line, args)
+    else:
+        # single download mode
+        dl = args.FILE
+        download(dl, args)
+
+
+def download(dl, args):
+    url = urlparse(dl)
 
     if url.netloc:
         filename = url.path
         site_name = url.netloc
-        if args.site is not default_site and not args.quiet:
+        if args.site is not DEFAULT_SITE and not args.quiet:
             # this will work even if the user specifies 'en.wikipedia.org'
             print("Warning: target is a URL, ignoring site specified with --site")
     else:
-        filename = args.FILE
+        filename = dl
         site_name = args.site
 
     file_match = valid_file(filename)
@@ -114,11 +140,11 @@ def main():
     dest = args.output or filename
 
     if args.verbose >= 2:
-        print("User agent: {}".format(user_agent))
+        print("User agent: {}".format(USER_AGENT))
 
     # connect to site and identify ourselves
     try:
-        site = Site(site_name, clients_useragent=user_agent)
+        site = Site(site_name, clients_useragent=USER_AGENT)
     except ConnectionError:
         # usually this means there is no such site, or there's no network connection
         print("Error: couldn't connect to specified site.")
@@ -150,19 +176,21 @@ def main():
             print("File '{}' already exists, skipping download (use -f to ignore)".format(dest))
         else:
             try:
+                fd = open(dest, "wb")
+            except IOError as e:
+                print("File could not be written. The following error was encountered:")
+                print(e)
+                sys.exit(1)
+            else:
                 # download the file
                 with tqdm(total=file_size, unit="B",
                           unit_scale=True, unit_divisor=1024) as progress_bar:
-                    with open(dest, "wb") as fd:
+                    with fd:
                         res = site.connection.get(file_url, stream=True)
                         progress_bar.set_postfix(file=dest, refresh=False)
                         for chunk in res.iter_content(1024):
                             fd.write(chunk)
                             progress_bar.update(len(chunk))
-            except IOError as e:
-                print("File could not be written. The following error was encountered:")
-                print(e)
-                sys.exit(1)
 
             # verify file integrity and optionally print details
             dl_sha1 = verify_hash(dest)
@@ -173,14 +201,14 @@ def main():
             if dl_sha1 == file_sha1:
                 if args.verbose >= 1:
                     print("Info: hashes match!")
-                sys.exit(0)
+                # at this point, we've successfully downloaded the file
             else:
                 print("Error: hash mismatch! Downloaded file may be corrupt.")
                 sys.exit(1)
 
     else:
         # no file information returned
-        print("Target does not appear to be a valid file.")
+        print("Target '{}' does not appear to be a valid file.".format(filename))
         sys.exit(1)
 
 
-- 
cgit v1.2.3