| | | |
|---|---|---|
| author | Cody Logan <clpo13@gmail.com> | 2019-09-26 16:03:44 -0700 |
| committer | Cody Logan <clpo13@gmail.com> | 2019-09-26 16:03:44 -0700 |
| commit | cbf64a55ecbbc38304bc2def8c9b96d62332ba62 | |
| tree | 56d7e6ecf2d578eed8fb7a733ba676100e774cda | |
| parent | f34995d4547357bc90157d81e8445f72f6dada7f | |
Add batch file functionality for downloading multiple files (tag: v0.2.0)
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 15 |
| -rw-r--r-- | test/test_wikiget.py | 6 |
| -rw-r--r-- | wikiget/version.py | 2 |
| -rw-r--r-- | wikiget/wikiget.py | 60 |

4 files changed, 59 insertions, 24 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Requires Python 2.7 or 3.5+. Install with `pip install --user wikiget`.
 
 ## Usage
 
-`wikiget [-h] [-V] [-q | -v] [-f] [--site SITE] [-o OUTPUT] FILE`
+`wikiget [-h] [-V] [-q | -v] [-f] [-a] [--site SITE] [-o OUTPUT] FILE`
 
 If `FILE` is in the form `File:Example.jpg` or `Example.jpg`, it will be fetched
 from the default site, which is "en.wikipedia.org". If it's the fully-qualified
@@ -32,6 +32,11 @@ By default, the program won't overwrite existing files with the same name as
 the target, but this can be forced with `-f` or `--force`. Additionally, the
 file can be downloaded to a different name with `-o`.
 
+Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, `FILE`
+will be treated as an input file containing multiple files to download, one filename
+or URL per line. If an error is encountered, execution stops immediately and the
+offending filename is printed.
+
 ### Examples
 
 ```bash
@@ -42,9 +47,9 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg
 
 ## Future plans
 
-- batch download categories, user uploads, or from a text file
+- batch download by (Commons) category or user uploads
 - download from any MediaWiki-powered site, not just Wikimedia projects
-- download Wikipedia articles, in plain text, wikitext, or other formats
+- maybe: download Wikipedia articles, in plain text, wikitext, or other formats
 
 ## Contributing
@@ -68,8 +73,8 @@ python3 -m venv venv
 To activate the virtual environment, use one of the following commands:
 
 ```bash
-source venv/bin/activate # Linux and macOS
-.\venv\Scripts\activate # Windows
+source venv/bin/activate # Linux and macOS; activate.csh and activate.fish are also available
+.\venv\Scripts\activate # Windows (Command Prompt or PowerShell)
 ```
 
 Then run `pip install -e .` to invoke an
diff --git a/test/test_wikiget.py b/test/test_wikiget.py
index 913b894..373fa25 100644
--- a/test/test_wikiget.py
+++ b/test/test_wikiget.py
@@ -81,10 +81,12 @@ def test_verify_hash():
     file_sha1 = "8843d7f92416211de9ebb963ff4ce28125932878"
 
     try:
-        with open(file_name, "w") as dl:
-            dl.write(file_contents)
+        dl = open(file_name, "w")
     except PermissionError:
         pytest.skip("need write access to create test file")
+    else:
+        with dl:
+            dl.write(file_contents)
 
     assert wikiget.verify_hash(file_name) == file_sha1
diff --git a/wikiget/version.py b/wikiget/version.py
index 262f3cb..5b1bb55 100644
--- a/wikiget/version.py
+++ b/wikiget/version.py
@@ -4,4 +4,4 @@
 
 """Sets the program version in setup.py and on the command line."""
 
-__version__ = "0.1.6"
+__version__ = "0.2.0"
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 9aaacf4..04d6f3e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -27,6 +27,9 @@ from tqdm import tqdm
 from wikiget.version import __version__
 
 BLOCKSIZE = 65536
+DEFAULT_SITE = "en.wikipedia.org"
+USER_AGENT = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
+             "mwclient/{}".format(__version__, mwclient_version)
 
 
 def main():
@@ -34,9 +37,6 @@ def main():
     Main entry point for console script. Automatically compiled by setuptools
     when installed with `pip install` or `python setup.py install`.
     """
-    default_site = "en.wikipedia.org"
-    user_agent = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
-                 "mwclient/{}".format(__version__, mwclient_version)
 
     parser = argparse.ArgumentParser(description="""
     A tool for downloading files from MediaWiki sites
@@ -63,9 +63,12 @@ def main():
                         action="count", default=0)
     parser.add_argument("-f", "--force", help="force overwriting existing files",
                         action="store_true")
-    parser.add_argument("-s", "--site", default=default_site,
+    parser.add_argument("-s", "--site", default=DEFAULT_SITE,
                         help="MediaWiki site to download from (default: %(default)s)")
     parser.add_argument("-o", "--output", help="write download to OUTPUT")
+    parser.add_argument("-a", "--batch", help="treat FILE as a textfile containing multiple files to download, one URL or filename per line",
+                        action="store_true")
+
     args = parser.parse_args()
 
     # print API and debug messages in verbose mode
@@ -74,16 +77,39 @@ def main():
     elif args.verbose >= 1:
         logging.basicConfig(level=logging.WARNING)
 
-    url = urlparse(args.FILE)
+    if args.batch:
+        # batch download mode
+        input_file = args.FILE
+        if args.verbose >= 1:
+            print("Info: using batch file '{}'".format(input_file))
+        try:
+            fd = open(input_file, "r")
+        except IOError as e:
+            print("File could not be read. The following error was encountered:")
+            print(e)
+            sys.exit(1)
+        else:
+            with fd:
+                for _, line in enumerate(fd):
+                    line = line.strip()
+                    download(line, args)
+    else:
+        # single download mode
+        dl = args.FILE
+        download(dl, args)
+
+
+def download(dl, args):
+    url = urlparse(dl)
     if url.netloc:
         filename = url.path
         site_name = url.netloc
-        if args.site is not default_site and not args.quiet:
+        if args.site is not DEFAULT_SITE and not args.quiet:
             # this will work even if the user specifies 'en.wikipedia.org'
             print("Warning: target is a URL, ignoring site specified with --site")
     else:
-        filename = args.FILE
+        filename = dl
         site_name = args.site
 
     file_match = valid_file(filename)
@@ -114,11 +140,11 @@ def main():
     dest = args.output or filename
 
     if args.verbose >= 2:
-        print("User agent: {}".format(user_agent))
+        print("User agent: {}".format(USER_AGENT))
 
     # connect to site and identify ourselves
     try:
-        site = Site(site_name, clients_useragent=user_agent)
+        site = Site(site_name, clients_useragent=USER_AGENT)
     except ConnectionError:
         # usually this means there is no such site, or there's no network connection
         print("Error: couldn't connect to specified site.")
@@ -150,19 +176,21 @@ def main():
            print("File '{}' already exists, skipping download (use -f to ignore)".format(dest))
         else:
             try:
+                fd = open(dest, "wb")
+            except IOError as e:
+                print("File could not be written. The following error was encountered:")
+                print(e)
+                sys.exit(1)
+            else:
                 # download the file
                 with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024) as progress_bar:
-                    with open(dest, "wb") as fd:
+                    with fd:
                         res = site.connection.get(file_url, stream=True)
                         progress_bar.set_postfix(file=dest, refresh=False)
                         for chunk in res.iter_content(1024):
                             fd.write(chunk)
                             progress_bar.update(len(chunk))
-            except IOError as e:
-                print("File could not be written. The following error was encountered:")
-                print(e)
-                sys.exit(1)
 
         # verify file integrity and optionally print details
         dl_sha1 = verify_hash(dest)
@@ -173,14 +201,14 @@ def main():
         if dl_sha1 == file_sha1:
             if args.verbose >= 1:
                 print("Info: hashes match!")
-            sys.exit(0)
+            # at this point, we've successfully downloaded the file
         else:
             print("Error: hash mismatch! Downloaded file may be corrupt.")
             sys.exit(1)
 
     else:
         # no file information returned
-        print("Target does not appear to be a valid file.")
+        print("Target '{}' does not appear to be a valid file.".format(filename))
         sys.exit(1)
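For context, the batch mode introduced here is driven entirely from the command line. A minimal usage sketch, assuming a hand-written list named `batch.txt` (that filename is only an example, not part of this commit):

```bash
# batch.txt holds one filename or URL per line, as described in the README
# addition above, e.g.:
#   File:Example.jpg
#   https://en.wikipedia.org/wiki/File:Example.jpg

# download everything listed in batch.txt
wikiget -a batch.txt

# single-file mode is unchanged
wikiget File:Example.jpg
```

As the README text added in this commit notes, an error on any entry stops the run immediately and prints the offending filename.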
