| | | |
|---|---|---|
| author | Cody Logan <clpo13@gmail.com> | 2019-09-26 16:03:44 -0700 |
| committer | Cody Logan <clpo13@gmail.com> | 2019-09-26 16:03:44 -0700 |
| commit | cbf64a55ecbbc38304bc2def8c9b96d62332ba62 | |
| tree | 56d7e6ecf2d578eed8fb7a733ba676100e774cda | |
| parent | f34995d4547357bc90157d81e8445f72f6dada7f | |
Add batch file functionality for downloading multiple files (tag: v0.2.0)
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 15 |
| -rw-r--r-- | test/test_wikiget.py | 6 |
| -rw-r--r-- | wikiget/version.py | 2 |
| -rw-r--r-- | wikiget/wikiget.py | 60 |

4 files changed, 59 insertions, 24 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Requires Python 2.7 or 3.5+. Install with `pip install --user wikiget`.
 
 ## Usage
 
-`wikiget [-h] [-V] [-q | -v] [-f] [--site SITE] [-o OUTPUT] FILE`
+`wikiget [-h] [-V] [-q | -v] [-f] [-a] [--site SITE] [-o OUTPUT] FILE`
 
 If `FILE` is in the form `File:Example.jpg` or `Example.jpg`, it will be fetched
 from the default site, which is "en.wikipedia.org". If it's the fully-qualified
@@ -32,6 +32,11 @@ By default, the program won't overwrite existing files with the same name as
 the target, but this can be forced with `-f` or `--force`. Additionally, the
 file can be downloaded to a different name with `-o`.
 
+Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, `FILE`
+will be treated as an input file containing multiple files to download, one filename
+or URL per line. If an error is encountered, execution stops immediately and the
+offending filename is printed.
+
 ### Examples
 
 ```bash
@@ -42,9 +47,9 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg
 
 ## Future plans
 
-- batch download categories, user uploads, or from a text file
+- batch download by (Commons) category or user uploads
 - download from any MediaWiki-powered site, not just Wikimedia projects
-- download Wikipedia articles, in plain text, wikitext, or other formats
+- maybe: download Wikipedia articles, in plain text, wikitext, or other formats
 
 ## Contributing
@@ -68,8 +73,8 @@ python3 -m venv venv
 To activate the virtual environment, use one of the following commands:
 
 ```bash
-source venv/bin/activate # Linux and macOS
-.\venv\Scripts\activate # Windows
+source venv/bin/activate # Linux and macOS; activate.csh and activate.fish are also available
+.\venv\Scripts\activate # Windows (Command Prompt or PowerShell)
 ```
 
 Then run `pip install -e .` to invoke an
diff --git a/test/test_wikiget.py b/test/test_wikiget.py
index 913b894..373fa25 100644
--- a/test/test_wikiget.py
+++ b/test/test_wikiget.py
@@ -81,10 +81,12 @@ def test_verify_hash():
     file_sha1 = "8843d7f92416211de9ebb963ff4ce28125932878"
 
     try:
-        with open(file_name, "w") as dl:
-            dl.write(file_contents)
+        dl = open(file_name, "w")
     except PermissionError:
         pytest.skip("need write access to create test file")
+    else:
+        with dl:
+            dl.write(file_contents)
 
     assert wikiget.verify_hash(file_name) == file_sha1
diff --git a/wikiget/version.py b/wikiget/version.py
index 262f3cb..5b1bb55 100644
--- a/wikiget/version.py
+++ b/wikiget/version.py
@@ -4,4 +4,4 @@
 
 """Sets the program version in setup.py and on the command line."""
 
-__version__ = "0.1.6"
+__version__ = "0.2.0"
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 9aaacf4..04d6f3e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -27,6 +27,9 @@ from tqdm import tqdm
 from wikiget.version import __version__
 
 BLOCKSIZE = 65536
+DEFAULT_SITE = "en.wikipedia.org"
+USER_AGENT = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
+             "mwclient/{}".format(__version__, mwclient_version)
 
 
 def main():
@@ -34,9 +37,6 @@ def main():
     Main entry point for console script. Automatically compiled by setuptools
     when installed with `pip install` or `python setup.py install`.
     """
-    default_site = "en.wikipedia.org"
-    user_agent = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
-                 "mwclient/{}".format(__version__, mwclient_version)
 
     parser = argparse.ArgumentParser(description="""
     A tool for downloading files from MediaWiki sites
@@ -63,9 +63,12 @@ def main():
                         action="count", default=0)
     parser.add_argument("-f", "--force", help="force overwriting existing files",
                         action="store_true")
-    parser.add_argument("-s", "--site", default=default_site,
+    parser.add_argument("-s", "--site", default=DEFAULT_SITE,
                         help="MediaWiki site to download from (default: %(default)s)")
     parser.add_argument("-o", "--output", help="write download to OUTPUT")
+    parser.add_argument("-a", "--batch", help="treat FILE as a textfile containing multiple files to download, one URL or filename per line",
+                        action="store_true")
+
     args = parser.parse_args()
 
     # print API and debug messages in verbose mode
@@ -74,16 +77,39 @@ def main():
     elif args.verbose >= 1:
         logging.basicConfig(level=logging.WARNING)
 
-    url = urlparse(args.FILE)
+    if args.batch:
+        # batch download mode
+        input_file = args.FILE
+        if args.verbose >= 1:
+            print("Info: using batch file '{}'".format(input_file))
+        try:
+            fd = open(input_file, "r")
+        except IOError as e:
+            print("File could not be read. The following error was encountered:")
+            print(e)
+            sys.exit(1)
+        else:
+            with fd:
+                for _, line in enumerate(fd):
+                    line = line.strip()
+                    download(line, args)
+    else:
+        # single download mode
+        dl = args.FILE
+        download(dl, args)
+
+
+def download(dl, args):
+    url = urlparse(dl)
     if url.netloc:
         filename = url.path
         site_name = url.netloc
-        if args.site is not default_site and not args.quiet:
+        if args.site is not DEFAULT_SITE and not args.quiet:
             # this will work even if the user specifies 'en.wikipedia.org'
             print("Warning: target is a URL, ignoring site specified with --site")
     else:
-        filename = args.FILE
+        filename = dl
         site_name = args.site
 
     file_match = valid_file(filename)
@@ -114,11 +140,11 @@ def main():
     dest = args.output or filename
 
     if args.verbose >= 2:
-        print("User agent: {}".format(user_agent))
+        print("User agent: {}".format(USER_AGENT))
 
     # connect to site and identify ourselves
     try:
-        site = Site(site_name, clients_useragent=user_agent)
+        site = Site(site_name, clients_useragent=USER_AGENT)
     except ConnectionError:
         # usually this means there is no such site, or there's no network connection
         print("Error: couldn't connect to specified site.")
@@ -150,19 +176,21 @@ def main():
            print("File '{}' already exists, skipping download (use -f to ignore)".format(dest))
         else:
             try:
+                fd = open(dest, "wb")
+            except IOError as e:
+                print("File could not be written. The following error was encountered:")
+                print(e)
+                sys.exit(1)
+            else:
                 # download the file
                 with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024) as progress_bar:
-                    with open(dest, "wb") as fd:
+                    with fd:
                         res = site.connection.get(file_url, stream=True)
                         progress_bar.set_postfix(file=dest, refresh=False)
                         for chunk in res.iter_content(1024):
                             fd.write(chunk)
                             progress_bar.update(len(chunk))
-            except IOError as e:
-                print("File could not be written. The following error was encountered:")
-                print(e)
-                sys.exit(1)
 
         # verify file integrity and optionally print details
         dl_sha1 = verify_hash(dest)
@@ -173,14 +201,14 @@ def main():
         if dl_sha1 == file_sha1:
             if args.verbose >= 1:
                 print("Info: hashes match!")
-            sys.exit(0)
+            # at this point, we've successfully downloaded the file
         else:
             print("Error: hash mismatch! Downloaded file may be corrupt.")
             sys.exit(1)
 
     else:
         # no file information returned
-        print("Target does not appear to be a valid file.")
+        print("Target '{}' does not appear to be a valid file.".format(filename))
         sys.exit(1)
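For context, the batch mode introduced here is driven entirely from the command line. A minimal usage sketch, assuming a hand-written list named `batch.txt` (that filename is only an example, not part of this commit):

```bash
# batch.txt holds one filename or URL per line, as described in the README
# addition above, e.g.:
#   File:Example.jpg
#   https://en.wikipedia.org/wiki/File:Example.jpg

# download everything listed in batch.txt
wikiget -a batch.txt

# single-file mode is unchanged
wikiget File:Example.jpg
```

As the README text added in this commit notes, an error on any entry stops the run immediately and prints the offending filename.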
