about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cody Logan <clpo13@gmail.com> 2019-09-26 16:03:44 -0700
committer Cody Logan <clpo13@gmail.com> 2019-09-26 16:03:44 -0700
commit cbf64a55ecbbc38304bc2def8c9b96d62332ba62 (patch)
tree 56d7e6ecf2d578eed8fb7a733ba676100e774cda
parent f34995d4547357bc90157d81e8445f72f6dada7f (diff)
download wikiget-cbf64a55ecbbc38304bc2def8c9b96d62332ba62.tar.gz
wikiget-cbf64a55ecbbc38304bc2def8c9b96d62332ba62.zip
Add batch file functionality for downloading multiple files [v0.2.0]
-rw-r--r-- README.md 15
-rw-r--r-- test/test_wikiget.py 6
-rw-r--r-- wikiget/version.py 2
-rw-r--r-- wikiget/wikiget.py 60
4 files changed, 59 insertions, 24 deletions
diff --git a/README.md b/README.md
index 554be0d..2230524 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Requires Python 2.7 or 3.5+. Install with `pip install --user wikiget`.
## Usage
-`wikiget [-h] [-V] [-q | -v] [-f] [--site SITE] [-o OUTPUT] FILE`
+`wikiget [-h] [-V] [-q | -v] [-f] [-a] [--site SITE] [-o OUTPUT] FILE`
If `FILE` is in the form `File:Example.jpg` or `Example.jpg`, it will be fetched
from the default site, which is "en.wikipedia.org". If it's the fully-qualified
@@ -32,6 +32,11 @@ By default, the program won't overwrite existing files with the same name as the
target, but this can be forced with `-f` or `--force`. Additionally, the file can
be downloaded to a different name with `-o`.
+Files can be batch downloaded with the `-a` or `--batch` flag. In this mode, `FILE`
+will be treated as an input file containing multiple files to download, one filename
+or URL per line. If an error is encountered, execution stops immediately and the
+offending filename is printed.
+
### Examples
```bash
@@ -42,9 +47,9 @@ wikiget https://en.wikipedia.org/wiki/File:Example.jpg -o test.jpg
## Future plans
-- batch download categories, user uploads, or from a text file
+- batch download by (Commons) category or user uploads
- download from any MediaWiki-powered site, not just Wikimedia projects
-- download Wikipedia articles, in plain text, wikitext, or other formats
+- maybe: download Wikipedia articles, in plain text, wikitext, or other formats
## Contributing
@@ -68,8 +73,8 @@ python3 -m venv venv
To activate the virtual environment, use one of the following commands:
```bash
-source venv/bin/activate # Linux and macOS
-.\venv\Scripts\activate # Windows
+source venv/bin/activate # Linux and macOS; activate.csh and activate.fish are also available
+.\venv\Scripts\activate # Windows (Command Prompt or PowerShell)
```
Then run `pip install -e .` to invoke an
diff --git a/test/test_wikiget.py b/test/test_wikiget.py
index 913b894..373fa25 100644
--- a/test/test_wikiget.py
+++ b/test/test_wikiget.py
@@ -81,10 +81,12 @@ def test_verify_hash():
file_sha1 = "8843d7f92416211de9ebb963ff4ce28125932878"
try:
- with open(file_name, "w") as dl:
- dl.write(file_contents)
+ dl = open(file_name, "w")
except PermissionError:
pytest.skip("need write access to create test file")
+ else:
+ with dl:
+ dl.write(file_contents)
assert wikiget.verify_hash(file_name) == file_sha1
diff --git a/wikiget/version.py b/wikiget/version.py
index 262f3cb..5b1bb55 100644
--- a/wikiget/version.py
+++ b/wikiget/version.py
@@ -4,4 +4,4 @@
"""Sets the program version in setup.py and on the command line."""
-__version__ = "0.1.6"
+__version__ = "0.2.0"
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 9aaacf4..04d6f3e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -27,6 +27,9 @@ from tqdm import tqdm
from wikiget.version import __version__
BLOCKSIZE = 65536
+DEFAULT_SITE = "en.wikipedia.org"
+USER_AGENT = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
+ "mwclient/{}".format(__version__, mwclient_version)
def main():
@@ -34,9 +37,6 @@ def main():
Main entry point for console script. Automatically compiled by setuptools
when installed with `pip install` or `python setup.py install`.
"""
- default_site = "en.wikipedia.org"
- user_agent = "wikiget/{} (https://github.com/clpo13/python-wikiget) " \
- "mwclient/{}".format(__version__, mwclient_version)
parser = argparse.ArgumentParser(description="""
A tool for downloading files from MediaWiki sites
@@ -63,9 +63,12 @@ def main():
action="count", default=0)
parser.add_argument("-f", "--force", help="force overwriting existing files",
action="store_true")
- parser.add_argument("-s", "--site", default=default_site,
+ parser.add_argument("-s", "--site", default=DEFAULT_SITE,
help="MediaWiki site to download from (default: %(default)s)")
parser.add_argument("-o", "--output", help="write download to OUTPUT")
+ parser.add_argument("-a", "--batch", help="treat FILE as a textfile containing multiple files to download, one URL or filename per line",
+ action="store_true")
+
args = parser.parse_args()
# print API and debug messages in verbose mode
@@ -74,16 +77,39 @@ def main():
elif args.verbose >= 1:
logging.basicConfig(level=logging.WARNING)
- url = urlparse(args.FILE)
+ if args.batch:
+ # batch download mode
+ input_file = args.FILE
+ if args.verbose >= 1:
+ print("Info: using batch file '{}'".format(input_file))
+ try:
+ fd = open(input_file, "r")
+ except IOError as e:
+ print("File could not be read. The following error was encountered:")
+ print(e)
+ sys.exit(1)
+ else:
+ with fd:
+ for _, line in enumerate(fd):
+ line = line.strip()
+ download(line, args)
+ else:
+ # single download mode
+ dl = args.FILE
+ download(dl, args)
+
+
+def download(dl, args):
+ url = urlparse(dl)
if url.netloc:
filename = url.path
site_name = url.netloc
- if args.site is not default_site and not args.quiet:
+ if args.site is not DEFAULT_SITE and not args.quiet:
# this will work even if the user specifies 'en.wikipedia.org'
print("Warning: target is a URL, ignoring site specified with --site")
else:
- filename = args.FILE
+ filename = dl
site_name = args.site
file_match = valid_file(filename)
@@ -114,11 +140,11 @@ def main():
dest = args.output or filename
if args.verbose >= 2:
- print("User agent: {}".format(user_agent))
+ print("User agent: {}".format(USER_AGENT))
# connect to site and identify ourselves
try:
- site = Site(site_name, clients_useragent=user_agent)
+ site = Site(site_name, clients_useragent=USER_AGENT)
except ConnectionError:
# usually this means there is no such site, or there's no network connection
print("Error: couldn't connect to specified site.")
@@ -150,19 +176,21 @@ def main():
print("File '{}' already exists, skipping download (use -f to ignore)".format(dest))
else:
try:
+ fd = open(dest, "wb")
+ except IOError as e:
+ print("File could not be written. The following error was encountered:")
+ print(e)
+ sys.exit(1)
+ else:
# download the file
with tqdm(total=file_size, unit="B",
unit_scale=True, unit_divisor=1024) as progress_bar:
- with open(dest, "wb") as fd:
+ with fd:
res = site.connection.get(file_url, stream=True)
progress_bar.set_postfix(file=dest, refresh=False)
for chunk in res.iter_content(1024):
fd.write(chunk)
progress_bar.update(len(chunk))
- except IOError as e:
- print("File could not be written. The following error was encountered:")
- print(e)
- sys.exit(1)
# verify file integrity and optionally print details
dl_sha1 = verify_hash(dest)
@@ -173,14 +201,14 @@ def main():
if dl_sha1 == file_sha1:
if args.verbose >= 1:
print("Info: hashes match!")
- sys.exit(0)
+ # at this point, we've successfully downloaded the file
else:
print("Error: hash mismatch! Downloaded file may be corrupt.")
sys.exit(1)
else:
# no file information returned
- print("Target does not appear to be a valid file.")
+ print("Target '{}' does not appear to be a valid file.".format(filename))
sys.exit(1)