diff options
| author | Cody Logan <clpo13@gmail.com> | 2019-01-07 11:03:22 -0800 |
|---|---|---|
| committer | Cody Logan <clpo13@gmail.com> | 2019-01-07 11:03:22 -0800 |
| commit | c9191bae98fbad1b2aa5a7cedaaac414a519c17c (patch) | |
| tree | 3043964dd68e951e186a7d113713b2eaa9c0cce1 | |
| parent | 868b8fd6bddbe1608bc034588f5a6c69d24b2af6 (diff) | |
| download | wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.tar.gz wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.zip | |
Fix problem reading certain file names
URLs with encoded non-ASCII characters and files with spaces can now be downloaded as expected.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | test/test_wikiget.py | 6 |
| -rw-r--r-- | wikiget/wikiget.py | 14 |
2 files changed, 13 insertions, 7 deletions
diff --git a/test/test_wikiget.py b/test/test_wikiget.py index 981d640..4200bd4 100644 --- a/test/test_wikiget.py +++ b/test/test_wikiget.py @@ -32,14 +32,16 @@ def test_file_regex(): def test_invalid_file_input(): - invalid_input = ["file:example", "example"] + invalid_input = ["file:example", "example.jpg", "Foo Bar.gif", + "Fil:Example.jpg"] for i in invalid_input: file_match = wikiget.valid_file(i) assert file_match is None def test_valid_file_input(): - valid_input = ["example.jpg", "file:example.jpg", "example.file-01.jpg"] + valid_input = ["Image:example.jpg", "file:example.jpg", "File:example.file-01.jpg", + "File:ß handwritten sample.gif"] for i in valid_input: file_match = wikiget.valid_file(i) assert file_match is not None diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py index f70db1d..bbea06e 100644 --- a/wikiget/wikiget.py +++ b/wikiget/wikiget.py @@ -17,7 +17,7 @@ import logging import os import re import sys -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse from mwclient import InvalidResponse, Site, __ver__ as mwclient_version from requests import ConnectionError @@ -98,11 +98,15 @@ def main(): print("Downloading Wikipedia articles is not currently supported.", end="") if file_match and not file_match.group(1): # file extension detected, but no prefix + # TODO: no longer possible to get to this point print(" If this is a file, please add the 'File:' prefix.") else: print("\n", end="") sys.exit(1) + # remove URL encoding + filename = unquote(filename) + dest = args.output or filename if args.verbose >= 2: @@ -163,14 +167,14 @@ def main(): def valid_file(search_string): """ Determines if the given string contains a valid file name, defined as a string - ending with a '.' and at least one character, optionally beginning with 'File:' - or 'Image:', the standard file prefixes in MediaWiki. + ending with a '.' and at least one character, beginning with 'File:' or + 'Image:', the standard file prefixes in MediaWiki. 
:param search_string: string to validate :returns: a regex Match object if there's a match or None otherwise """ # second group could also restrict to file extensions with three or more - # letters with ([^/\s]+\.\w{3,}) - file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)?([^/\s]+\.\w+)$") + # letters with ([^/\r\n\t\f\v]+\.\w{3,}) + file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\r\n\t\f\v]+\.\w+)$") return file_regex.search(search_string) |
