about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Cody Logan <clpo13@gmail.com> 2019-01-07 11:03:22 -0800
committer: Cody Logan <clpo13@gmail.com> 2019-01-07 11:03:22 -0800
commit: c9191bae98fbad1b2aa5a7cedaaac414a519c17c (patch)
tree: 3043964dd68e951e186a7d113713b2eaa9c0cce1
parent: 868b8fd6bddbe1608bc034588f5a6c69d24b2af6 (diff)
download: wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.tar.gz
wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.zip
Fix problem reading certain file names
URLs with encoded non-ASCII characters and files with spaces can now be downloaded as expected.
-rw-r--r-- test/test_wikiget.py | 6
-rw-r--r-- wikiget/wikiget.py | 14
2 files changed, 13 insertions, 7 deletions
diff --git a/test/test_wikiget.py b/test/test_wikiget.py
index 981d640..4200bd4 100644
--- a/test/test_wikiget.py
+++ b/test/test_wikiget.py
@@ -32,14 +32,16 @@ def test_file_regex():
def test_invalid_file_input():
- invalid_input = ["file:example", "example"]
+ invalid_input = ["file:example", "example.jpg", "Foo Bar.gif",
+ "Fil:Example.jpg"]
for i in invalid_input:
file_match = wikiget.valid_file(i)
assert file_match is None
def test_valid_file_input():
- valid_input = ["example.jpg", "file:example.jpg", "example.file-01.jpg"]
+ valid_input = ["Image:example.jpg", "file:example.jpg", "File:example.file-01.jpg",
+ "File:ß handwritten sample.gif"]
for i in valid_input:
file_match = wikiget.valid_file(i)
assert file_match is not None
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index f70db1d..bbea06e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -17,7 +17,7 @@ import logging
import os
import re
import sys
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse
from mwclient import InvalidResponse, Site, __ver__ as mwclient_version
from requests import ConnectionError
@@ -98,11 +98,15 @@ def main():
print("Downloading Wikipedia articles is not currently supported.", end="")
if file_match and not file_match.group(1):
# file extension detected, but no prefix
+ # TODO: no longer possible to get to this point
print(" If this is a file, please add the 'File:' prefix.")
else:
print("\n", end="")
sys.exit(1)
+ # remove URL encoding
+ filename = unquote(filename)
+
dest = args.output or filename
if args.verbose >= 2:
@@ -163,14 +167,14 @@ def main():
def valid_file(search_string):
"""
Determines if the given string contains a valid file name, defined as a string
- ending with a '.' and at least one character, optionally beginning with 'File:'
- or 'Image:', the standard file prefixes in MediaWiki.
+ ending with a '.' and at least one character, beginning with 'File:' or
+ 'Image:', the standard file prefixes in MediaWiki.
:param search_string: string to validate
:returns: a regex Match object if there's a match or None otherwise
"""
# second group could also restrict to file extensions with three or more
- # letters with ([^/\s]+\.\w{3,})
- file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)?([^/\s]+\.\w+)$")
+ # letters with ([^/\r\n\t\f\v]+\.\w{3,})
+ file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\r\n\t\f\v]+\.\w+)$")
return file_regex.search(search_string)