Fix problem reading certain file names

URLs with percent-encoded non-ASCII characters and file names containing spaces can now be downloaded as expected.
author: Cody Logan <clpo13@gmail.com> 2019-01-07 11:03:22 -0800
committer: Cody Logan <clpo13@gmail.com> 2019-01-07 11:03:22 -0800
commit: c9191bae98fbad1b2aa5a7cedaaac414a519c17c (patch)
tree: 3043964dd68e951e186a7d113713b2eaa9c0cce1
parent: 868b8fd6bddbe1608bc034588f5a6c69d24b2af6 (diff)
download: wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.tar.gz
wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.zip
2 files changed, 13 insertions, 7 deletions
diff --git a/test/test_wikiget.py b/test/test_wikiget.py
index 981d640..4200bd4 100644
--- a/test/test_wikiget.py
+++ b/test/test_wikiget.py
@@ -32,14 +32,16 @@ def test_file_regex():
 
 
 def test_invalid_file_input():
-    invalid_input = ["file:example", "example"]
+    invalid_input = ["file:example", "example.jpg", "Foo Bar.gif",
+                     "Fil:Example.jpg"]
     for i in invalid_input:
         file_match = wikiget.valid_file(i)
         assert file_match is None
 
 
 def test_valid_file_input():
-    valid_input = ["example.jpg", "file:example.jpg", "example.file-01.jpg"]
+    valid_input = ["Image:example.jpg", "file:example.jpg", "File:example.file-01.jpg",
+                   "File:ß handwritten sample.gif"]
     for i in valid_input:
         file_match = wikiget.valid_file(i)
         assert file_match is not None
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index f70db1d..bbea06e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -17,7 +17,7 @@ import logging
 import os
 import re
 import sys
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse
 
 from mwclient import InvalidResponse, Site, __ver__ as mwclient_version
 from requests import ConnectionError
@@ -98,11 +98,15 @@ def main():
         print("Downloading Wikipedia articles is not currently supported.", end="")
         if file_match and not file_match.group(1):
             # file extension detected, but no prefix
+            # TODO: no longer possible to get to this point
             print(" If this is a file, please add the 'File:' prefix.")
         else:
             print("\n", end="")
         sys.exit(1)
 
+    # remove URL encoding
+    filename = unquote(filename)
+
     dest = args.output or filename
 
     if args.verbose >= 2:
@@ -163,14 +167,14 @@ def main():
 def valid_file(search_string):
     """
     Determines if the given string contains a valid file name, defined as a string
-    ending with a '.' and at least one character, optionally beginning with 'File:'
-    or 'Image:', the standard file prefixes in MediaWiki.
+    ending with a '.' and at least one character, beginning with 'File:' or
+    'Image:', the standard file prefixes in MediaWiki.
     :param search_string: string to validate
     :returns: a regex Match object if there's a match or None otherwise
     """
     # second group could also restrict to file extensions with three or more
-    # letters with ([^/\s]+\.\w{3,})
-    file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)?([^/\s]+\.\w+)$")
+    # letters with ([^/\r\n\t\f\v]+\.\w{3,})
+    file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\r\n\t\f\v]+\.\w+)$")
     return file_regex.search(search_string)
author	Cody Logan <clpo13@gmail.com>	2019-01-07 11:03:22 -0800
committer	Cody Logan <clpo13@gmail.com>	2019-01-07 11:03:22 -0800
commit	c9191bae98fbad1b2aa5a7cedaaac414a519c17c (patch)
tree	3043964dd68e951e186a7d113713b2eaa9c0cce1
parent	868b8fd6bddbe1608bc034588f5a6c69d24b2af6 (diff)
download	wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.tar.gz wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.zip