about | summary | refs | log | tree | commit | diff
path: root/wikiget/wikiget.py
diff options
context:
space:
mode:
author Cody Logan <clpo13@gmail.com> 2019-01-07 11:03:22 -0800
committer Cody Logan <clpo13@gmail.com> 2019-01-07 11:03:22 -0800
commit c9191bae98fbad1b2aa5a7cedaaac414a519c17c (patch)
tree 3043964dd68e951e186a7d113713b2eaa9c0cce1 /wikiget/wikiget.py
parent 868b8fd6bddbe1608bc034588f5a6c69d24b2af6 (diff)
download wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.tar.gz
wikiget-c9191bae98fbad1b2aa5a7cedaaac414a519c17c.zip
Fix problem reading certain file names
URLs with encoded non-ASCII characters and files with spaces can now be downloaded as expected.
Diffstat (limited to 'wikiget/wikiget.py')
-rw-r--r-- wikiget/wikiget.py 14
1 files changed, 9 insertions, 5 deletions
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index f70db1d..bbea06e 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -17,7 +17,7 @@ import logging
import os
import re
import sys
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse
from mwclient import InvalidResponse, Site, __ver__ as mwclient_version
from requests import ConnectionError
@@ -98,11 +98,15 @@ def main():
print("Downloading Wikipedia articles is not currently supported.", end="")
if file_match and not file_match.group(1):
# file extension detected, but no prefix
+ # TODO: no longer possible to get to this point
print(" If this is a file, please add the 'File:' prefix.")
else:
print("\n", end="")
sys.exit(1)
+ # remove URL encoding
+ filename = unquote(filename)
+
dest = args.output or filename
if args.verbose >= 2:
@@ -163,14 +167,14 @@ def main():
def valid_file(search_string):
"""
Determines if the given string contains a valid file name, defined as a string
- ending with a '.' and at least one character, optionally beginning with 'File:'
- or 'Image:', the standard file prefixes in MediaWiki.
+ ending with a '.' and at least one character, beginning with 'File:' or
+ 'Image:', the standard file prefixes in MediaWiki.
:param search_string: string to validate
:returns: a regex Match object if there's a match or None otherwise
"""
# second group could also restrict to file extensions with three or more
- # letters with ([^/\s]+\.\w{3,})
- file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)?([^/\s]+\.\w+)$")
+ # letters with ([^/\r\n\t\f\v]+\.\w{3,})
+ file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\r\n\t\f\v]+\.\w+)$")
return file_regex.search(search_string)