From c9191bae98fbad1b2aa5a7cedaaac414a519c17c Mon Sep 17 00:00:00 2001 From: Cody Logan Date: Mon, 7 Jan 2019 11:03:22 -0800 Subject: Fix problem reading certain file names URLs with encoded non-ASCII characters and files with spaces can now be downloaded as expected. --- wikiget/wikiget.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'wikiget/wikiget.py') diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py index f70db1d..bbea06e 100644 --- a/wikiget/wikiget.py +++ b/wikiget/wikiget.py @@ -17,7 +17,7 @@ import logging import os import re import sys -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse from mwclient import InvalidResponse, Site, __ver__ as mwclient_version from requests import ConnectionError @@ -98,11 +98,15 @@ def main(): print("Downloading Wikipedia articles is not currently supported.", end="") if file_match and not file_match.group(1): # file extension detected, but no prefix + # TODO: no longer possible to get to this point print(" If this is a file, please add the 'File:' prefix.") else: print("\n", end="") sys.exit(1) + # remove URL encoding + filename = unquote(filename) + dest = args.output or filename if args.verbose >= 2: @@ -163,14 +167,14 @@ def main(): def valid_file(search_string): """ Determines if the given string contains a valid file name, defined as a string - ending with a '.' and at least one character, optionally beginning with 'File:' - or 'Image:', the standard file prefixes in MediaWiki. + ending with a '.' and at least one character, beginning with 'File:' or + 'Image:', the standard file prefixes in MediaWiki. :param search_string: string to validate :returns: a regex Match object if there's a match or None otherwise """ # second group could also restrict to file extensions with three or more - # letters with ([^/\s]+\.\w{3,}) - file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)?([^/\s]+\.\w+)$") + # letters with ([^/\r\n\t\f\v]+\.\w{3,}) + file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\r\n\t\f\v]+\.\w+)$") return file_regex.search(search_string) -- cgit v1.2.3