Diffstat (limited to 'wikiget')
-rw-r--r--  wikiget/__init__.py    |  20
-rw-r--r--  wikiget/dl.py          | 153
-rw-r--r--  wikiget/validations.py |  64
-rw-r--r--  wikiget/version.py     |   4
-rw-r--r--  wikiget/wikiget.py     | 184
5 files changed, 241 insertions, 184 deletions
diff --git a/wikiget/__init__.py b/wikiget/__init__.py
index e69de29..253fcf8 100644
--- a/wikiget/__init__.py
+++ b/wikiget/__init__.py
@@ -0,0 +1,20 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+# set some global constants
+BLOCKSIZE = 65536
+DEFAULT_SITE = 'commons.wikimedia.org'
diff --git a/wikiget/dl.py b/wikiget/dl.py
new file mode 100644
index 0000000..b074269
--- /dev/null
+++ b/wikiget/dl.py
@@ -0,0 +1,153 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import sys
+from urllib.parse import unquote, urlparse
+
+from mwclient import InvalidResponse, Site, __version__ as mwclient_version
+from requests import ConnectionError
+from tqdm import tqdm
+
+from . import DEFAULT_SITE
+from .validations import valid_file, valid_site, verify_hash
+from .version import __version__
+
+USER_AGENT = 'wikiget/{} (https://github.com/clpo13/wikiget) ' \
+             'mwclient/{}'.format(__version__, mwclient_version)
+
+
+def download(dl, args):
+    url = urlparse(dl)
+
+    if url.netloc:
+        filename = url.path
+        site_name = url.netloc
+        if args.site is not DEFAULT_SITE and not args.quiet:
+            # this will work even if the user specifies 'commons.wikimedia.org'
+            print('Warning: target is a URL, '
+                  'ignoring site specified with --site')
+    else:
+        filename = dl
+        site_name = args.site
+
+    file_match = valid_file(filename)
+    site_match = valid_site(site_name)
+
+    # check for valid site parameter
+    if not site_match:
+        print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
+              'are currently supported.')
+        sys.exit(1)
+
+    # check if this is a valid file
+    if file_match and file_match.group(1):
+        # has File:/Image: prefix and extension
+        filename = file_match.group(2)
+    else:
+        # no file extension and/or prefix, probably an article
+        print('Downloading Wikipedia articles is not currently supported.',
+              end='')
+        if file_match and not file_match.group(1):
+            # file extension detected, but no prefix
+            # TODO: no longer possible to get to this point since
+            # file_match is None with no prefix
+            print(" If this is a file, please add the 'File:' prefix.")
+        else:
+            print('\n', end='')
+        sys.exit(1)
+
+    filename = unquote(filename)  # remove URL encoding for special characters
+
+    dest = args.output or filename
+
+    if args.verbose >= 2:
+        print('User agent: {}'.format(USER_AGENT))
+
+    # connect to site and identify ourselves
+    try:
+        site = Site(site_name, clients_useragent=USER_AGENT)
+    except ConnectionError:
+        # usually this means there is no such site,
+        # or there's no network connection
+        print("Error: couldn't connect to specified site.")
+        sys.exit(1)
+    except InvalidResponse as e:
+        # site exists, but we couldn't communicate with the API endpoint
+        print(e)
+        sys.exit(1)
+
+    # get info about the target file
+    file = site.images[filename]
+
+    if file.imageinfo != {}:
+        # file exists either locally or at Wikimedia Commons
+        file_url = file.imageinfo['url']
+        file_size = file.imageinfo['size']
+        file_sha1 = file.imageinfo['sha1']
+
+        if args.verbose >= 1:
+            print("Info: downloading '{}' "
+                  '({} bytes) from {}'.format(filename, file_size, site.host),
+                  end='')
+            if args.output:
+                print(" to '{}'".format(dest))
+            else:
+                print('\n', end='')
+            print('Info: {}'.format(file_url))
+
+        if os.path.isfile(dest) and not args.force:
+            print("File '{}' already exists, skipping download "
+                  '(use -f to ignore)'.format(dest))
+        else:
+            try:
+                fd = open(dest, 'wb')
+            except IOError as e:
+                print('File could not be written. '
+                      'The following error was encountered:')
+                print(e)
+                sys.exit(1)
+            else:
+                # download the file
+                with tqdm(total=file_size, unit='B',
+                          unit_scale=True, unit_divisor=1024) as progress_bar:
+                    with fd:
+                        res = site.connection.get(file_url, stream=True)
+                        progress_bar.set_postfix(file=dest, refresh=False)
+                        for chunk in res.iter_content(1024):
+                            fd.write(chunk)
+                            progress_bar.update(len(chunk))
+
+        # verify file integrity and optionally print details
+        dl_sha1 = verify_hash(dest)
+
+        if args.verbose >= 1:
+            print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
+            print('Info: server file SHA1 is {}'.format(file_sha1))
+        if dl_sha1 == file_sha1:
+            if args.verbose >= 1:
+                print('Info: hashes match!')
+            # at this point, we've successfully downloaded the file
+        else:
+            print('Error: hash mismatch! Downloaded file may be corrupt.')
+            sys.exit(1)
+
+    else:
+        # no file information returned
+        print("Target '{}' does not appear to be a valid file."
+              .format(filename))
+        sys.exit(1)
diff --git a/wikiget/validations.py b/wikiget/validations.py
new file mode 100644
index 0000000..20ef74f
--- /dev/null
+++ b/wikiget/validations.py
@@ -0,0 +1,64 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import hashlib
+import re
+
+from . import BLOCKSIZE
+
+
+def valid_file(search_string):
+    """
+    Determines if the given string contains a valid file name, defined as a
+    string ending with a '.' and at least one character, beginning with 'File:'
+    or 'Image:', the standard file prefixes in MediaWiki.
+    :param search_string: string to validate
+    :returns: a regex Match object if there's a match or None otherwise
+    """
+    # second group could also restrict to file extensions with three or more
+    # letters with ([^/\r\n\t\f\v]+\.\w{3,})
+    file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
+    return file_regex.search(search_string)
+
+
+def valid_site(search_string):
+    """
+    Determines if the given string contains a valid site name, defined as a
+    string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
+    subdomains of those domains. Eventually, it should be possible to support
+    any MediaWiki site, regardless of domain name.
+    :param search_string: string to validate
+    :returns: a regex Match object if there's a match or None otherwise
+    """
+    site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)
+    return site_regex.search(search_string)
+
+
+def verify_hash(filename):
+    """
+    Calculates the SHA1 hash of the given file for comparison with a known
+    value.
+    :param filename: name of the file to calculate a hash for
+    :return: hash digest
+    """
+    hasher = hashlib.sha1()
+    with open(filename, 'rb') as dl:
+        buf = dl.read(BLOCKSIZE)
+        while len(buf) > 0:
+            hasher.update(buf)
+            buf = dl.read(BLOCKSIZE)
+    return hasher.hexdigest()
diff --git a/wikiget/version.py b/wikiget/version.py
index eff7413..46b9520 100644
--- a/wikiget/version.py
+++ b/wikiget/version.py
@@ -1,3 +1 @@
-"""Sets the program version in setup.py and on the command line."""
-
-__version__ = '0.4.2.dev1'
+__version__ = '0.4.2.dev2'
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 566448f..ac85012 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -15,26 +15,13 @@
 # You should have received a copy of the GNU General Public License
 # along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
 
-"""Main wikiget functions."""
-
 import argparse
-import hashlib
 import logging
-import os
-import re
 import sys
-from urllib.parse import unquote, urlparse
-
-from mwclient import InvalidResponse, Site, __version__ as mwclient_version
-from requests import ConnectionError
-from tqdm import tqdm
-
-from wikiget.version import __version__
 
-BLOCKSIZE = 65536
-DEFAULT_SITE = 'commons.wikimedia.org'
-USER_AGENT = 'wikiget/{} (https://github.com/clpo13/wikiget) ' \
-             'mwclient/{}'.format(__version__, mwclient_version)
+from . import DEFAULT_SITE
+from .dl import download
+from .version import __version__
 
 
 def main():
@@ -114,168 +101,3 @@ def main():
         # single download mode
         dl = args.FILE
         download(dl, args)
-
-
-def download(dl, args):
-    url = urlparse(dl)
-
-    if url.netloc:
-        filename = url.path
-        site_name = url.netloc
-        if args.site is not DEFAULT_SITE and not args.quiet:
-            # this will work even if the user specifies 'commons.wikimedia.org'
-            print('Warning: target is a URL, '
-                  'ignoring site specified with --site')
-    else:
-        filename = dl
-        site_name = args.site
-
-    file_match = valid_file(filename)
-    site_match = valid_site(site_name)
-
-    # check for valid site parameter
-    if not site_match:
-        print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
-              'are currently supported.')
-        sys.exit(1)
-
-    # check if this is a valid file
-    if file_match and file_match.group(1):
-        # has File:/Image: prefix and extension
-        filename = file_match.group(2)
-    else:
-        # no file extension and/or prefix, probably an article
-        print('Downloading Wikipedia articles is not currently supported.',
-              end='')
-        if file_match and not file_match.group(1):
-            # file extension detected, but no prefix
-            # TODO: no longer possible to get to this point since
-            # file_match is None with no prefix
-            print(" If this is a file, please add the 'File:' prefix.")
-        else:
-            print('\n', end='')
-        sys.exit(1)
-
-    filename = unquote(filename)  # remove URL encoding for special characters
-
-    dest = args.output or filename
-
-    if args.verbose >= 2:
-        print('User agent: {}'.format(USER_AGENT))
-
-    # connect to site and identify ourselves
-    try:
-        site = Site(site_name, clients_useragent=USER_AGENT)
-    except ConnectionError:
-        # usually this means there is no such site,
-        # or there's no network connection
-        print("Error: couldn't connect to specified site.")
-        sys.exit(1)
-    except InvalidResponse as e:
-        # site exists, but we couldn't communicate with the API endpoint
-        print(e)
-        sys.exit(1)
-
-    # get info about the target file
-    file = site.images[filename]
-
-    if file.imageinfo != {}:
-        # file exists either locally or at Wikimedia Commons
-        file_url = file.imageinfo['url']
-        file_size = file.imageinfo['size']
-        file_sha1 = file.imageinfo['sha1']
-
-        if args.verbose >= 1:
-            print("Info: downloading '{}' "
-                  '({} bytes) from {}'.format(filename, file_size, site.host),
-                  end='')
-            if args.output:
-                print(" to '{}'".format(dest))
-            else:
-                print('\n', end='')
-            print('Info: {}'.format(file_url))
-
-        if os.path.isfile(dest) and not args.force:
-            print("File '{}' already exists, skipping download "
-                  '(use -f to ignore)'.format(dest))
-        else:
-            try:
-                fd = open(dest, 'wb')
-            except IOError as e:
-                print('File could not be written. '
-                      'The following error was encountered:')
-                print(e)
-                sys.exit(1)
-            else:
-                # download the file
-                with tqdm(total=file_size, unit='B',
-                          unit_scale=True, unit_divisor=1024) as progress_bar:
-                    with fd:
-                        res = site.connection.get(file_url, stream=True)
-                        progress_bar.set_postfix(file=dest, refresh=False)
-                        for chunk in res.iter_content(1024):
-                            fd.write(chunk)
-                            progress_bar.update(len(chunk))
-
-        # verify file integrity and optionally print details
-        dl_sha1 = verify_hash(dest)
-
-        if args.verbose >= 1:
-            print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
-            print('Info: server file SHA1 is {}'.format(file_sha1))
-        if dl_sha1 == file_sha1:
-            if args.verbose >= 1:
-                print('Info: hashes match!')
-            # at this point, we've successfully downloaded the file
-        else:
-            print('Error: hash mismatch! Downloaded file may be corrupt.')
-            sys.exit(1)
-
-    else:
-        # no file information returned
-        print("Target '{}' does not appear to be a valid file."
-              .format(filename))
-        sys.exit(1)
-
-
-def valid_file(search_string):
-    """
-    Determines if the given string contains a valid file name, defined as a
-    string ending with a '.' and at least one character, beginning with 'File:'
-    or 'Image:', the standard file prefixes in MediaWiki.
-    :param search_string: string to validate
-    :returns: a regex Match object if there's a match or None otherwise
-    """
-    # second group could also restrict to file extensions with three or more
-    # letters with ([^/\r\n\t\f\v]+\.\w{3,})
-    file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
-    return file_regex.search(search_string)
-
-
-def valid_site(search_string):
-    """
-    Determines if the given string contains a valid site name, defined as a
-    string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
-    subdomains of those domains. Eventually, it should be possible to support
-    any MediaWiki site, regardless of domain name.
-    :param search_string: string to validate
-    :returns: a regex Match object if there's a match or None otherwise
-    """
-    site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)
-    return site_regex.search(search_string)
-
-
-def verify_hash(filename):
-    """
-    Calculates the SHA1 hash of the given file for comparison with a known
-    value.
-    :param filename: name of the file to calculate a hash for
-    :return: hash digest
-    """
-    hasher = hashlib.sha1()
-    with open(filename, 'rb') as dl:
-        buf = dl.read(BLOCKSIZE)
-        while len(buf) > 0:
-            hasher.update(buf)
-            buf = dl.read(BLOCKSIZE)
-    return hasher.hexdigest()
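
As a quick sanity check of the new validation helpers, the snippet below exercises the same regexes that validations.py introduces above. The patterns are copied verbatim from the diff; once the package is installed, importing valid_file and valid_site from wikiget.validations behaves identically. Expected results are noted in the comments.

    import re

    # patterns copied from wikiget/validations.py above
    file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
    site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)

    m = file_regex.search('File:Example.jpg')
    print(m.group(1), m.group(2))                       # File: Example.jpg
    print(file_regex.search('Example.jpg'))             # None: no prefix, per the TODO note
    print(bool(site_regex.search('en.wikipedia.org')))  # True
    print(bool(site_regex.search('example.com')))       # False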

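For context on how the refactored pieces fit together, here is a minimal, hypothetical driver sketched in the style of main(): it fills in the argparse.Namespace fields that download() reads (site, output, quiet, verbose, force) and fetches Commons' standard test image. This is an illustration of the new module boundary, not code from the commit.

    from argparse import Namespace

    from wikiget.dl import download

    # stand-in for the parsed command-line arguments main() would supply
    args = Namespace(site='commons.wikimedia.org', output=None,
                     quiet=False, verbose=1, force=False)

    # roughly equivalent to running: wikiget -v File:Example.jpg
    download('File:Example.jpg', args)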