Diffstat (limited to 'wikiget')
-rw-r--r--  wikiget/__init__.py     |  20
-rw-r--r--  wikiget/dl.py           | 153
-rw-r--r--  wikiget/validations.py  |  64
-rw-r--r--  wikiget/version.py      |   4
-rw-r--r--  wikiget/wikiget.py      | 184
5 files changed, 241 insertions(+), 184 deletions(-)
diff --git a/wikiget/__init__.py b/wikiget/__init__.py
index e69de29..253fcf8 100644
--- a/wikiget/__init__.py
+++ b/wikiget/__init__.py
@@ -0,0 +1,20 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+# set some global constants
+BLOCKSIZE = 65536
+DEFAULT_SITE = 'commons.wikimedia.org'
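
Aside: the two constants above become the package-wide configuration shared by
the other modules. A minimal sketch of consuming them, assuming the wikiget
package is importable after this commit:

    # hypothetical snippet, not part of the commit
    from wikiget import BLOCKSIZE, DEFAULT_SITE

    print(BLOCKSIZE)     # 65536: chunk size used when hashing downloaded files
    print(DEFAULT_SITE)  # 'commons.wikimedia.org': fallback when no site is given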
diff --git a/wikiget/dl.py b/wikiget/dl.py
new file mode 100644
index 0000000..b074269
--- /dev/null
+++ b/wikiget/dl.py
@@ -0,0 +1,153 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import sys
+from urllib.parse import unquote, urlparse
+
+from mwclient import InvalidResponse, Site, __version__ as mwclient_version
+from requests import ConnectionError
+from tqdm import tqdm
+
+from . import DEFAULT_SITE
+from .validations import valid_file, valid_site, verify_hash
+from .version import __version__
+
+USER_AGENT = 'wikiget/{} (https://github.com/clpo13/wikiget) ' \
+ 'mwclient/{}'.format(__version__, mwclient_version)
+
+
+def download(dl, args):
+ url = urlparse(dl)
+
+ if url.netloc:
+ filename = url.path
+ site_name = url.netloc
+ if args.site is not DEFAULT_SITE and not args.quiet:
+ # this will work even if the user specifies 'commons.wikimedia.org'
+ print('Warning: target is a URL, '
+ 'ignoring site specified with --site')
+ else:
+ filename = dl
+ site_name = args.site
+
+ file_match = valid_file(filename)
+ site_match = valid_site(site_name)
+
+ # check for valid site parameter
+ if not site_match:
+ print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
+ 'are currently supported.')
+ sys.exit(1)
+
+ # check if this is a valid file
+ if file_match and file_match.group(1):
+ # has File:/Image: prefix and extension
+ filename = file_match.group(2)
+ else:
+ # no file extension and/or prefix, probably an article
+ print('Downloading Wikipedia articles is not currently supported.',
+ end='')
+ if file_match and not file_match.group(1):
+ # file extension detected, but no prefix
+ # TODO: no longer possible to get to this point since
+ # file_match is None with no prefix
+ print(" If this is a file, please add the 'File:' prefix.")
+ else:
+ print('\n', end='')
+ sys.exit(1)
+
+ filename = unquote(filename) # remove URL encoding for special characters
+
+ dest = args.output or filename
+
+ if args.verbose >= 2:
+ print('User agent: {}'.format(USER_AGENT))
+
+ # connect to site and identify ourselves
+ try:
+ site = Site(site_name, clients_useragent=USER_AGENT)
+ except ConnectionError:
+ # usually this means there is no such site,
+ # or there's no network connection
+ print("Error: couldn't connect to specified site.")
+ sys.exit(1)
+ except InvalidResponse as e:
+ # site exists, but we couldn't communicate with the API endpoint
+ print(e)
+ sys.exit(1)
+
+ # get info about the target file
+ file = site.images[filename]
+
+ if file.imageinfo != {}:
+ # file exists either locally or at Wikimedia Commons
+ file_url = file.imageinfo['url']
+ file_size = file.imageinfo['size']
+ file_sha1 = file.imageinfo['sha1']
+
+ if args.verbose >= 1:
+ print("Info: downloading '{}' "
+ '({} bytes) from {}'.format(filename, file_size, site.host),
+ end='')
+ if args.output:
+ print(" to '{}'".format(dest))
+ else:
+ print('\n', end='')
+ print('Info: {}'.format(file_url))
+
+ if os.path.isfile(dest) and not args.force:
+ print("File '{}' already exists, skipping download "
+ '(use -f to ignore)'.format(dest))
+ else:
+ try:
+ fd = open(dest, 'wb')
+ except IOError as e:
+ print('File could not be written. '
+ 'The following error was encountered:')
+ print(e)
+ sys.exit(1)
+ else:
+ # download the file
+ with tqdm(total=file_size, unit='B',
+ unit_scale=True, unit_divisor=1024) as progress_bar:
+ with fd:
+ res = site.connection.get(file_url, stream=True)
+ progress_bar.set_postfix(file=dest, refresh=False)
+ for chunk in res.iter_content(1024):
+ fd.write(chunk)
+ progress_bar.update(len(chunk))
+
+ # verify file integrity and optionally print details
+ dl_sha1 = verify_hash(dest)
+
+ if args.verbose >= 1:
+ print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
+ print('Info: server file SHA1 is {}'.format(file_sha1))
+ if dl_sha1 == file_sha1:
+ if args.verbose >= 1:
+ print('Info: hashes match!')
+ # at this point, we've successfully downloaded the file
+ else:
+ print('Error: hash mismatch! Downloaded file may be corrupt.')
+ sys.exit(1)
+
+ else:
+ # no file information returned
+ print("Target '{}' does not appear to be a valid file."
+ .format(filename))
+ sys.exit(1)
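
Aside: dl.download() takes the target and the parsed arguments directly, so it
can now be driven without the CLI. A minimal sketch, assuming an argparse-style
namespace carrying the attributes the function reads above (site, output, quiet,
verbose, force); 'File:Example.jpg' is only a placeholder target:

    # hypothetical snippet, not part of the commit
    from argparse import Namespace

    from wikiget.dl import download

    # mirror the defaults the CLI would produce
    args = Namespace(site='commons.wikimedia.org', output=None,
                     quiet=False, verbose=1, force=False)
    download('File:Example.jpg', args)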
diff --git a/wikiget/validations.py b/wikiget/validations.py
new file mode 100644
index 0000000..20ef74f
--- /dev/null
+++ b/wikiget/validations.py
@@ -0,0 +1,64 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import hashlib
+import re
+
+from . import BLOCKSIZE
+
+
+def valid_file(search_string):
+ """
+    Determines if the given string contains a valid file name: a string
+    beginning with 'File:' or 'Image:', the standard file prefixes in
+    MediaWiki, and ending with a '.' followed by at least one character.
+ :param search_string: string to validate
+ :returns: a regex Match object if there's a match or None otherwise
+ """
+ # second group could also restrict to file extensions with three or more
+ # letters with ([^/\r\n\t\f\v]+\.\w{3,})
+ file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
+ return file_regex.search(search_string)
+
+
+def valid_site(search_string):
+ """
+ Determines if the given string contains a valid site name, defined as a
+ string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
+ subdomains of those domains. Eventually, it should be possible to support
+ any MediaWiki site, regardless of domain name.
+ :param search_string: string to validate
+ :returns: a regex Match object if there's a match or None otherwise
+ """
+ site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)
+ return site_regex.search(search_string)
+
+
+def verify_hash(filename):
+ """
+ Calculates the SHA1 hash of the given file for comparison with a known
+ value.
+ :param filename: name of the file to calculate a hash for
+ :return: hash digest
+ """
+ hasher = hashlib.sha1()
+ with open(filename, 'rb') as dl:
+ buf = dl.read(BLOCKSIZE)
+ while len(buf) > 0:
+ hasher.update(buf)
+ buf = dl.read(BLOCKSIZE)
+ return hasher.hexdigest()
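
Aside: the extracted helpers are now easy to exercise in isolation. A quick
sketch of their behavior as defined above (the last call assumes some local
file exists at the given path):

    # hypothetical snippet, not part of the commit
    from wikiget.validations import valid_file, valid_site, verify_hash

    m = valid_file('File:Example.jpg')
    print(m.group(1), m.group(2))      # File: Example.jpg
    print(valid_file('Example.jpg'))   # None: extension but no File:/Image: prefix
    print(bool(valid_site('en.wikipedia.org')))  # True
    print(bool(valid_site('example.com')))       # False
    print(verify_hash('local-file.jpg'))         # SHA1 hex digest, computed in
                                                 # BLOCKSIZE chunks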
diff --git a/wikiget/version.py b/wikiget/version.py
index eff7413..46b9520 100644
--- a/wikiget/version.py
+++ b/wikiget/version.py
@@ -1,3 +1 @@
-"""Sets the program version in setup.py and on the command line."""
-
-__version__ = '0.4.2.dev1'
+__version__ = '0.4.2.dev2'
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 566448f..ac85012 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -15,26 +15,13 @@
# You should have received a copy of the GNU General Public License
# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
-"""Main wikiget functions."""
-
import argparse
-import hashlib
import logging
-import os
-import re
import sys
-from urllib.parse import unquote, urlparse
-
-from mwclient import InvalidResponse, Site, __version__ as mwclient_version
-from requests import ConnectionError
-from tqdm import tqdm
-
-from wikiget.version import __version__
-BLOCKSIZE = 65536
-DEFAULT_SITE = 'commons.wikimedia.org'
-USER_AGENT = 'wikiget/{} (https://github.com/clpo13/wikiget) ' \
- 'mwclient/{}'.format(__version__, mwclient_version)
+from . import DEFAULT_SITE
+from .dl import download
+from .version import __version__
def main():
@@ -114,168 +101,3 @@ def main():
# single download mode
dl = args.FILE
download(dl, args)
-
-
-def download(dl, args):
- url = urlparse(dl)
-
- if url.netloc:
- filename = url.path
- site_name = url.netloc
- if args.site is not DEFAULT_SITE and not args.quiet:
- # this will work even if the user specifies 'commons.wikimedia.org'
- print('Warning: target is a URL, '
- 'ignoring site specified with --site')
- else:
- filename = dl
- site_name = args.site
-
- file_match = valid_file(filename)
- site_match = valid_site(site_name)
-
- # check for valid site parameter
- if not site_match:
- print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
- 'are currently supported.')
- sys.exit(1)
-
- # check if this is a valid file
- if file_match and file_match.group(1):
- # has File:/Image: prefix and extension
- filename = file_match.group(2)
- else:
- # no file extension and/or prefix, probably an article
- print('Downloading Wikipedia articles is not currently supported.',
- end='')
- if file_match and not file_match.group(1):
- # file extension detected, but no prefix
- # TODO: no longer possible to get to this point since
- # file_match is None with no prefix
- print(" If this is a file, please add the 'File:' prefix.")
- else:
- print('\n', end='')
- sys.exit(1)
-
- filename = unquote(filename) # remove URL encoding for special characters
-
- dest = args.output or filename
-
- if args.verbose >= 2:
- print('User agent: {}'.format(USER_AGENT))
-
- # connect to site and identify ourselves
- try:
- site = Site(site_name, clients_useragent=USER_AGENT)
- except ConnectionError:
- # usually this means there is no such site,
- # or there's no network connection
- print("Error: couldn't connect to specified site.")
- sys.exit(1)
- except InvalidResponse as e:
- # site exists, but we couldn't communicate with the API endpoint
- print(e)
- sys.exit(1)
-
- # get info about the target file
- file = site.images[filename]
-
- if file.imageinfo != {}:
- # file exists either locally or at Wikimedia Commons
- file_url = file.imageinfo['url']
- file_size = file.imageinfo['size']
- file_sha1 = file.imageinfo['sha1']
-
- if args.verbose >= 1:
- print("Info: downloading '{}' "
- '({} bytes) from {}'.format(filename, file_size, site.host),
- end='')
- if args.output:
- print(" to '{}'".format(dest))
- else:
- print('\n', end='')
- print('Info: {}'.format(file_url))
-
- if os.path.isfile(dest) and not args.force:
- print("File '{}' already exists, skipping download "
- '(use -f to ignore)'.format(dest))
- else:
- try:
- fd = open(dest, 'wb')
- except IOError as e:
- print('File could not be written. '
- 'The following error was encountered:')
- print(e)
- sys.exit(1)
- else:
- # download the file
- with tqdm(total=file_size, unit='B',
- unit_scale=True, unit_divisor=1024) as progress_bar:
- with fd:
- res = site.connection.get(file_url, stream=True)
- progress_bar.set_postfix(file=dest, refresh=False)
- for chunk in res.iter_content(1024):
- fd.write(chunk)
- progress_bar.update(len(chunk))
-
- # verify file integrity and optionally print details
- dl_sha1 = verify_hash(dest)
-
- if args.verbose >= 1:
- print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
- print('Info: server file SHA1 is {}'.format(file_sha1))
- if dl_sha1 == file_sha1:
- if args.verbose >= 1:
- print('Info: hashes match!')
- # at this point, we've successfully downloaded the file
- else:
- print('Error: hash mismatch! Downloaded file may be corrupt.')
- sys.exit(1)
-
- else:
- # no file information returned
- print("Target '{}' does not appear to be a valid file."
- .format(filename))
- sys.exit(1)
-
-
-def valid_file(search_string):
- """
- Determines if the given string contains a valid file name, defined as a
- string ending with a '.' and at least one character, beginning with 'File:'
- or 'Image:', the standard file prefixes in MediaWiki.
- :param search_string: string to validate
- :returns: a regex Match object if there's a match or None otherwise
- """
- # second group could also restrict to file extensions with three or more
- # letters with ([^/\r\n\t\f\v]+\.\w{3,})
- file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
- return file_regex.search(search_string)
-
-
-def valid_site(search_string):
- """
- Determines if the given string contains a valid site name, defined as a
- string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
- subdomains of those domains. Eventually, it should be possible to support
- any MediaWiki site, regardless of domain name.
- :param search_string: string to validate
- :returns: a regex Match object if there's a match or None otherwise
- """
- site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)
- return site_regex.search(search_string)
-
-
-def verify_hash(filename):
- """
- Calculates the SHA1 hash of the given file for comparison with a known
- value.
- :param filename: name of the file to calculate a hash for
- :return: hash digest
- """
- hasher = hashlib.sha1()
- with open(filename, 'rb') as dl:
- buf = dl.read(BLOCKSIZE)
- while len(buf) > 0:
- hasher.update(buf)
- buf = dl.read(BLOCKSIZE)
- return hasher.hexdigest()
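
Aside: after this refactor, wikiget.py keeps only argument parsing and hands each
target to dl.download(). End-user invocation is unchanged: something like
'wikiget File:Example.jpg' or 'wikiget --site en.wikipedia.org File:Example.jpg',
with -f forcing an overwrite of an existing destination. The --site and -f
spellings come from the messages in dl.py itself; the full option list lives in
main()'s argparse setup, which this hunk does not show.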