 -rw-r--r--  setup.py                                                        6
 -rw-r--r--  test/test_validations.py (renamed from test/test_wikiget.py)   14
 -rw-r--r--  wikiget/__init__.py                                            20
 -rw-r--r--  wikiget/dl.py                                                 153
 -rw-r--r--  wikiget/validations.py                                         64
 -rw-r--r--  wikiget/version.py                                              4
 -rw-r--r--  wikiget/wikiget.py                                            184
 7 files changed, 251 insertions(+), 194 deletions(-)
diff --git a/setup.py b/setup.py
index ef1c0bd..81cb54f 100644
--- a/setup.py
+++ b/setup.py
@@ -26,13 +26,13 @@ here = path.abspath(path.dirname(__file__))
with open(path.join(here, 'README.md'), 'r') as fr:
long_description = fr.read()
-version = {}
+version_file = {}
with open(path.join(here, 'wikiget', 'version.py'), 'r') as fv:
- exec(fv.read(), version)
+ exec(fv.read(), version_file)
setup(
name='wikiget',
- version=version['__version__'],
+ version=version_file['__version__'],
author='Cody Logan',
author_email='clpo13@gmail.com',
description='CLI tool for downloading files from MediaWiki sites',
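
In isolation, the version-reading pattern setup.py uses looks like the sketch
below. Executing version.py into a plain dict gives setup.py the version
string without importing the wikiget package (and its dependencies) at build
time; the rename from 'version' to 'version_file' makes the intent clearer.

    # Sketch of the exec-based version read used in setup.py.
    version_file = {}
    with open('wikiget/version.py') as fv:
        exec(fv.read(), version_file)

    # After this commit, version_file['__version__'] is '0.4.2.dev2'.
    print(version_file['__version__'])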
diff --git a/test/test_wikiget.py b/test/test_validations.py
index cdc0706..a0e2628 100644
--- a/test/test_wikiget.py
+++ b/test/test_validations.py
@@ -22,7 +22,7 @@ import os
import pytest
-from wikiget import wikiget
+from wikiget.validations import valid_file, valid_site, verify_hash
def test_invalid_site_input():
@@ -32,7 +32,7 @@ def test_invalid_site_input():
invalid_input = ['example.com', 'vim.wikia.com',
'en.wikipedia.com', 'en.wikimpedia.org']
for i in invalid_input:
- site_match = wikiget.valid_site(i)
+ site_match = valid_site(i)
assert site_match is None
@@ -43,7 +43,7 @@ def test_valid_site_input():
valid_input = ['en.wikipedia.org', 'commons.wikimedia.org',
'de.wikipedia.org', 'meta.wikimedia.org']
for i in valid_input:
- site_match = wikiget.valid_site(i)
+ site_match = valid_site(i)
assert site_match is not None
@@ -53,7 +53,7 @@ def test_file_regex():
to the file prefix and name.
"""
i = 'File:Example.jpg'
- file_match = wikiget.valid_file(i)
+ file_match = valid_file(i)
assert file_match is not None
assert file_match.group(0) == 'File:Example.jpg' # entire match
assert file_match.group(1) == 'File:' # first group
@@ -67,7 +67,7 @@ def test_invalid_file_input():
invalid_input = ['file:example', 'example.jpg', 'Foo Bar.gif',
'Fil:Example.jpg']
for i in invalid_input:
- file_match = wikiget.valid_file(i)
+ file_match = valid_file(i)
assert file_match is None
@@ -79,7 +79,7 @@ def test_valid_file_input():
'File:example.file-01.jpg', 'FILE:FOO.BMP',
'File:ß handwritten sample.gif', 'File:A (1).jpeg']
for i in valid_input:
- file_match = wikiget.valid_file(i)
+ file_match = valid_file(i)
assert file_match is not None
@@ -100,6 +100,6 @@ def test_verify_hash():
with dl:
dl.write(file_contents)
- assert wikiget.verify_hash(file_name) == file_sha1
+ assert verify_hash(file_name) == file_sha1
os.remove(file_name)
diff --git a/wikiget/__init__.py b/wikiget/__init__.py
index e69de29..253fcf8 100644
--- a/wikiget/__init__.py
+++ b/wikiget/__init__.py
@@ -0,0 +1,20 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+# set some global constants
+BLOCKSIZE = 65536
+DEFAULT_SITE = 'commons.wikimedia.org'
diff --git a/wikiget/dl.py b/wikiget/dl.py
new file mode 100644
index 0000000..b074269
--- /dev/null
+++ b/wikiget/dl.py
@@ -0,0 +1,153 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import sys
+from urllib.parse import unquote, urlparse
+
+from mwclient import InvalidResponse, Site, __version__ as mwclient_version
+from requests import ConnectionError
+from tqdm import tqdm
+
+from . import DEFAULT_SITE
+from .validations import valid_file, valid_site, verify_hash
+from .version import __version__
+
+USER_AGENT = 'wikiget/{} (https://github.com/clpo13/wikiget) ' \
+ 'mwclient/{}'.format(__version__, mwclient_version)
+
+
+def download(dl, args):
+ url = urlparse(dl)
+
+ if url.netloc:
+ filename = url.path
+ site_name = url.netloc
+ if args.site is not DEFAULT_SITE and not args.quiet:
+ # this will work even if the user specifies 'commons.wikimedia.org'
+ print('Warning: target is a URL, '
+ 'ignoring site specified with --site')
+ else:
+ filename = dl
+ site_name = args.site
+
+ file_match = valid_file(filename)
+ site_match = valid_site(site_name)
+
+ # check for valid site parameter
+ if not site_match:
+ print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
+ 'are currently supported.')
+ sys.exit(1)
+
+ # check if this is a valid file
+ if file_match and file_match.group(1):
+ # has File:/Image: prefix and extension
+ filename = file_match.group(2)
+ else:
+ # no file extension and/or prefix, probably an article
+ print('Downloading Wikipedia articles is not currently supported.',
+ end='')
+ if file_match and not file_match.group(1):
+ # file extension detected, but no prefix
+ # TODO: no longer possible to get to this point since
+ # file_match is None with no prefix
+ print(" If this is a file, please add the 'File:' prefix.")
+ else:
+ print('\n', end='')
+ sys.exit(1)
+
+ filename = unquote(filename) # remove URL encoding for special characters
+
+ dest = args.output or filename
+
+ if args.verbose >= 2:
+ print('User agent: {}'.format(USER_AGENT))
+
+ # connect to site and identify ourselves
+ try:
+ site = Site(site_name, clients_useragent=USER_AGENT)
+ except ConnectionError:
+ # usually this means there is no such site,
+ # or there's no network connection
+ print("Error: couldn't connect to specified site.")
+ sys.exit(1)
+ except InvalidResponse as e:
+ # site exists, but we couldn't communicate with the API endpoint
+ print(e)
+ sys.exit(1)
+
+ # get info about the target file
+ file = site.images[filename]
+
+ if file.imageinfo != {}:
+ # file exists either locally or at Wikimedia Commons
+ file_url = file.imageinfo['url']
+ file_size = file.imageinfo['size']
+ file_sha1 = file.imageinfo['sha1']
+
+ if args.verbose >= 1:
+ print("Info: downloading '{}' "
+ '({} bytes) from {}'.format(filename, file_size, site.host),
+ end='')
+ if args.output:
+ print(" to '{}'".format(dest))
+ else:
+ print('\n', end='')
+ print('Info: {}'.format(file_url))
+
+ if os.path.isfile(dest) and not args.force:
+ print("File '{}' already exists, skipping download "
+ '(use -f to ignore)'.format(dest))
+ else:
+ try:
+ fd = open(dest, 'wb')
+ except IOError as e:
+ print('File could not be written. '
+ 'The following error was encountered:')
+ print(e)
+ sys.exit(1)
+ else:
+ # download the file
+ with tqdm(total=file_size, unit='B',
+ unit_scale=True, unit_divisor=1024) as progress_bar:
+ with fd:
+ res = site.connection.get(file_url, stream=True)
+ progress_bar.set_postfix(file=dest, refresh=False)
+ for chunk in res.iter_content(1024):
+ fd.write(chunk)
+ progress_bar.update(len(chunk))
+
+ # verify file integrity and optionally print details
+ dl_sha1 = verify_hash(dest)
+
+ if args.verbose >= 1:
+ print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
+ print('Info: server file SHA1 is {}'.format(file_sha1))
+ if dl_sha1 == file_sha1:
+ if args.verbose >= 1:
+ print('Info: hashes match!')
+ # at this point, we've successfully downloaded the file
+ else:
+ print('Error: hash mismatch! Downloaded file may be corrupt.')
+ sys.exit(1)
+
+ else:
+ # no file information returned
+ print("Target '{}' does not appear to be a valid file."
+ .format(filename))
+ sys.exit(1)
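
For orientation, a minimal sketch of driving the new download() function
directly, outside the CLI. The argparse.Namespace mirrors the attributes
main() normally supplies (site, output, quiet, verbose, force); the target
file here is a hypothetical example, and either a bare 'File:...' name or a
full URL is accepted.

    # Sketch: calling wikiget.dl.download() programmatically.
    from argparse import Namespace

    from wikiget import DEFAULT_SITE
    from wikiget.dl import download

    args = Namespace(
        site=DEFAULT_SITE,  # overridden by the host when the target is a URL
        output=None,        # None falls back to the remote file name
        quiet=False,
        verbose=1,          # >=1 prints file info, >=2 also prints user agent
        force=False,        # set True to overwrite an existing local file
    )

    download('File:Example.jpg', args)

Note that download() reports failures by printing a message and calling
sys.exit(1), so a caller embedding it should expect SystemExit rather than an
exception.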
diff --git a/wikiget/validations.py b/wikiget/validations.py
new file mode 100644
index 0000000..20ef74f
--- /dev/null
+++ b/wikiget/validations.py
@@ -0,0 +1,64 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import hashlib
+import re
+
+from . import BLOCKSIZE
+
+
+def valid_file(search_string):
+ """
+ Determines if the given string contains a valid file name, defined as a
+ string ending with a '.' and at least one character, beginning with 'File:'
+ or 'Image:', the standard file prefixes in MediaWiki.
+ :param search_string: string to validate
+ :returns: a regex Match object if there's a match or None otherwise
+ """
+ # second group could also restrict to file extensions with three or more
+ # letters with ([^/\r\n\t\f\v]+\.\w{3,})
+ file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
+ return file_regex.search(search_string)
+
+
+def valid_site(search_string):
+ """
+ Determines if the given string contains a valid site name, defined as a
+ string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
+ subdomains of those domains. Eventually, it should be possible to support
+ any MediaWiki site, regardless of domain name.
+ :param search_string: string to validate
+ :returns: a regex Match object if there's a match or None otherwise
+ """
+ site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)
+ return site_regex.search(search_string)
+
+
+def verify_hash(filename):
+ """
+ Calculates the SHA1 hash of the given file for comparison with a known
+ value.
+ :param filename: name of the file to calculate a hash for
+ :return: hash digest
+ """
+ hasher = hashlib.sha1()
+ with open(filename, 'rb') as dl:
+ buf = dl.read(BLOCKSIZE)
+ while len(buf) > 0:
+ hasher.update(buf)
+ buf = dl.read(BLOCKSIZE)
+ return hasher.hexdigest()
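
A quick sketch of the validators in use; the inputs mirror the test cases in
test/test_validations.py above.

    from wikiget.validations import valid_file, valid_site, verify_hash

    m = valid_file('File:Example.jpg')
    print(m.group(1), m.group(2))      # 'File:' 'Example.jpg'
    print(valid_file('example.jpg'))   # None: extension but no File:/Image: prefix
    print(valid_site('en.wikipedia.org') is not None)  # True
    print(valid_site('vim.wikia.com') is not None)     # False: not wiki[mp]edia.org

    # verify_hash() reads a local file in BLOCKSIZE chunks and returns its
    # SHA1 hex digest, ready to compare against the server-reported value.
    print(verify_hash('setup.py'))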
diff --git a/wikiget/version.py b/wikiget/version.py
index eff7413..46b9520 100644
--- a/wikiget/version.py
+++ b/wikiget/version.py
@@ -1,3 +1 @@
-"""Sets the program version in setup.py and on the command line."""
-
-__version__ = '0.4.2.dev1'
+__version__ = '0.4.2.dev2'
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
index 566448f..ac85012 100644
--- a/wikiget/wikiget.py
+++ b/wikiget/wikiget.py
@@ -15,26 +15,13 @@
# You should have received a copy of the GNU General Public License
# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
-"""Main wikiget functions."""
-
import argparse
-import hashlib
import logging
-import os
-import re
import sys
-from urllib.parse import unquote, urlparse
-
-from mwclient import InvalidResponse, Site, __version__ as mwclient_version
-from requests import ConnectionError
-from tqdm import tqdm
-
-from wikiget.version import __version__
-BLOCKSIZE = 65536
-DEFAULT_SITE = 'commons.wikimedia.org'
-USER_AGENT = 'wikiget/{} (https://github.com/clpo13/wikiget) ' \
- 'mwclient/{}'.format(__version__, mwclient_version)
+from . import DEFAULT_SITE
+from .dl import download
+from .version import __version__
def main():
@@ -114,168 +101,3 @@ def main():
# single download mode
dl = args.FILE
download(dl, args)
-
-
-def download(dl, args):
- url = urlparse(dl)
-
- if url.netloc:
- filename = url.path
- site_name = url.netloc
- if args.site is not DEFAULT_SITE and not args.quiet:
- # this will work even if the user specifies 'commons.wikimedia.org'
- print('Warning: target is a URL, '
- 'ignoring site specified with --site')
- else:
- filename = dl
- site_name = args.site
-
- file_match = valid_file(filename)
- site_match = valid_site(site_name)
-
- # check for valid site parameter
- if not site_match:
- print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
- 'are currently supported.')
- sys.exit(1)
-
- # check if this is a valid file
- if file_match and file_match.group(1):
- # has File:/Image: prefix and extension
- filename = file_match.group(2)
- else:
- # no file extension and/or prefix, probably an article
- print('Downloading Wikipedia articles is not currently supported.',
- end='')
- if file_match and not file_match.group(1):
- # file extension detected, but no prefix
- # TODO: no longer possible to get to this point since
- # file_match is None with no prefix
- print(" If this is a file, please add the 'File:' prefix.")
- else:
- print('\n', end='')
- sys.exit(1)
-
- filename = unquote(filename) # remove URL encoding for special characters
-
- dest = args.output or filename
-
- if args.verbose >= 2:
- print('User agent: {}'.format(USER_AGENT))
-
- # connect to site and identify ourselves
- try:
- site = Site(site_name, clients_useragent=USER_AGENT)
- except ConnectionError:
- # usually this means there is no such site,
- # or there's no network connection
- print("Error: couldn't connect to specified site.")
- sys.exit(1)
- except InvalidResponse as e:
- # site exists, but we couldn't communicate with the API endpoint
- print(e)
- sys.exit(1)
-
- # get info about the target file
- file = site.images[filename]
-
- if file.imageinfo != {}:
- # file exists either locally or at Wikimedia Commons
- file_url = file.imageinfo['url']
- file_size = file.imageinfo['size']
- file_sha1 = file.imageinfo['sha1']
-
- if args.verbose >= 1:
- print("Info: downloading '{}' "
- '({} bytes) from {}'.format(filename, file_size, site.host),
- end='')
- if args.output:
- print(" to '{}'".format(dest))
- else:
- print('\n', end='')
- print('Info: {}'.format(file_url))
-
- if os.path.isfile(dest) and not args.force:
- print("File '{}' already exists, skipping download "
- '(use -f to ignore)'.format(dest))
- else:
- try:
- fd = open(dest, 'wb')
- except IOError as e:
- print('File could not be written. '
- 'The following error was encountered:')
- print(e)
- sys.exit(1)
- else:
- # download the file
- with tqdm(total=file_size, unit='B',
- unit_scale=True, unit_divisor=1024) as progress_bar:
- with fd:
- res = site.connection.get(file_url, stream=True)
- progress_bar.set_postfix(file=dest, refresh=False)
- for chunk in res.iter_content(1024):
- fd.write(chunk)
- progress_bar.update(len(chunk))
-
- # verify file integrity and optionally print details
- dl_sha1 = verify_hash(dest)
-
- if args.verbose >= 1:
- print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
- print('Info: server file SHA1 is {}'.format(file_sha1))
- if dl_sha1 == file_sha1:
- if args.verbose >= 1:
- print('Info: hashes match!')
- # at this point, we've successfully downloaded the file
- else:
- print('Error: hash mismatch! Downloaded file may be corrupt.')
- sys.exit(1)
-
- else:
- # no file information returned
- print("Target '{}' does not appear to be a valid file."
- .format(filename))
- sys.exit(1)
-
-
-def valid_file(search_string):
- """
- Determines if the given string contains a valid file name, defined as a
- string ending with a '.' and at least one character, beginning with 'File:'
- or 'Image:', the standard file prefixes in MediaWiki.
- :param search_string: string to validate
- :returns: a regex Match object if there's a match or None otherwise
- """
- # second group could also restrict to file extensions with three or more
- # letters with ([^/\r\n\t\f\v]+\.\w{3,})
- file_regex = re.compile(r'(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$', re.I)
- return file_regex.search(search_string)
-
-
-def valid_site(search_string):
- """
- Determines if the given string contains a valid site name, defined as a
- string ending with 'wikipedia.org' or 'wikimedia.org'. This covers all
- subdomains of those domains. Eventually, it should be possible to support
- any MediaWiki site, regardless of domain name.
- :param search_string: string to validate
- :returns: a regex Match object if there's a match or None otherwise
- """
- site_regex = re.compile(r'wiki[mp]edia\.org$', re.I)
- return site_regex.search(search_string)
-
-
-def verify_hash(filename):
- """
- Calculates the SHA1 hash of the given file for comparison with a known
- value.
- :param filename: name of the file to calculate a hash for
- :return: hash digest
- """
- hasher = hashlib.sha1()
- with open(filename, 'rb') as dl:
- buf = dl.read(BLOCKSIZE)
- while len(buf) > 0:
- hasher.update(buf)
- buf = dl.read(BLOCKSIZE)
- return hasher.hexdigest()
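
Taken together, the refactor leaves wikiget.py holding only the argument
parsing and dispatch, with the reusable pieces importable on their own.
Roughly:

    # Post-refactor module layout (import sketch).
    from wikiget import BLOCKSIZE, DEFAULT_SITE   # shared constants
    from wikiget.dl import download               # download and hash verification
    from wikiget.validations import valid_file, valid_site, verify_hash
    from wikiget.version import __version__       # single source for the version
    from wikiget.wikiget import main              # CLI entry point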