Split main file into separate chunks

author: Cody Logan <clpo13@gmail.com> 2020-01-28 13:50:01 -0800
committer: Cody Logan <clpo13@gmail.com> 2020-01-28 13:50:01 -0800
commit: 21bd3641d69a8558fd75195a76ead8f07101768a (patch)
tree: 6ee9a9c4230f993c9db58212ff5dfdc5b73b8d35 /wikiget/dl.py
parent: 71f9b13025882512f150d9b167c4dc7f91ab3b5c (diff)
download: wikiget-21bd3641d69a8558fd75195a76ead8f07101768a.tar.gz
wikiget-21bd3641d69a8558fd75195a76ead8f07101768a.zip
1 files changed, 153 insertions, 0 deletions
diff --git a/wikiget/dl.py b/wikiget/dl.py
new file mode 100644
index 0000000..b074269
--- /dev/null
+++ b/wikiget/dl.py
@@ -0,0 +1,153 @@
+# wikiget - CLI tool for downloading files from Wikimedia sites
+# Copyright (C) 2018, 2019, 2020 Cody Logan
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Wikiget is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Wikiget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import sys
+from urllib.parse import unquote, urlparse
+
+from mwclient import InvalidResponse, Site, __version__ as mwclient_version
+from requests import ConnectionError
+from tqdm import tqdm
+
+from . import DEFAULT_SITE
+from .validations import valid_file, valid_site, verify_hash
+from .version import __version__
+
+# User agent string naming wikiget and mwclient together with their versions;
+# it is handed to mwclient when connecting so the tool identifies itself.
+USER_AGENT = (
+    'wikiget/{} (https://github.com/clpo13/wikiget) mwclient/{}'
+    .format(__version__, mwclient_version)
+)
+
+
+def download(dl, args):
+    """Download one file from a Wikimedia site and verify its SHA1 hash.
+
+    Parameters:
+        dl: the download target. If it parses as a URL with a network
+            location, the site is taken from the URL and the file name from
+            its path; otherwise it is used as the file name and the site
+            comes from ``args.site``.
+        args: parsed command-line options. The attributes read here are
+            ``site``, ``quiet``, ``output``, ``verbose`` and ``force``.
+
+    Returns:
+        None. If the destination already exists and ``args.force`` is not
+        set, the download is skipped and the function returns normally.
+
+    Exits the process with status 1 (``sys.exit(1)``) when the site is not
+    accepted by ``valid_site``, the target is not recognised as a file, the
+    site cannot be reached or returns an invalid API response, no file
+    information is returned, the file request comes back with an HTTP error
+    status, the destination cannot be opened for writing, or the downloaded
+    file's hash differs from the one reported by the server.
+    """
+    url = urlparse(dl)
+
+    if url.netloc:
+        filename = url.path
+        site_name = url.netloc
+        # NOTE(review): this compares by identity, not equality - presumably
+        # so that an explicitly given --site still triggers the warning even
+        # when its value equals the default; confirm against the CLI parser.
+        if args.site is not DEFAULT_SITE and not args.quiet:
+            # this will work even if the user specifies 'commons.wikimedia.org'
+            print('Warning: target is a URL, '
+                  'ignoring site specified with --site')
+    else:
+        filename = dl
+        site_name = args.site
+
+    file_match = valid_file(filename)
+    site_match = valid_site(site_name)
+
+    # check for valid site parameter
+    if not site_match:
+        print('Only Wikimedia sites (wikipedia.org and wikimedia.org) '
+              'are currently supported.')
+        sys.exit(1)
+
+    # check if this is a valid file
+    if file_match and file_match.group(1):
+        # has File:/Image: prefix and extension
+        filename = file_match.group(2)
+    else:
+        # no file extension and/or prefix, probably an article
+        print('Downloading Wikipedia articles is not currently supported.',
+              end='')
+        if file_match and not file_match.group(1):
+            # file extension detected, but no prefix
+            # TODO: no longer possible to get to this point since
+            # file_match is None with no prefix
+            print(" If this is a file, please add the 'File:' prefix.")
+        else:
+            print('\n', end='')
+        sys.exit(1)
+
+    filename = unquote(filename)  # remove URL encoding for special characters
+
+    dest = args.output or filename
+
+    if args.verbose >= 2:
+        print('User agent: {}'.format(USER_AGENT))
+
+    # connect to site and identify ourselves
+    try:
+        site = Site(site_name, clients_useragent=USER_AGENT)
+    except ConnectionError:
+        # usually this means there is no such site,
+        # or there's no network connection
+        print("Error: couldn't connect to specified site.")
+        sys.exit(1)
+    except InvalidResponse as e:
+        # site exists, but we couldn't communicate with the API endpoint
+        print(e)
+        sys.exit(1)
+
+    # get info about the target file
+    file = site.images[filename]
+
+    if file.imageinfo == {}:
+        # no file information returned
+        print("Target '{}' does not appear to be a valid file."
+              .format(filename))
+        sys.exit(1)
+
+    # file exists either locally or at Wikimedia Commons
+    file_url = file.imageinfo['url']
+    file_size = file.imageinfo['size']
+    file_sha1 = file.imageinfo['sha1']
+
+    if args.verbose >= 1:
+        print("Info: downloading '{}' "
+              '({} bytes) from {}'.format(filename, file_size, site.host),
+              end='')
+        if args.output:
+            print(" to '{}'".format(dest))
+        else:
+            print('\n', end='')
+        print('Info: {}'.format(file_url))
+
+    if os.path.isfile(dest) and not args.force:
+        print("File '{}' already exists, skipping download "
+              '(use -f to ignore)'.format(dest))
+        return
+
+    # Request the file before touching the destination, so a failed request
+    # can no longer truncate an existing file when args.force is set. Using
+    # the response as a context manager releases the streamed connection on
+    # every path out of the block, including the sys.exit() calls below.
+    with site.connection.get(file_url, stream=True) as res:
+        if not res.ok:
+            # an HTTP error body must not be written out as the file
+            print('Error: server returned HTTP status {} for {}'
+                  .format(res.status_code, file_url))
+            sys.exit(1)
+
+        try:
+            fd = open(dest, 'wb')
+        except IOError as e:
+            print('File could not be written. '
+                  'The following error was encountered:')
+            print(e)
+            sys.exit(1)
+
+        # download the file in 1 KiB chunks, advancing the progress bar by
+        # the number of bytes actually written
+        with tqdm(total=file_size, unit='B',
+                  unit_scale=True, unit_divisor=1024) as progress_bar, fd:
+            progress_bar.set_postfix(file=dest, refresh=False)
+            for chunk in res.iter_content(1024):
+                fd.write(chunk)
+                progress_bar.update(len(chunk))
+
+    # verify file integrity and optionally print details
+    dl_sha1 = verify_hash(dest)
+
+    if args.verbose >= 1:
+        print('Info: downloaded file SHA1 is {}'.format(dl_sha1))
+        print('Info: server file SHA1 is {}'.format(file_sha1))
+
+    if dl_sha1 != file_sha1:
+        print('Error: hash mismatch! Downloaded file may be corrupt.')
+        sys.exit(1)
+
+    # at this point, we've successfully downloaded the file
+    if args.verbose >= 1:
+        print('Info: hashes match!')
author	Cody Logan <clpo13@gmail.com>	2020-01-28 13:50:01 -0800
committer	Cody Logan <clpo13@gmail.com>	2020-01-28 13:50:01 -0800
commit	21bd3641d69a8558fd75195a76ead8f07101768a (patch)
tree	6ee9a9c4230f993c9db58212ff5dfdc5b73b8d35 /wikiget/dl.py
parent	71f9b13025882512f150d9b167c4dc7f91ab3b5c (diff)
download	wikiget-21bd3641d69a8558fd75195a76ead8f07101768a.tar.gz wikiget-21bd3641d69a8558fd75195a76ead8f07101768a.zip