about | summary | refs | log | tree | commit | diff
path: root/wikiget
diff options
context:
space:
mode:
Diffstat (limited to 'wikiget')
-rw-r--r-- wikiget/__init__.py 31
-rw-r--r-- wikiget/dl.py 174
-rw-r--r-- wikiget/validations.py 64
-rw-r--r-- wikiget/version.py 1
-rw-r--r-- wikiget/wikiget.py 131
5 files changed, 0 insertions, 401 deletions
diff --git a/wikiget/__init__.py b/wikiget/__init__.py
deleted file mode 100644
index b68b0ec..0000000
--- a/wikiget/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# wikiget - CLI tool for downloading files from Wikimedia sites
-# Copyright (C) 2018, 2019, 2020 Cody Logan and contributors
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# Wikiget is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Wikiget is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
-
-from mwclient import __version__ as mwclient_version
-
-from wikiget.version import __version__ as wikiget_version
-
-# Package-wide constants, read by the other wikiget modules.
-
-# Number of bytes read per call when hashing a file on disk.
-BLOCKSIZE = 65536
-# Number of bytes requested per chunk when streaming a download; also used
-# as the progress bar's unit divisor.
-CHUNKSIZE = 1024
-
-# Defaults for the --site and --path command line options.
-DEFAULT_SITE = "commons.wikimedia.org"
-DEFAULT_PATH = "/w/"
-
-# User agent string handed to mwclient when connecting to a site.
-USER_AGENT = (
-    f"wikiget/{wikiget_version} (https://github.com/clpo13/wikiget) "
-    f"mwclient/{mwclient_version}"
-)
-
-# Thresholds compared against the -v count: -v and -vv respectively.
-STD_VERBOSE = 1
-VERY_VERBOSE = 2
diff --git a/wikiget/dl.py b/wikiget/dl.py
deleted file mode 100644
index 949f09e..0000000
--- a/wikiget/dl.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# wikiget - CLI tool for downloading files from Wikimedia sites
-# Copyright (C) 2018-2021 Cody Logan and contributors
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# Wikiget is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Wikiget is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
-
-import os
-import sys
-from urllib.parse import unquote, urlparse
-
-from mwclient import APIError, InvalidResponse, LoginError, Site
-from requests import ConnectionError, HTTPError
-from tqdm import tqdm
-
-import wikiget
-from wikiget.validations import valid_file, verify_hash
-
-
-def download(dl, args):
-    """
-    Download one file from a MediaWiki site and compare its SHA1 hash with
-    the one reported by the site.
-
-    Status and error messages are printed to standard output. Each failure
-    handled here ends the process with exit status 1; an already existing
-    destination file is skipped (unless --force is set) without exiting.
-
-    :param dl: file name with a File:/Image: prefix, or the URL of the file's
-        description page
-    :param args: parsed command line arguments; the attributes read are site,
-        path, quiet, verbose, output, force, username and password
-    """
-    parsed = urlparse(dl)
-
-    if parsed.netloc:
-        # a URL names its own host, so that host wins over --site
-        filename, site_name = parsed.path, parsed.netloc
-        # this will work even if the user specifies 'commons.wikimedia.org'
-        # NOTE(review): the identity test presumably relies on argparse
-        # returning the very same default object when --site is absent;
-        # confirm before replacing it with an equality test
-        if not (args.site is wikiget.DEFAULT_SITE or args.quiet):
-            print("Warning: target is a URL, ignoring site specified with --site")
-    else:
-        filename, site_name = dl, args.site
-
-    # check if this is a valid file
-    match = valid_file(filename)
-    if not (match and match.group(1)):
-        # no file extension and/or prefix, probably an article
-        print(f"Could not parse input '{filename}' as a file. ")
-        sys.exit(1)
-
-    # has File:/Image: prefix and extension: keep the bare name and
-    # remove URL encoding for special characters
-    filename = unquote(match.group(2))
-
-    dest = args.output or filename
-    verbosity = args.verbose
-
-    if verbosity >= wikiget.VERY_VERBOSE:
-        print(f"User agent: {wikiget.USER_AGENT}")
-    if verbosity >= wikiget.STD_VERBOSE:
-        print(f"Site name: {site_name}")
-
-    # connect to site and identify ourselves
-    try:
-        site = Site(site_name, path=args.path, clients_useragent=wikiget.USER_AGENT)
-        if args.username and args.password:
-            site.login(args.username, args.password)
-    except ConnectionError as err:
-        # usually this means there is no such site, or there's no network
-        # connection, though it could be a certificate problem
-        print("Error: couldn't connect to specified site.")
-        if verbosity >= wikiget.VERY_VERBOSE:
-            print("Full error message:")
-            print(err)
-        sys.exit(1)
-    except HTTPError as err:
-        # most likely a 403 forbidden or 404 not found error for api.php
-        print(
-            "Error: couldn't find the specified wiki's api.php. "
-            "Check the value of --path."
-        )
-        if verbosity >= wikiget.VERY_VERBOSE:
-            print("Full error message:")
-            print(err)
-        sys.exit(1)
-    except (InvalidResponse, LoginError) as err:
-        # InvalidResponse: site exists, but we couldn't communicate with the
-        # API endpoint for some reason other than an HTTP error.
-        # LoginError: missing or invalid credentials
-        print(err)
-        sys.exit(1)
-
-    # get info about the target file
-    try:
-        image = site.images[filename]
-    except APIError as err:
-        # an API error at this point likely means access is denied,
-        # which could happen with a private wiki
-        print(
-            "Error: access denied. Try providing credentials with "
-            "--username and --password."
-        )
-        if verbosity >= wikiget.VERY_VERBOSE:
-            print("Full error message:")
-            for detail in err.args:
-                print(detail)
-        sys.exit(1)
-
-    info = image.imageinfo
-    if info == {}:
-        # no file information returned
-        print(f"Target '{filename}' does not appear to be a valid file.")
-        sys.exit(1)
-
-    # file exists either locally or at a common repository,
-    # like Wikimedia Commons
-    file_url = info["url"]
-    file_size = info["size"]
-    file_sha1 = info["sha1"]
-
-    if verbosity >= wikiget.STD_VERBOSE:
-        # the destination is only mentioned when -o/--output was given
-        suffix = f" to '{dest}'" if args.output else ""
-        print(
-            f"Info: downloading '{filename}' "
-            f"({file_size} bytes) from {site.host}"
-            f"{suffix}"
-        )
-        print(f"Info: {file_url}")
-
-    if os.path.isfile(dest) and not args.force:
-        print(f"File '{dest}' already exists, skipping download (use -f to ignore)")
-        return
-
-    try:
-        fd = open(dest, "wb")
-    except OSError as err:
-        print("File could not be written. The following error was encountered:")
-        print(err)
-        sys.exit(1)
-
-    # download the file(s); the finished bar stays on screen only in
-    # verbose mode
-    with tqdm(
-        leave=verbosity >= wikiget.STD_VERBOSE,
-        total=file_size,
-        unit="B",
-        unit_scale=True,
-        unit_divisor=wikiget.CHUNKSIZE,
-    ) as progress_bar, fd:
-        res = site.connection.get(file_url, stream=True)
-        progress_bar.set_postfix(file=dest, refresh=False)
-        for chunk in res.iter_content(wikiget.CHUNKSIZE):
-            fd.write(chunk)
-            progress_bar.update(len(chunk))
-
-    # verify file integrity and optionally print details
-    dl_sha1 = verify_hash(dest)
-
-    if verbosity >= wikiget.STD_VERBOSE:
-        print(f"Info: downloaded file SHA1 is {dl_sha1}")
-        print(f"Info: server file SHA1 is {file_sha1}")
-
-    if dl_sha1 != file_sha1:
-        print("Error: hash mismatch! Downloaded file may be corrupt.")
-        sys.exit(1)
-
-    # at this point, we've successfully downloaded the file
-    if verbosity >= wikiget.STD_VERBOSE:
-        print("Info: hashes match!")
diff --git a/wikiget/validations.py b/wikiget/validations.py
deleted file mode 100644
index dc70df4..0000000
--- a/wikiget/validations.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# wikiget - CLI tool for downloading files from Wikimedia sites
-# Copyright (C) 2018, 2019, 2020 Cody Logan
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# Wikiget is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Wikiget is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
-
-import hashlib
-import re
-
-from wikiget import BLOCKSIZE
-
-
-def valid_file(search_string):
-    """
-    Searches the given string for a MediaWiki file name: a 'File:' or
-    'Image:' prefix (matched case-insensitively) followed by a name that
-    runs to the end of the string and ends with a '.' and at least one word
-    character. The name may not contain a slash, newline, carriage return,
-    tab, form feed or vertical tab. The prefix does not have to be at the
-    start of the string, so description page URLs match as well.
-    :param search_string: string to validate
-    :returns: a regex Match object if there's a match or None otherwise;
-        group 1 is the prefix and group 2 is the file name
-    """
-    # the name group could also restrict file extensions to three or more
-    # letters with ([^/\r\n\t\f\v]+\.\w{3,})
-    return re.search(
-        r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$", search_string, flags=re.I
-    )
-
-
-def valid_site(search_string):
-    """
-    Searches the given string for a supported site name, meaning the string
-    ends with 'wikipedia.org' or 'wikimedia.org' (matched
-    case-insensitively). The pattern is anchored only at the end of the
-    string, so every subdomain of those domains matches, as does any other
-    string with that ending. Eventually, it should be possible to support
-    any MediaWiki site, regardless of domain name.
-    :param search_string: string to validate
-    :returns: a regex Match object if there's a match or None otherwise
-    """
-    return re.search(r"wiki[mp]edia\.org$", search_string, flags=re.I)
-
-
-def verify_hash(filename):
-    """
-    Computes the SHA1 hash of the given file so it can be compared with a
-    known value. The file is read in pieces of BLOCKSIZE bytes rather than
-    all at once.
-    :param filename: name of the file to calculate a hash for
-    :return: hash digest as a hexadecimal string
-    """
-    digest = hashlib.sha1()  # noqa: S324
-    with open(filename, "rb") as stream:
-        # two-argument iter() keeps calling read() until it returns b"",
-        # which is what a binary file yields at end of file
-        for block in iter(lambda: stream.read(BLOCKSIZE), b""):
-            digest.update(block)
-    return digest.hexdigest()
diff --git a/wikiget/version.py b/wikiget/version.py
deleted file mode 100644
index dd9b22c..0000000
--- a/wikiget/version.py
+++ /dev/null
@@ -1 +0,0 @@
-# Package version string; wikiget/__init__.py imports it as wikiget_version.
-__version__ = "0.5.1"
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
deleted file mode 100644
index ba36766..0000000
--- a/wikiget/wikiget.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# wikiget - CLI tool for downloading files from Wikimedia sites
-# Copyright (C) 2018-2021 Cody Logan and contributors
-# SPDX-License-Identifier: GPL-3.0-or-later
-#
-# Wikiget is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Wikiget is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
-
-import argparse
-import logging
-import sys
-
-import wikiget
-from wikiget.dl import download
-
-
-def main():
-    """
-    Main entry point for console script. Automatically compiled by setuptools
-    when installed with `pip install` or `python setup.py install`.
-
-    Parses the command line, configures logging according to the -v count,
-    and then downloads either the single FILE argument or, with --batch,
-    every non-blank line of the text file that FILE names. Exits with status
-    1 if the batch file cannot be opened.
-    """
-
-    parser = argparse.ArgumentParser(
-        description="""
-        A tool for downloading files from
-        MediaWiki sites using the file name or
-        description page URL
-        """,
-        epilog="""
-        Copyright (C) 2018-2023 Cody Logan
-        and contributors.
-        License GPLv3+: GNU GPL version 3 or later
-        <http://www.gnu.org/licenses/gpl.html>.
-        This is free software; you are free to
-        change and redistribute it under certain
-        conditions. There is NO WARRANTY, to the
-        extent permitted by law.
-        """,
-    )
-    parser.add_argument(
-        "FILE",
-        help="""
-        name of the file to download with the File:
-        prefix, or the URL of its file description page
-        """,
-    )
-    parser.add_argument(
-        "-V",
-        "--version",
-        action="version",
-        version=f"%(prog)s {wikiget.wikiget_version}",
-    )
-    # -q and -v contradict each other, so only one of them is accepted
-    message_options = parser.add_mutually_exclusive_group()
-    message_options.add_argument(
-        "-q", "--quiet", help="suppress warning messages", action="store_true"
-    )
-    message_options.add_argument(
-        "-v",
-        "--verbose",
-        help="print detailed information; use -vv for even more detail",
-        action="count",
-        default=0,
-    )
-    parser.add_argument(
-        "-f", "--force", help="force overwriting existing files", action="store_true"
-    )
-    parser.add_argument(
-        "-s",
-        "--site",
-        default=wikiget.DEFAULT_SITE,
-        help="MediaWiki site to download from (default: %(default)s)",
-    )
-    parser.add_argument(
-        "-p",
-        "--path",
-        default=wikiget.DEFAULT_PATH,
-        help="MediaWiki site path, where api.php is located (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--username", default="", help="MediaWiki site username, for private wikis"
-    )
-    parser.add_argument(
-        "--password", default="", help="MediaWiki site password, for private wikis"
-    )
-    # a single output name makes no sense for a batch, so -o and -a are
-    # mutually exclusive
-    output_options = parser.add_mutually_exclusive_group()
-    output_options.add_argument("-o", "--output", help="write download to OUTPUT")
-    output_options.add_argument(
-        "-a",
-        "--batch",
-        help="treat FILE as a textfile containing "
-        "multiple files to download, one URL or "
-        "filename per line",
-        action="store_true",
-    )
-
-    args = parser.parse_args()
-
-    # print API and debug messages in verbose mode
-    if args.verbose >= wikiget.VERY_VERBOSE:
-        logging.basicConfig(level=logging.DEBUG)
-    elif args.verbose >= wikiget.STD_VERBOSE:
-        logging.basicConfig(level=logging.WARNING)
-
-    if args.batch:
-        # batch download mode
-        input_file = args.FILE
-        if args.verbose >= wikiget.STD_VERBOSE:
-            print(f"Info: using batch file '{input_file}'")
-        try:
-            fd = open(input_file)
-        except OSError as e:
-            print("File could not be read. The following error was encountered:")
-            print(e)
-            sys.exit(1)
-        else:
-            with fd:
-                for line in fd:
-                    target = line.strip()
-                    # a blank line cannot be parsed as a file name; passing
-                    # it on would make download() exit and abandon the rest
-                    # of the batch, so skip it instead
-                    if not target:
-                        continue
-                    download(target, args)
-    else:
-        # single download mode
-        download(args.FILE, args)