aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCody Logan <clpo13@gmail.com>2018-12-13 16:55:07 -0800
committerCody Logan <clpo13@gmail.com>2018-12-13 16:55:07 -0800
commitf98a299a0905eac3dfb455dc6079652544e0510e (patch)
treed0e6ed7ae6403ea8dff58532907e95170051bd1a
parent0a37c4b3a11b6ecb20c1deff788fa011d6ffe3d0 (diff)
downloadwikiget-f98a299a0905eac3dfb455dc6079652544e0510e.tar.gz
wikiget-f98a299a0905eac3dfb455dc6079652544e0510e.zip
Initial program
-rw-r--r--.gitignore5
-rw-r--r--README.md72
-rw-r--r--setup.py51
-rw-r--r--wikiget/__init__.py0
-rw-r--r--wikiget/version.py9
-rw-r--r--wikiget/wikiget.py148
6 files changed, 283 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index 894a44c..3d6c39b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,8 @@ venv.bak/
# mypy
.mypy_cache/
+
+# IDE files
+.vs/
+.vscode/
+.idea/
diff --git a/README.md b/README.md
index 5dacc36..e388ad5 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,70 @@
-# python-wikiget
-Download files from MediaWiki sites
+# wikiget
+
+Something like a wget clone for downloading files from MediaWiki sites (like
+Wikipedia or Wikimedia Commons) using only the file name or the URL of its
+description page. Requires Python 2 or 3. Install with `pip install --user -U wikiget`.
+
+## Usage
+
+`wikiget [-h] [-V] [-q | -v] [-f] [-s SITE] [-o OUTPUT] FILE`
+
+If `FILE` is in the form `File:Example.jpg` or `Image:Example.jpg` (the prefix is required), it will be fetched
+from the default site, which is "en.wikipedia.org". If it's the fully-qualified
+URL of a file description page, like `https://commons.wikimedia.org/wiki/File:Example.jpg`,
+the file is fetched from the specified site, in this case "commons.wikimedia.org".
+
+The site can also be specified with the `-s` or `--site` flag, though this will not have
+any effect if the full URL is given.
+
+More detailed information, such as the site used and full URL of the file, can be
+displayed with `-v` or `--verbose`. Use `-vv` to display even more detail. `-q` can
+be used to silence warnings.
+
+By default, the program won't overwrite existing files with the same name as the
+target, but this can be forced with `-f` or `--force`. Additionally, the file can
+be downloaded to a different name with `-o`.
+
+## Future plans
+
+- download from any MediaWiki-powered site, not just Wikimedia projects
+- download Wikipedia articles, in plain text, wikitext, or other formats
+
+## Contributing
+
+It's recommended that you use a virtual environment manager (like virtualenv) to
+install dependencies:
+
+```bash
+pip install --user -U virtualenv
+git clone https://github.com/clpo13/python-wikiget.git
+cd python-wikiget
+virtualenv venv
+```
+
+To activate the virtual environment, use one of the following commands:
+
+```bash
+source venv/bin/activate # Linux and macOS (bash, zsh)
+.\venv\Scripts\activate.bat # Windows command prompt
+.\venv\Scripts\Activate.ps1 # Windows PowerShell
+```
+
+Then run `pip install -e .` to invoke an editable install, meaning any changes
+made to the source will be reflected immediately.
+
+## License
+
+Copyright (C) 2018 Cody Logan
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..784bd16
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+"""wikiget2
+Simple wget clone for downloading files from Wikimedia sites.
+Copyright (C) 2018 Cody Logan; licensed GPLv3+
+SPDX-License-Identifier: GPL-3.0-or-later
+"""
+
+from setuptools import setup, find_packages
+from os import path
+from io import open
+
+here = path.abspath(path.dirname(__file__))
+with open(path.join(here, "README.md"), "r") as fr:
+ long_description = fr.read()
+
+version = {}
+with open("wikiget/version.py") as fv:
+ exec(fv.read(), version)
+
+setup(
+ name="wikiget",
+ version=version["__version__"],
+ author="Cody Logan",
+ author_email="clpo13@gmail.com",
+ description="Tool for downloading files from MediaWiki sites",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ url="https://github.com/clpo13/python-wikiget",
+ keywords="mediawiki wikimedia wikipedia",
+ packages=find_packages(),
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "Environment :: Console",
+ "Intended Audience :: End Users/Desktop",
+ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 2",
+ "Programming Language :: Python :: 3",
+ "Topic :: Utilities",
+ ],
+ install_requires=["future", "mwclient", "requests", "tqdm"],
+ project_urls={
+ "Bug Reports": "https://github.com/clpo13/python-wikiget/issues",
+ },
+ entry_points={
+ "console_scripts": [
+ 'wikiget=wikiget.wikiget:main',
+ ],
+ },
+)
diff --git a/wikiget/__init__.py b/wikiget/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/wikiget/__init__.py
diff --git a/wikiget/version.py b/wikiget/version.py
new file mode 100644
index 0000000..4607256
--- /dev/null
+++ b/wikiget/version.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+"""wikiget2
+Simple wget clone for downloading files from Wikimedia sites.
+Copyright (C) 2018 Cody Logan; licensed GPLv3+
+SPDX-License-Identifier: GPL-3.0-or-later
+"""
+
+# Package version string. setup.py exec()s this file to read it, and
+# wikiget/wikiget.py imports it for --version output and the user agent.
+__version__ = "0.1.0"
diff --git a/wikiget/wikiget.py b/wikiget/wikiget.py
new file mode 100644
index 0000000..fa49f2b
--- /dev/null
+++ b/wikiget/wikiget.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+"""wikiget2
+Simple wget clone for downloading files from Wikimedia sites.
+Copyright (C) 2018 Cody Logan; licensed GPLv3+
+SPDX-License-Identifier: GPL-3.0-or-later
+"""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+from builtins import open
+from future import standard_library
+standard_library.install_aliases()
+
+import argparse
+import logging
+import os
+import re
+import sys
+from urllib.parse import urlparse
+
+from mwclient import InvalidResponse, Site, __ver__ as mwclient_version
+from requests import ConnectionError
+from tqdm import tqdm
+
+from wikiget.version import __version__
+
+
+def main():
+ default_site = "en.wikipedia.org"
+ site_regex = re.compile(r"wiki[mp]edia\.org$", re.I)
+ file_regex = re.compile(r"([Ff]ile:|[Ii]mage:)([^/\s]+\.\w+)$")
+ user_agent = "wikiget/{} (https://github.com/clpo13/wikiget) " \
+ "mwclient/{}".format(__version__, mwclient_version)
+
+ parser = argparse.ArgumentParser(description="""
+ A tool for downloading files from MediaWiki sites
+ using the file name or description page URL
+ """,
+ epilog="""
+ Copyright (C) 2018 Cody Logan. License GPLv3+: GNU GPL version 3
+ or later <http://www.gnu.org/licenses/gpl.html>.
+ This is free software; you are free to change and redistribute it.
+ There is NO WARRANTY, to the extent permitted by law.
+ """)
+ parser.add_argument("FILE", help="""
+ name of the file to download with the File: or Image: prefix,
+ or the URL of its file description page
+ """)
+ parser.add_argument("-V", "--version", action="version", version="%(prog)s {}".format(__version__))
+ output_options = parser.add_mutually_exclusive_group()
+ output_options.add_argument("-q", "--quiet", help="suppress warning messages", action="store_true")
+ output_options.add_argument("-v", "--verbose",
+ help="print detailed information, use -vv for even more detail",
+ action="count", default=0)
+ parser.add_argument("-f", "--force", help="force overwriting existing files", action="store_true")
+ parser.add_argument("-s", "--site", default=default_site,
+ help="MediaWiki site to download from (default: %(default)s)")
+ parser.add_argument("-o", "--output", help="write download to OUTPUT")
+ args = parser.parse_args()
+
+ # print API and debug messages in verbose mode
+ if args.verbose >= 2:
+ logging.basicConfig(level=logging.DEBUG)
+ elif args.verbose >= 1:
+ logging.basicConfig(level=logging.WARNING)
+
+ url = urlparse(args.FILE)
+
+ if url.netloc:
+ filename = url.path
+ site_name = url.netloc
+ if args.site is not default_site and not args.quiet:
+ print("Warning: target is a URL, ignoring site specified with --site")
+ else:
+ filename = args.FILE
+ site_name = args.site
+
+ file_match = file_regex.search(filename)
+ site_match = site_regex.search(site_name)
+
+ # check for valid site parameter
+ if not site_match:
+ print("Only Wikimedia sites (wikipedia.org and wikimedia.org) are currently supported.")
+ sys.exit(1)
+
+ # check if this is a valid file
+ if file_match:
+ # get file name without File:/Image: prefix (second match group)
+ filename = file_match.group(2)
+ else:
+ # no file extension or prefix, probably an article
+ print("Downloading Wikipedia articles is not currently supported. "
+ "If this is a file, please add the 'File:' prefix.")
+ sys.exit(1)
+
+ dest = args.output or filename
+
+ if args.verbose >= 2:
+ print("User agent: {}".format(user_agent))
+
+ # connect to site and identify ourselves
+ try:
+ site = Site(site_name, clients_useragent=user_agent)
+ except ConnectionError:
+ # usually this means there is no such site, or there's no network connection
+ print("Error: couldn't connect to specified site.")
+ sys.exit(1)
+ except InvalidResponse as e:
+ # site exists, but we couldn't communicate with the API endpoint
+ print(e)
+ sys.exit(1)
+
+ # get info about the target file
+ file = site.images[filename]
+
+ if file.imageinfo != {}:
+ # file exists either locally or at Wikimedia Commons
+ file_url = file.imageinfo["url"]
+ file_size = file.imageinfo["size"]
+
+ if args.verbose >= 1:
+ print("Info: downloading '{}' ({} bytes) from {}".format(filename, file_size, site.host), end="")
+ if args.output:
+ print(" to '{}'".format(dest))
+ else:
+ print("\n", end="")
+ print("Info: {}".format(file_url))
+
+ if os.path.isfile(dest) and not args.force:
+ print("File '{}' already exists, skipping download (use -f to ignore)".format(dest))
+ else:
+ try:
+ # download the file
+ with tqdm(total=file_size, unit="B", unit_scale=True, unit_divisor=1024) as progress_bar:
+ with open(dest, "wb") as fd:
+ res = site.connection.get(file_url, stream=True)
+ progress_bar.set_postfix(file=dest, refresh=False)
+ for chunk in res.iter_content(1024):
+ fd.write(chunk)
+ progress_bar.update(len(chunk))
+ except IOError as e:
+ print("File could not be written. The following error was encountered:")
+ print(e)
+ sys.exit(1)
+ else:
+ # no file information returned
+ print("Target does not appear to be a valid file.")
+ sys.exit(1)