1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
# wikiget - CLI tool for downloading files from Wikimedia sites
# Copyright (C) 2023 Cody Logan
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Wikiget is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Wikiget is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
import fileinput
import logging
from argparse import Namespace
from typing import Dict
from urllib.parse import unquote, urlparse
import wikiget
from wikiget.exceptions import ParseError
from wikiget.file import File
from wikiget.validations import valid_file
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
def get_dest(dl: str, args: Namespace) -> File:
    """Resolve a download target into a File object.

    Parses *dl* either as a full URL (filename taken from the path, site from
    the host) or as a bare filename (site taken from ``args.site``), validates
    that it names an actual file page (File:/Image: prefix plus an extension),
    strips URL percent-encoding, and pairs the result with the output
    destination from ``args.output``.

    :param dl: download target, either a URL or a filename
    :param args: parsed command-line arguments; reads ``site`` and ``output``
    :returns: a File describing the name, destination, and site
    :raises ParseError: if *dl* cannot be parsed as a file page
    """
    url = urlparse(dl)

    if url.netloc:
        # dl is a URL: filename comes from the path, site from the host
        filename = url.path
        site_name = url.netloc
        if args.site is not wikiget.DEFAULT_SITE:
            # this will work even if the user specifies 'commons.wikimedia.org'
            # since we're comparing objects instead of values (is not vs. !=)
            logger.warning("Target is a URL, ignoring site specified with --site")
    else:
        # dl is a bare filename; the site comes from --site (or its default)
        filename = dl
        site_name = args.site

    file_match = valid_file(filename)

    # check if this is a valid file
    if file_match and file_match.group(1):
        # has File:/Image: prefix and extension
        filename = file_match.group(2)
    else:
        # no file extension and/or prefix, probably an article
        # BUG FIX: the message previously contained the literal text
        # '(unknown)' (an f-string with no placeholder); report the actual
        # input that failed to parse
        msg = f"Could not parse input '{dl}' as a file"
        raise ParseError(msg)

    filename = unquote(filename)  # remove URL encoding for special characters
    dest = args.output or filename

    return File(filename, dest, site_name)
def read_batch_file(batch_file: str) -> Dict[int, str]:
    """Read a batch file and collect its download targets.

    Accepts a path or ``"-"`` for stdin. Blank lines and lines beginning
    with ``#`` are skipped; every other line is recorded, stripped of
    surrounding whitespace, keyed by its 1-based line number.

    :param batch_file: path to the batch file, or "-" for stdin
    :returns: mapping of line number to download target
    """
    if batch_file == "-":
        logger.info("Using stdin for batch download")
    else:
        logger.info(f"Using file '{batch_file}' for batch download")

    downloads: Dict[int, str] = {}
    with fileinput.input(batch_file) as fd:
        # process each line as it is read, remembering where it came from
        for number, raw_line in enumerate(fd, start=1):
            entry = raw_line.strip()
            # skip blanks and comment lines ("#"-prefixed)
            if not entry or entry.startswith("#"):
                continue
            downloads[number] = entry
    return downloads
|