src/wikiget/validations.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

# wikiget - CLI tool for downloading files from Wikimedia sites
# Copyright (C) 2018-2023 Cody Logan
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Wikiget is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Wikiget is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

import hashlib
import re
from pathlib import Path

from wikiget import BLOCKSIZE


def valid_file(search_string: str) -> re.Match | None:
    """Look for a MediaWiki file name at the end of the given string

    The string is searched, not matched from its start, so the file name may be
    preceded by other text such as a URL. To be recognized, the name must:

    * start with the prefix 'File:' or 'Image:' (compared case-insensitively),
    * continue with one or more characters that are not a slash, carriage return,
      newline, tab, form feed, or vertical tab,
    * and finish with a period followed by at least one word character, with
      nothing after it.

    Examples of accepted names are 'File:Example.jpg' and 'Image:Example.svg'.

    On success, group 1 of the returned match holds the prefix and group 2 holds
    the file name with its extension.

    :param search_string: string to validate
    :type search_string: str
    :returns: a regex Match object if a file name was found, None otherwise
    :rtype: re.Match | None
    """
    # The extension is deliberately permissive (one or more word characters).
    # To insist on extensions of three or more characters, the second group
    # could be tightened to ([^/\r\n\t\f\v]+\.\w{3,}).
    return re.search(
        r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$",
        search_string,
        flags=re.IGNORECASE,
    )


def valid_site(search_string: str) -> re.Match | None:
    """Check whether the given string ends with a Wikipedia or Wikimedia domain

    The string is accepted when it ends with 'wikipedia.org' or 'wikimedia.org',
    compared case-insensitively. Only the end of the string is inspected, so any
    subdomain of those two domains is accepted as well.

    Currently unused since any site is accepted as input, and we rely on the user to
    ensure the site has a compatible API.

    :param search_string: string to validate
    :type search_string: str
    :returns: a regex Match object if the string ends with a known domain, None
        otherwise
    :rtype: re.Match | None
    """
    # [mp] covers both wikimedia.org and wikipedia.org with a single pattern.
    return re.search(r"wiki[mp]edia\.org$", search_string, flags=re.IGNORECASE)


def verify_hash(filename: str) -> str:
    """Compute the SHA1 hex digest of a file so it can be checked against a known value.

    The file is opened in binary mode and fed to the hasher in pieces of at most
    BLOCKSIZE bytes, so the whole file is never held in memory at once.

    SHA1 is insecure, but it is used here because that's the hash the MediaWiki API
    returns for a file.

    :param filename: name of the file to calculate a hash for
    :type filename: str
    :return: hash digest as a hexadecimal string
    :rtype: str
    """
    sha1 = hashlib.sha1()  # noqa: S324
    with Path(filename).open("rb") as stream:
        # Two-argument iter() keeps calling read() until it returns the empty
        # bytes sentinel, which signals end of file.
        for chunk in iter(lambda: stream.read(BLOCKSIZE), b""):
            sha1.update(chunk)
    return sha1.hexdigest()