1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
# wikiget - CLI tool for downloading files from Wikimedia sites
# Copyright (C) 2018-2023 Cody Logan
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Wikiget is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Wikiget is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Wikiget. If not, see <https://www.gnu.org/licenses/>.
"""Validate file and site input and verify file hashes."""
from __future__ import annotations
import hashlib
import re
from typing import TYPE_CHECKING
from wikiget import BLOCKSIZE
if TYPE_CHECKING:
from pathlib import Path
def valid_file(search_string: str) -> re.Match | None:
    """Look for a MediaWiki file name at the end of the given string.

    The search is case-insensitive and succeeds when the string ends with a 'File:'
    or 'Image:' prefix (the standard file prefixes in MediaWiki) followed by a name
    that contains a period with at least one word character after it, such as
    'File:Example.jpg' or 'Image:Example.svg'. The name may not contain a slash,
    carriage return, newline, tab, form feed, or vertical tab. Because the string is
    searched rather than matched from its start, other text may precede the prefix.

    On success, group 1 of the match is the prefix and group 2 is the file name.

    Args:
        search_string (str): text to check for a file name

    Returns:
        re.Match: a regex Match object for the file name, or None if nothing matched
    """
    # To accept only extensions of three or more characters, the name group
    # could be tightened to ([^/\r\n\t\f\v]+\.\w{3,})
    pattern = r"(File:|Image:)([^/\r\n\t\f\v]+\.\w+)$"
    return re.search(pattern, search_string, flags=re.IGNORECASE)
def valid_site(search_string: str) -> re.Match | None:
    """Check whether the given string ends with a Wikimedia site name.

    The string is accepted when it ends with 'wikipedia.org' or 'wikimedia.org',
    compared case-insensitively, which covers every subdomain of those two domains.
    The check is a plain suffix test, so nothing is required of the text that comes
    before the domain.

    Currently unused since any site is accepted as input, and we rely on the user to
    ensure the site has a compatible API.

    Args:
        search_string (str): text to check for a site name

    Returns:
        re.Match: a regex Match object for the site name, or None if nothing matched
    """
    pattern = r"wiki[mp]edia\.org$"
    return re.search(pattern, search_string, flags=re.IGNORECASE)
def verify_hash(file: Path) -> str:
    """Compute the SHA1 digest of a file so it can be checked against a known hash.

    SHA1 is insecure, but it is the hash the MediaWiki API returns for a file, so it
    is the one computed here. The file is opened in binary mode and fed to the hasher
    in reads of at most BLOCKSIZE bytes instead of being loaded all at once.

    Args:
        file (pathlib.Path): file to calculate a hash for, as a Path object

    Returns:
        str: the digest as a hexadecimal string
    """
    digest = hashlib.sha1()  # noqa: S324
    with file.open("rb") as stream:
        # read() returns b"" at end of file, which is the sentinel that stops iter()
        for chunk in iter(lambda: stream.read(BLOCKSIZE), b""):
            digest.update(chunk)
    return digest.hexdigest()
|