Skip to content

Commit bd8075b

Browse files
authored
Merge pull request #83 from lahwaacz/check-pkg-urls
Add check-pkg-urls: script for checking the url field in Arch packages
2 parents 1195588 + 353db58 commit bd8075b

File tree

2 files changed

+258
-2
lines changed

2 files changed

+258
-2
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ PYTHON_SCRIPTS = \
1515
package/staging2testing \
1616
security/security-tracker-check \
1717
package/cleanup-list \
18-
package/srcinfo-pkg-graph
18+
package/srcinfo-pkg-graph \
19+
package/check-pkg-urls
1920

2021
SCRIPTS = \
2122
$(BASH_SCRIPTS) $(PYTHON_SCRIPTS)
@@ -51,4 +52,4 @@ check-bash: $(BASH_SCRIPTS)
5152
shellcheck $^
5253

5354
check-python: $(PYTHON_SCRIPTS)
54-
flake8 --ignore E123,E126,E128,E305,E501 $^
55+
flake8 --ignore W503,E123,E126,E128,E305,E501 $^

package/check-pkg-urls

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
#! /usr/bin/env python3
2+
3+
# SPDX-License-Identifier: GPL-2.0
4+
5+
"""
6+
The script checks if the URL field in all Arch Linux packages is valid, i.e.
7+
it leads to an existing website.
8+
9+
First a list of all packages in the core, extra, and multilib repositories is
10+
obtained using `pyalpm`. Then all URLs are checked using `httpx` and the status
11+
is saved. There may be various errors, such as:
12+
13+
- domain resolution error
14+
- SSL error (may be false-positive due to Python SSL package)
15+
- connection timeout or general connection error
16+
- HTTP status code (4xx, 5xx)
17+
18+
Some cases are treated as indeterminate and not reported as errors. On the
19+
other hand, some of the reported errors may be false-positive even in cases
20+
that *should* indicate an error, e.g. some infamous web servers return 403 or
21+
404 status codes with valid content that is rendered or redirected elsewhere
22+
using JavaScript.
23+
24+
Finally, a Markdown-formatted report is printed when all URLs are checked. Note
25+
that running the script may take a very long time (up to 2 hours for ~15k
26+
packages).
27+
28+
Dependencies:
29+
30+
- pyalpm
31+
- python-httpx
32+
- python-tqdm
33+
"""
34+
35+
import datetime
36+
import logging
37+
import tempfile
38+
import ssl
39+
from dataclasses import dataclass
40+
from functools import lru_cache
41+
from pathlib import Path
42+
43+
import httpx
44+
import pycman
45+
import pyalpm
46+
import tqdm
47+
import tqdm.contrib.logging
48+
49+
# Module-level logger for this script.
logger = logging.getLogger(__name__)

# Template for a minimal, self-contained pacman.conf.  The {pacdbpath} and
# {arch} placeholders are filled in via str.format() before the file is
# written.  The system GPG dir and mirrorlist are reused so the private
# database needs no extra setup.
PACCONF = """
[options]
RootDir = /
DBPath = {pacdbpath}
CacheDir = {pacdbpath}
LogFile = {pacdbpath}
# Use system GPGDir so that we don't have to populate it
GPGDir = /etc/pacman.d/gnupg/
Architecture = {arch}

[core]
Include = /etc/pacman.d/mirrorlist

[extra]
Include = /etc/pacman.d/mirrorlist

[multilib]
Include = /etc/pacman.d/mirrorlist
"""
70+
71+
72+
def pacdb_init(config: str, dbpath: Path, arch: str):
    """Set up a private pacman database directory and return an alpm handle.

    The *config* template is rendered with the database path and target
    architecture and written to ``pacman.conf`` inside *dbpath* — but only
    if that file does not exist yet, so an existing setup is reused.
    """
    conf_file = dbpath / "pacman.conf"
    dbpath.mkdir(exist_ok=True)
    if not conf_file.is_file():
        rendered = config.format(pacdbpath=dbpath, arch=arch)
        conf_file.write_text(rendered)
    return pycman.config.init_with_config(conf_file)
79+
80+
81+
def pacdb_refresh(pacdb, force=False):
    """Refresh all sync databases, the equivalent of ``pacman -Sy``.

    Any ``pyalpm.error`` is logged (with traceback) and re-raised so the
    caller can abort.
    """
    try:
        logger.info("Syncing pacman database...")
        # since this is private pacman database, there is no locking
        syncdbs = pacdb.get_syncdbs()
        for syncdb in syncdbs:
            syncdb.update(force)
    except pyalpm.error:
        logger.exception("Failed to sync pacman database.")
        raise
91+
92+
93+
def all_pkgs(pacdb):
    """Yield every package from every configured sync database."""
    for syncdb in pacdb.get_syncdbs():
        yield from syncdb.pkgcache
98+
99+
100+
# --- shared httpx client configuration ----------------------------------

# Connection-pool limits for the single shared client.
limits = httpx.Limits(
    max_connections=100,
    max_keepalive_connections=None,  # always allow keep-alive
    keepalive_expiry=60,
)

# 15 s for connect/read/write; disable the timeout for waiting on a free
# connection from the pool (checks may queue for a long time).
timeout = httpx.Timeout(15, pool=None)

headers = {
    # fake user agent to bypass servers responding differently or not at all to non-browser user agents
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:128.0) Gecko/20100101 Firefox/128.0",
}

# create an SSL context allowing only TLS1.2 and newer (if supported by the used openssl version)
ssl_context = httpx.create_ssl_context(ssl.PROTOCOL_TLS)
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2

# initialize the HTTPX client with a retrying transport
transport = httpx.HTTPTransport(retries=3)
client = httpx.Client(
    transport=transport,
    verify=ssl_context,
    headers=headers,
    timeout=timeout,
    limits=limits,
)
127+
128+
129+
# NOTE: this is basically copy-pasted from https://github.com/lahwaacz/wiki-scripts/blob/master/ws/checkers/ExtlinkStatusChecker.py
@lru_cache(maxsize=1024)
def check_url_sync(url: httpx.URL | str, *, follow_redirects=True):
    """Check whether *url* leads to a working website.

    Returns True for a 2xx response, False for a definite failure (SSL
    error, unresolvable domain, redirect loop, non-2xx status code) and
    None when the result is indeterminate (e.g. a generic connection
    error that may be transient).
    """
    if not isinstance(url, httpx.URL):
        url = httpx.URL(url)

    try:
        # We need to use GET requests instead of HEAD, because many servers just return 404
        # (or do not reply at all) to HEAD requests. Instead, we skip the downloading of the
        # response body content by using ``stream`` interface.
        with client.stream("GET", url, follow_redirects=follow_redirects) as response:
            # nothing to do here, but using the context manager ensures that the response is
            # always properly closed
            pass
    # FIXME: workaround for https://github.com/encode/httpx/discussions/2682#discussioncomment-5746317
    except (httpx.ConnectError, ssl.SSLError) as e:
        if isinstance(e, ssl.SSLError) or str(e).startswith("[SSL:"):
            if "unable to get local issuer certificate" in str(e):
                # FIXME: this is a problem of the SSL library used by Python
                logger.warning(
                    f"possible SSL error (unable to get local issuer certificate) for URL {url}"
                )
                return
            else:
                logger.error(f"SSL error ({e}) for URL {url}")
                return False
        if (
            "no address associated with hostname" in str(e).lower()
            or "name or service not known" in str(e).lower()
        ):
            logger.error(f"domain name could not be resolved for URL {url}")
            return False
        # other connection error - indeterminate
        logger.warning(f"connection error for URL {url}")
        return
    except httpx.TooManyRedirects as e:
        logger.error(f"TooManyRedirects error ({e}) for URL {url}")
        return False
    # it seems that httpx does not capture all exceptions, e.g. anyio.EndOfStream
    # except httpx.RequestError as e:
    except Exception as e:
        # e.g. ReadTimeout has no message in the async version,
        # see https://github.com/encode/httpx/discussions/2681
        msg = str(e)
        if not msg:
            # use the type's name, not repr(type) like "<class 'httpx.ReadTimeout'>"
            msg = type(e).__name__
        # base class exception - indeterminate
        logger.error(f"URL {url} could not be checked due to {msg}")
        return

    logger.debug(f"status code {response.status_code} for URL {url}")
    return 200 <= response.status_code < 300
181+
182+
183+
@dataclass
184+
class PackageUrlCheck:
185+
pkgname: str
186+
url: str
187+
result: bool | None = None
188+
timestamp: datetime.datetime | None = None
189+
190+
191+
def check_package_url(pkg_check: PackageUrlCheck, progress: tqdm.tqdm | None = None):
    """Run the URL check for one package, updating *pkg_check* in place.

    If *progress* is given, the progress bar is advanced by one step.
    """
    logger.info(f"Checking URL {pkg_check.url} ({pkg_check.pkgname})")

    pkg_check.result = check_url_sync(pkg_check.url)
    pkg_check.timestamp = datetime.datetime.now(datetime.UTC)

    if progress is None:
        return
    progress.update(1)
199+
200+
201+
def check(pkg_checks: "list[PackageUrlCheck]"):
    """Run the URL check for every entry in *pkg_checks* (mutated in place).

    Shows a tqdm progress bar and redirects logging through it so log
    lines do not garble the bar output.
    """
    # initialize tqdm progressbar
    with tqdm.tqdm(total=len(pkg_checks)) as progress:
        # redirect logging to tqdm
        with tqdm.contrib.logging.logging_redirect_tqdm():
            # sort by URL to optimize for lru_cache
            for pkg_check in sorted(pkg_checks, key=lambda x: x.url):
                check_package_url(pkg_check, progress)
209+
210+
211+
def print_report(pkg_checks: "list[PackageUrlCheck]"):
    """Print a Markdown-formatted report of the finished URL checks.

    Only entries with a non-None timestamp (i.e. actually checked) are
    listed.  Broken URLs (result False) and inconclusive checks (result
    None) get separate sections, each item linking to the package's
    GitLab repository.
    """
    report = "# Package URL check report\n\n"

    report += "## Packages with broken url (result=False)\n\n"
    for pkg_check in pkg_checks:
        if pkg_check.result is False and pkg_check.timestamp is not None:
            report += f"- [ ] [{pkg_check.pkgname}](https://gitlab.archlinux.org/archlinux/packaging/packages/{pkg_check.pkgname}): {pkg_check.url}\n"

    # blank line so the next heading is not glued to the list above
    report += "\n"
    report += "## Packages with inconclusive check (result=None)\n\n"
    for pkg_check in pkg_checks:
        if pkg_check.result is None and pkg_check.timestamp is not None:
            report += f"- [ ] [{pkg_check.pkgname}](https://gitlab.archlinux.org/archlinux/packaging/packages/{pkg_check.pkgname}): {pkg_check.url}\n"

    print(report)
225+
226+
227+
def main(tmpdir: Path):
    """Sync the package databases into *tmpdir*, check all URLs, report.

    A KeyboardInterrupt aborts the checking loop, but the report for the
    already-checked packages is still printed.
    """
    pacdb = pacdb_init(PACCONF, tmpdir, arch="x86_64")
    pacdb_refresh(pacdb)

    # one check record per package across all sync databases
    pkg_checks = [
        PackageUrlCheck(pkgname=p.name, url=p.url) for p in all_pkgs(pacdb)
    ]

    try:
        check(pkg_checks)
    except KeyboardInterrupt:
        # allow aborting a (very long) run early
        pass
    finally:
        print_report(pkg_checks)
242+
243+
244+
if __name__ == "__main__":
    # plain "LEVEL    message" log lines on the root logger
    log_handler = logging.StreamHandler()
    log_handler.setFormatter(logging.Formatter("{levelname:8} {message}", style="{"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)
    root_logger.setLevel(logging.INFO)

    # quieten the chatty HTTP libraries
    logging.getLogger("httpcore").setLevel(logging.INFO)
    logging.getLogger("httpx").setLevel(logging.WARN)

    # keep the private pacman database in a throw-away directory
    with tempfile.TemporaryDirectory() as tmpdir:
        main(Path(tmpdir))

0 commit comments

Comments
 (0)