|
#! /usr/bin/env python3

# SPDX-License-Identifier: GPL-2.0

"""
The script checks if the URL field of all Arch Linux packages is valid, i.e.
that it leads to an existing website.

First, a list of all packages in the core, extra, and multilib repositories is
obtained using `pyalpm`. Then all URLs are checked using `httpx` and the status
is saved. There may be various errors, such as:

- domain resolution error
- SSL error (may be a false positive due to the SSL library used by Python)
- connection timeout or general connection error
- HTTP status code (4xx, 5xx)

Some cases are treated as indeterminate and not reported as errors. On the
other hand, some of the reported errors may be false positives even in cases
that *should* indicate an error, e.g. some infamous web servers return 403 or
404 status codes with valid content that is rendered or redirected elsewhere
using JavaScript.

Finally, a Markdown-formatted report is printed when all URLs are checked. Note
that running the script may take a very long time (up to 2 hours for ~15k
packages).

Dependencies:

- pyalpm
- python-httpx
- python-tqdm
"""
| 34 | + |
# standard library
import datetime
import logging
import tempfile
import ssl
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

# third-party (see the module docstring for the corresponding Arch packages)
import httpx
import pycman
import pyalpm
import tqdm
import tqdm.contrib.logging
| 48 | + |
logger = logging.getLogger(__name__)

# pacman.conf template for a private, throw-away pacman database; the
# {pacdbpath} and {arch} placeholders are filled in by pacdb_init via str.format
PACCONF = """
[options]
RootDir = /
DBPath = {pacdbpath}
CacheDir = {pacdbpath}
LogFile = {pacdbpath}
# Use system GPGDir so that we don't have to populate it
GPGDir = /etc/pacman.d/gnupg/
Architecture = {arch}

[core]
Include = /etc/pacman.d/mirrorlist

[extra]
Include = /etc/pacman.d/mirrorlist

[multilib]
Include = /etc/pacman.d/mirrorlist
"""
| 70 | + |
| 71 | + |
def pacdb_init(config: str, dbpath: Path, arch: str):
    """Set up a private pacman database directory and return an alpm handle.

    Writes a pacman.conf rendered from *config* into *dbpath* (unless one
    already exists) and initializes pyalpm with it.
    """
    dbpath.mkdir(exist_ok=True)
    conf_file = dbpath / "pacman.conf"
    if not conf_file.is_file():
        rendered = config.format(pacdbpath=dbpath, arch=arch)
        conf_file.write_text(rendered)
    return pycman.config.init_with_config(conf_file)
| 79 | + |
| 80 | + |
def pacdb_refresh(pacdb, force=False):
    """Refresh all sync databases (equivalent to ``pacman -Sy``)."""
    try:
        logger.info("Syncing pacman database...")
        # the pacman database is private to this script, so no locking is needed
        for syncdb in pacdb.get_syncdbs():
            syncdb.update(force)
    except pyalpm.error:
        logger.exception("Failed to sync pacman database.")
        raise
| 91 | + |
| 92 | + |
def all_pkgs(pacdb):
    """Generate all packages in all sync databases.

    :param pacdb: an initialized alpm handle (as returned by ``pacdb_init``)
    :yields: every package object from every configured sync database
    """
    for db in pacdb.get_syncdbs():
        # delegate to the package cache directly instead of an inner loop
        yield from db.pkgcache
| 98 | + |
| 99 | + |
# httpx client parameters
limits = httpx.Limits(
    max_connections=100,
    max_keepalive_connections=None,  # always allow keep-alive
    keepalive_expiry=60,
)
timeout = httpx.Timeout(
    15, pool=None
)  # disable timeout for waiting for a connection from the pool
headers = {
    # fake user agent to bypass servers responding differently or not at all to non-browser user agents
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:128.0) Gecko/20100101 Firefox/128.0",
}

# create an SSL context allowing only TLS1.2 and newer (if supported by the used openssl version)
# NOTE(review): ssl.PROTOCOL_TLS is deprecated since Python 3.10 in favour of
# PROTOCOL_TLS_CLIENT — consider migrating; behaviour intentionally kept as-is here.
ssl_context = httpx.create_ssl_context(ssl.PROTOCOL_TLS)
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2

# initialize the HTTPX client
# NOTE: retries here apply to connection establishment only, not to failed responses
transport = httpx.HTTPTransport(retries=3)
client = httpx.Client(
    transport=transport,
    verify=ssl_context,
    headers=headers,
    timeout=timeout,
    limits=limits,
)
| 127 | + |
| 128 | + |
# NOTE: this is basically copy-pasted from https://github.com/lahwaacz/wiki-scripts/blob/master/ws/checkers/ExtlinkStatusChecker.py
@lru_cache(maxsize=1024)
def check_url_sync(url: httpx.URL | str, *, follow_redirects=True):
    """Check whether *url* leads to an existing website.

    :param url: the URL to check (plain strings are converted to ``httpx.URL``)
    :param follow_redirects: whether HTTP redirects are followed
    :returns: ``True`` if the final response status is 2xx, ``False`` on a
        definite error (SSL failure, unresolvable domain, too many redirects,
        non-2xx status), or ``None`` when the result is indeterminate
        (possible false-positive SSL error, generic connection error, or an
        unexpected exception).
    """
    if not isinstance(url, httpx.URL):
        url = httpx.URL(url)

    try:
        # We need to use GET requests instead of HEAD, because many servers just return 404
        # (or do not reply at all) to HEAD requests. Instead, we skip the downloading of the
        # response body content by using ``stream`` interface.
        with client.stream("GET", url, follow_redirects=follow_redirects) as response:
            # nothing to do here, but using the context manager ensures that the response is
            # always properly closed
            pass
    # FIXME: workaround for https://github.com/encode/httpx/discussions/2682#discussioncomment-5746317
    except (httpx.ConnectError, ssl.SSLError) as e:
        if isinstance(e, ssl.SSLError) or str(e).startswith("[SSL:"):
            if "unable to get local issuer certificate" in str(e):
                # FIXME: this is a problem of the SSL library used by Python
                logger.warning(
                    f"possible SSL error (unable to get local issuer certificate) for URL {url}"
                )
                return None
            else:
                logger.error(f"SSL error ({e}) for URL {url}")
                return False
        if (
            "no address associated with hostname" in str(e).lower()
            or "name or service not known" in str(e).lower()
        ):
            logger.error(f"domain name could not be resolved for URL {url}")
            return False
        # other connection error - indeterminate
        logger.warning(f"connection error for URL {url}")
        return None
    except httpx.TooManyRedirects as e:
        logger.error(f"TooManyRedirects error ({e}) for URL {url}")
        return False
    # it seems that httpx does not capture all exceptions, e.g. anyio.EndOfStream
    # except httpx.RequestError as e:
    except Exception as e:
        # e.g. ReadTimeout has no message in the async version,
        # see https://github.com/encode/httpx/discussions/2681
        msg = str(e)
        if not msg:
            # use the exception class *name*; the original logged the class
            # object itself (e.g. "<class 'httpx.ReadTimeout'>")
            msg = type(e).__name__
        # base class exception - indeterminate
        logger.error(f"URL {url} could not be checked due to {msg}")
        return None

    logger.debug(f"status code {response.status_code} for URL {url}")
    return 200 <= response.status_code < 300
| 181 | + |
| 182 | + |
@dataclass
class PackageUrlCheck:
    """Mutable record tracking the check state of one package's URL field."""

    # name of the Arch Linux package
    pkgname: str
    # the URL taken from the package's URL field
    url: str
    # True = reachable (2xx), False = broken, None = indeterminate or not checked yet
    result: bool | None = None
    # when the check was performed (UTC); None until the URL has been checked
    timestamp: datetime.datetime | None = None
| 189 | + |
| 190 | + |
def check_package_url(pkg_check: PackageUrlCheck, progress: tqdm.tqdm | None = None):
    """Check one package URL, record the result and advance the progress bar."""
    logger.info(f"Checking URL {pkg_check.url} ({pkg_check.pkgname})")

    outcome = check_url_sync(pkg_check.url)
    pkg_check.result = outcome
    pkg_check.timestamp = datetime.datetime.now(datetime.UTC)

    if progress is not None:
        progress.update(1)
| 199 | + |
| 200 | + |
def check(pkg_checks: "list[PackageUrlCheck]"):
    """Check the URL of every package in *pkg_checks*, updating each in place.

    A tqdm progress bar is shown and logging is redirected through tqdm so
    that log messages do not break the bar rendering.
    """
    # initialize the tqdm progressbar and redirect logging to tqdm
    with (
        tqdm.tqdm(total=len(pkg_checks)) as progress,
        tqdm.contrib.logging.logging_redirect_tqdm(),
    ):
        # sort by URL to optimize for lru_cache
        for pkg_check in sorted(pkg_checks, key=lambda x: x.url):
            check_package_url(pkg_check, progress)
| 209 | + |
| 210 | + |
def print_report(pkg_checks: "list[PackageUrlCheck]"):
    """Print a Markdown-formatted report of all finished checks.

    Only entries with a non-None timestamp (i.e. those actually checked) are
    reported; they are split into definitely broken URLs (result is False)
    and inconclusive checks (result is None).
    """

    def _section(title: str, result) -> str:
        # one "## ..." heading followed by a task-list item per matching package
        text = f"## {title}\n\n"
        for pkg_check in pkg_checks:
            if pkg_check.result is result and pkg_check.timestamp is not None:
                text += f"- [ ] [{pkg_check.pkgname}](https://gitlab.archlinux.org/archlinux/packaging/packages/{pkg_check.pkgname}): {pkg_check.url}\n"
        return text

    report = "# Package URL check report\n\n"
    report += _section("Packages with broken url (result=False)", False)
    # blank line so the previous Markdown list is terminated before the next heading
    report += "\n"
    report += _section("Packages with inconclusive check (result=None)", None)

    print(report)
| 225 | + |
| 226 | + |
def main(tmpdir: Path):
    """Entry point: sync the package database, check all URLs, print a report."""
    pacdb = pacdb_init(PACCONF, tmpdir, arch="x86_64")
    pacdb_refresh(pacdb)

    # build one check record per package in all sync databases
    pkg_checks = [
        PackageUrlCheck(pkgname=p.name, url=p.url)
        for p in all_pkgs(pacdb)
    ]

    try:
        check(pkg_checks)
    except KeyboardInterrupt:
        # allow interrupting a long run and still get a partial report below
        pass
    finally:
        print_report(pkg_checks)
| 242 | + |
| 243 | + |
if __name__ == "__main__":
    # configure a simple "LEVEL    message" format on the root logger
    log_handler = logging.StreamHandler()
    log_handler.setFormatter(logging.Formatter("{levelname:8} {message}", style="{"))
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)
    root_logger.setLevel(logging.INFO)

    # tone down the chatty HTTP libraries
    logging.getLogger("httpcore").setLevel(logging.INFO)
    logging.getLogger("httpx").setLevel(logging.WARNING)

    # keep the private pacman database in a throw-away directory
    with tempfile.TemporaryDirectory() as tmpdir:
        main(Path(tmpdir))