|
| 1 | +"""Policy routing for downloader selection.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import re |
| 6 | +from dataclasses import dataclass |
| 7 | +from pathlib import Path |
| 8 | +from typing import Any, Dict, Iterable, Optional |
| 9 | +from urllib.parse import urlparse |
| 10 | + |
| 11 | +import yaml |
| 12 | + |
# Downloader route names that are accepted once a value has been normalized.
VALID_DOWNLOADERS = {"auto", "browser", "standard"}

# Keys that are copied out of a policy block as per-route options; any other
# key in the block is ignored when options are extracted.
ROUTE_OPTION_KEYS = {
    "browser_engine",
    "browser_headless",
    "browser_post_load_wait_ms",
    "browser_session_ttl_seconds",
    "browser_timeout_ms",
}
| 21 | + |
| 22 | + |
def _normalize_downloader(value: Any, default: str = "standard") -> str:
    """Return the canonical downloader name for *value*.

    A falsy *value* is replaced by *default*. The result is stringified,
    stripped and lower-cased, then legacy aliases are folded into their
    canonical names.

    Raises:
        ValueError: if the normalized name is not in ``VALID_DOWNLOADERS``.
    """
    # None of the canonical names is itself an alias key, so a single lookup
    # is enough to resolve every alias.
    aliases = {
        "default": "standard",
        "http": "standard",
        "browser_fallback": "auto",
        "browser_protected": "browser",
    }
    candidate = value if value else default
    key = str(candidate).strip().lower()
    canonical = aliases.get(key, key)
    if canonical in VALID_DOWNLOADERS:
        return canonical
    raise ValueError(f"Unsupported downloader route: {value}")
| 34 | + |
| 35 | + |
@dataclass(frozen=True)
class DownloadPolicyMatch:
    """URL matcher combining an optional domain list with an optional regex.

    A URL matches when it satisfies every criterion that is configured. A
    matcher with neither domains nor a regex therefore matches every URL.
    """

    # Domain names compared against the URL's hostname, either exactly or as
    # a parent domain of the hostname.
    domains: tuple[str, ...] = ()
    # Compiled pattern searched (not anchored) against the full URL string.
    url_regex: Optional[re.Pattern[str]] = None

    def matches(self, url: str) -> bool:
        """Return True when *url* satisfies all configured criteria.

        Args:
            url: The URL to test.

        Returns:
            False if domains are configured and the hostname is neither one of
            them nor a subdomain of one, or if a regex is configured and it
            does not occur in *url*; True otherwise.
        """
        parsed = urlparse(url)
        # "example.com." (fully-qualified form with the root dot) names the
        # same host as "example.com"; drop the dot so such URLs cannot slip
        # past a domain rule.
        hostname = (parsed.hostname or "").lower().rstrip(".")
        if self.domains:
            matched_domain = False
            for configured in self.domains:
                # Normalize the configured side the same way; fall back to the
                # raw value if it consists only of dots.
                domain = configured.rstrip(".") or configured
                if hostname == domain or hostname.endswith(f".{domain}"):
                    matched_domain = True
                    break
            if not matched_domain:
                return False
        if self.url_regex and not self.url_regex.search(url):
            return False
        return True
| 54 | + |
| 55 | + |
@dataclass(frozen=True)
class DownloadPolicyRule:
    """A single routing rule: a matcher plus the downloader and options it selects."""

    # Decides which URLs this rule applies to.
    matcher: DownloadPolicyMatch
    # Downloader name chosen for matching URLs.
    downloader: str
    # Route options attached to this rule.
    options: Dict[str, Any]

    def matches(self, url: str) -> bool:
        """Report whether this rule's matcher accepts *url*."""
        accepts = self.matcher.matches
        return accepts(url)
| 64 | + |
| 65 | + |
@dataclass(frozen=True)
class DownloadPolicy:
    """Ordered routing rules with a fallback downloader and fallback options."""

    # Downloader returned when no rule matches.
    default_downloader: str = "standard"
    # Options returned when no rule matches; None is treated as empty.
    default_options: Dict[str, Any] | None = None
    # Rules consulted in order; the first one that matches wins.
    rules: tuple[DownloadPolicyRule, ...] = ()

    def resolve(self, url: str) -> tuple[str, Dict[str, Any]]:
        """Pick the downloader and options for *url*.

        Returns:
            ``(downloader, options)`` from the first matching rule, or the
            policy defaults when no rule matches. The options dict is always
            a fresh shallow copy.
        """
        first_hit = next((rule for rule in self.rules if rule.matches(url)), None)
        if first_hit is None:
            fallback = self.default_options if self.default_options else {}
            return self.default_downloader, dict(fallback)
        return first_hit.downloader, dict(first_hit.options)
| 77 | + |
| 78 | + |
def _extract_route_options(data: Dict[str, Any]) -> Dict[str, Any]:
    """Return the entries of *data* whose keys are recognized route options.

    Keys not listed in ``ROUTE_OPTION_KEYS`` are dropped; the order of the
    remaining entries follows *data*.
    """
    selected: Dict[str, Any] = {}
    for name, setting in data.items():
        if name not in ROUTE_OPTION_KEYS:
            continue
        selected[name] = setting
    return selected
| 81 | + |
| 82 | + |
def _build_matcher(raw: Dict[str, Any]) -> DownloadPolicyMatch:
    """Build a DownloadPolicyMatch from a rule's raw ``match`` mapping.

    Args:
        raw: Mapping that may contain ``domains`` (a list of domain names, or
            a single domain given as a plain string) and ``url_regex`` (a
            pattern string).

    Returns:
        A matcher whose domains are stripped, lower-cased and free of blank
        entries, and whose regex is compiled when one was supplied.

    Raises:
        re.error: if ``url_regex`` is not a valid regular expression.
    """
    raw_domains = raw.get("domains") or []
    if isinstance(raw_domains, str):
        # A scalar such as `domains: example.com` would otherwise be iterated
        # character by character, producing a rule that never matches.
        raw_domains = [raw_domains]
    cleaned = (str(item).strip().lower() for item in raw_domains)
    domains = tuple(domain for domain in cleaned if domain)

    url_regex = raw.get("url_regex")
    compiled = re.compile(str(url_regex)) if url_regex else None
    return DownloadPolicyMatch(domains=domains, url_regex=compiled)
| 88 | + |
| 89 | + |
def build_download_policy(data: Dict[str, Any]) -> DownloadPolicy:
    """Construct a DownloadPolicy from an already-parsed policy mapping.

    Args:
        data: Mapping with an optional ``default`` block (downloader plus
            route options) and an optional ``rules`` list. Each rule may carry
            a ``match`` block, a ``downloader`` and route options.

    Returns:
        The policy, with rules kept in the order they appear in *data*. A rule
        without a downloader inherits the default block's downloader.

    Raises:
        ValueError: if any downloader name is not supported.
    """
    defaults = dict(data.get("default") or {})
    fallback_downloader = _normalize_downloader(defaults.get("downloader"), default="standard")

    def _to_rule(entry: Any) -> DownloadPolicyRule:
        # Copy the entry so a missing/None rule is handled as an empty mapping.
        spec = dict(entry or {})
        return DownloadPolicyRule(
            matcher=_build_matcher(dict(spec.get("match") or {})),
            downloader=_normalize_downloader(spec.get("downloader"), default=fallback_downloader),
            options=_extract_route_options(spec),
        )

    return DownloadPolicy(
        default_downloader=fallback_downloader,
        default_options=_extract_route_options(defaults),
        rules=tuple(_to_rule(entry) for entry in data.get("rules") or []),
    )
| 108 | + |
| 109 | + |
def load_download_policy(path: str | Path) -> DownloadPolicy:
    """Load a download policy from a YAML file.

    Args:
        path: Location of the policy file; ``~`` is expanded and the path is
            resolved before reading.

    Returns:
        The policy built from the file's top-level mapping. An empty document
        yields the default policy.

    Raises:
        ValueError: if the document's top level is anything other than a
            mapping (or empty), or if it names an unsupported downloader.
        OSError: if the file cannot be read.
    """
    policy_path = Path(path).expanduser().resolve()
    payload = yaml.safe_load(policy_path.read_text(encoding="utf-8"))
    if payload is None:
        # Only a genuinely empty document falls back to an empty policy.
        # Falsy non-mappings such as [] or 0 must still reach the type check
        # below instead of being silently accepted.
        payload = {}
    if not isinstance(payload, dict):
        raise ValueError("Download policy file must define a mapping at the top level")
    return build_download_policy(payload)
| 116 | + |
| 117 | + |
# Names exported by `from <module> import *`.
__all__ = [
    "DownloadPolicy", "DownloadPolicyMatch", "DownloadPolicyRule",
    "VALID_DOWNLOADERS", "build_download_policy", "load_download_policy",
]
0 commit comments