
Commit aca4dbb

Add browser-gated download mode
1 parent 379b8f0 commit aca4dbb


13 files changed: +1618 additions, −220 deletions


README.md

Lines changed: 60 additions & 1 deletion
@@ -56,6 +56,65 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend
`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`.

If you want a guided install that asks which phases you plan to use, run:

```bash
python install_glossapi.py
```

That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them.

## Browser-Gated Download Mode

`Corpus.download(...)` now supports three high-level routes for file acquisition:

- `download_mode="standard"`: direct HTTP downloader only
- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial
- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints

`browser_mode=True` remains supported as a legacy alias for `download_mode="browser"`.
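The escalation decision behind `auto` mode can be pictured with a simplified heuristic. This is only an illustration of the idea, not the downloader's actual detector; the function name and the 2 KiB sniff window are invented for the sketch:

```python
def looks_like_browser_interstitial(content_type: str, body: bytes) -> bool:
    """Rough sketch: a file URL that answers with an HTML page instead of
    file bytes is a candidate for browser-assisted recovery."""
    if body.startswith(b"%PDF") or content_type.lower().startswith("application/pdf"):
        return False  # real file bytes arrived; no escalation needed
    head = body[:2048].lower()
    return b"<html" in head or b"<!doctype html" in head
```

In `auto` mode the direct HTTP result is checked first, and only responses that look like recoverable interstitials are retried through a browser session.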
### Policy-driven routing

If you know which domains require a browser bootstrap, route them with a policy file instead of probing every URL:

```yaml
default:
  downloader: standard

rules:
  - match:
      domains: [eur-lex.europa.eu]
    downloader: browser

  - match:
      url_regex: "https://example.org/protected/.*"
    downloader: auto
```

```python
from glossapi import Corpus

corpus = Corpus(input_dir="out", output_dir="out")
corpus.download(
    input_parquet="input_urls.parquet",
    download_policy_file="download_policy.yml",
)
```
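Rules can also carry per-route browser tuning. The option keys below are the ones accepted by the policy loader (`ROUTE_OPTION_KEYS` in `src/glossapi/download_policy.py`); the domain and values are illustrative:

```yaml
rules:
  - match:
      domains: [example.org]
    downloader: browser
    browser_headless: true
    browser_timeout_ms: 45000
    browser_post_load_wait_ms: 2000
```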
### Operational notes

- Browser mode is for browser-gated file endpoints, not viewer-only sources.
- Browser sessions are cached per domain, so a successful bootstrap can be reused across multiple files.
- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory.
- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files.

### Regression strategy

The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs.

For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite.
**DeepSeek runtime checklist**
- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR.
- Export these to force the real runtime and avoid silent stub output:
@@ -93,7 +152,7 @@ Use this as the shortest path from a documentation concept to the public call th
 | Stage | Main call | Important parameters | Writes |
 | --- | --- | --- | --- |
-| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` |
+| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"|"auto"|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` |
 | Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` |
 | Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, updated parquet metrics/flags |
 | OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, optional `json/<stem>.latex_map.jsonl` |

docs/api/corpus.md

Lines changed: 23 additions & 0 deletions
@@ -187,12 +187,35 @@ download(
- Important parameters:
  - `links_column`: override URL column name
  - `parallelize_by`: choose grouping for the scheduler
  - `download_mode`: one of `standard`, `auto`, or `browser`
  - `browser_mode=True`: legacy alias for `download_mode="browser"`
  - `download_policy_file`: route specific domains/URL patterns to `standard`, `auto`, or `browser`
  - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc.
- Main outputs:
  - downloaded files in `downloads/`
  - partial/final results in `download_results/`
  - returned `pd.DataFrame` with download status and metadata

Browser-capable download mode is intended for browser-gated file endpoints where a real file still exists behind session/bootstrap checks. It is not a general viewer extractor. Viewer-only sources should still fail cleanly, with a recorded error and no local file artifact.

Example:

```python
corpus.download(
    input_parquet="input_urls.parquet",
    download_mode="browser",
)
```

Policy-routed example:

```python
corpus.download(
    input_parquet="input_urls.parquet",
    download_policy_file="download_policy.yml",
)
```

## triage_math()

- Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs.

docs/stages/download.md

Lines changed: 25 additions & 0 deletions
@@ -8,6 +8,7 @@ The download stage acquires source documents from parquet-based URL metadata and
- read URL-bearing parquet input
- download files concurrently
- route known browser-gated sources through browser-assisted acquisition when configured
- retain source metadata context
- avoid refetching previously successful downloads
- assign stable-enough local filenames for downstream processing
@@ -42,10 +43,34 @@ Typical issues include:
- transient network failures
- rate limiting
- browser-gated file endpoints that return HTML challenge/interstitial pages
- viewer-only sources that should fail cleanly instead of being recorded as successful downloads
- duplicate URLs
- filename collisions
- partially completed corpus fetches

## Browser-gated sources

The downloader now distinguishes between:

- direct file endpoints
- browser-gated file endpoints
- viewer-only/document-reader sources

For browser-gated file endpoints:

- `download_mode="auto"` probes with direct HTTP and escalates to a browser session when it detects a recoverable interstitial
- `download_mode="browser"` goes directly to the browser-assisted path
- `download_policy_file=...` can route known domains or URL patterns to the correct path without probing every file

Browser-assisted mode is designed for retrievable file endpoints, not for sources that only expose page images, tiles, HTML/SVG re-rendering, or DRM-wrapped readers.
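The domain routing used by the policy matcher reduces to an exact-or-subdomain hostname check. A condensed, dependency-free sketch of that rule (mirroring the `DownloadPolicyMatch` logic added in this commit):

```python
from urllib.parse import urlparse


def domain_matches(url: str, domains: list[str]) -> bool:
    # A rule domain matches the URL's hostname exactly or as a parent
    # domain, so eur-lex.europa.eu also covers its subdomains.
    hostname = (urlparse(url).hostname or "").lower()
    return any(
        hostname == d or hostname.endswith(f".{d}")
        for d in domains
    )
```

Note that the suffix check requires a leading dot, so a hostname like `evil-eur-lex.europa.eu.example.com` does not match the `eur-lex.europa.eu` rule.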
## Session reuse

Browser-assisted mode reuses cached browser session state per domain, so multiple files from the same protected source do not need a fresh browser bootstrap every time.

This keeps the browser as a session-bootstrap resource rather than the main downloader.

## Contributor note

Any change to filename assignment or result parquet structure can have downstream impact on:

install_glossapi.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
from __future__ import annotations

import sys
from pathlib import Path


def _bootstrap_repo_src() -> None:
    repo_root = Path(__file__).resolve().parent
    src_dir = repo_root / "src"
    src_str = str(src_dir)
    if src_str not in sys.path:
        sys.path.insert(0, src_str)


def main() -> int:
    _bootstrap_repo_src()
    from glossapi.scripts.install_glossapi import main as _main

    return int(_main())


if __name__ == "__main__":
    raise SystemExit(main())
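The bootstrap pattern above (prepend `<repo>/src` to `sys.path` exactly once, then import from the package) can be exercised standalone. `demo_root` here is an arbitrary path used only for illustration:

```python
import sys
from pathlib import Path


def bootstrap_src(repo_root: Path) -> str:
    # Same idea as _bootstrap_repo_src: prepend <repo>/src to sys.path,
    # but never add a duplicate entry on repeated calls.
    src_str = str(repo_root / "src")
    if src_str not in sys.path:
        sys.path.insert(0, src_str)
    return src_str


demo_root = Path("/tmp/demo_repo")  # hypothetical repo root
entry = bootstrap_src(demo_root)
bootstrap_src(demo_root)  # second call is a no-op
```

Guarding the insert keeps repeated invocations (e.g. re-running the wizard in one interpreter) from growing `sys.path`.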

pyproject.toml

Lines changed: 4 additions & 0 deletions
@@ -37,6 +37,10 @@ classifiers = [
]

[project.optional-dependencies]
# Browser automation fallback for browser-gated file endpoints
browser = [
    "playwright>=1.52,<2",
]
# Docling extraction/layout stack
docling = [
    "docling==2.48.0",

src/glossapi/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,7 @@
    'Sampler',
    'Section',
    'GlossDownloader',
    'BrowserGlossDownloader',
]

def __getattr__(name: str):
@@ -31,6 +32,9 @@ def __getattr__(name: str):
    if name == 'GlossDownloader':
        from .gloss_downloader import GlossDownloader  # type: ignore
        return GlossDownloader
    if name == 'BrowserGlossDownloader':
        from .gloss_browser_downloader import BrowserGlossDownloader  # type: ignore
        return BrowserGlossDownloader
    raise AttributeError(name)

try:

src/glossapi/corpus/phase_download.py

Lines changed: 28 additions & 7 deletions
@@ -19,6 +19,7 @@
import pandas as pd

from .._naming import canonical_stem
from ..gloss_browser_downloader import BrowserGlossDownloader
from ..gloss_downloader import GlossDownloader
# Avoid importing section/classifier here; download phase does not use them.
from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path
@@ -212,6 +213,22 @@ def _looks_like_list(s: str) -> bool:
        # Initialize downloader configuration (kwargs take precedence)
        dl_cfg = dict(self.downloader_config)
        dl_cfg.update(kwargs)
        browser_mode = dl_cfg.pop('browser_mode', None)
        if browser_mode is not None and 'download_mode' not in dl_cfg:
            dl_cfg['download_mode'] = 'browser' if browser_mode else 'standard'
        download_mode = str(dl_cfg.pop('download_mode', 'standard')).strip().lower()
        policy_requested = bool(dl_cfg.get('download_policy_file') or dl_cfg.get('download_policy'))
        if download_mode in {'standard', 'default', 'http'} and not policy_requested:
            downloader_cls = GlossDownloader
            default_download_route = 'standard'
        elif download_mode in {'browser', 'browser_protected'} or policy_requested:
            downloader_cls = BrowserGlossDownloader
            default_download_route = 'browser' if download_mode in {'browser', 'browser_protected'} else 'standard'
        elif download_mode in {'auto', 'browser_fallback'}:
            downloader_cls = BrowserGlossDownloader
            default_download_route = 'auto'
        else:
            raise ValueError(f"Unsupported download_mode: {download_mode}")
        # Allow caller to override which column holds links
        if links_column:
            url_column = links_column
@@ -232,14 +249,18 @@ def _looks_like_list(s: str) -> bool:
-        downloader = GlossDownloader(
-            url_column=url_column,
-            output_dir=str(self.output_dir),
-            log_level=self.logger.level,
-            verbose=verbose if verbose is not None else self.verbose,
+        downloader_kwargs = {
+            "url_column": url_column,
+            "output_dir": str(self.output_dir),
+            "log_level": self.logger.level,
+            "verbose": verbose if verbose is not None else self.verbose,
             **{k: v for k, v in dl_cfg.items() if k not in {'input_parquet'}},
-            _used_filename_bases=used_bases
-        )
+            "_used_filename_bases": used_bases,
+        }
+        if downloader_cls is BrowserGlossDownloader:
+            downloader_kwargs["default_download_route"] = default_download_route
+
+        downloader = downloader_cls(**downloader_kwargs)

         # Download files
         self.logger.info(f"Downloading files from URLs in {input_parquet}...")
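The mode strings accepted by the routing in `phase_download.py` follow the same alias normalization as `_normalize_downloader` in `download_policy.py`. A standalone sketch of that mapping:

```python
def normalize_mode(value: str) -> str:
    # Collapse the accepted aliases onto the three canonical routes.
    mode = value.strip().lower()
    aliases = {
        "default": "standard",
        "http": "standard",
        "browser_fallback": "auto",
        "browser_protected": "browser",
    }
    mode = aliases.get(mode, mode)
    if mode not in {"standard", "auto", "browser"}:
        raise ValueError(f"Unsupported download_mode: {value}")
    return mode
```

Keeping one normalization for both the `download_mode` kwarg and policy-file routes means a policy can say `browser_protected` and the kwarg can say `browser` without diverging behavior.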

src/glossapi/download_policy.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
"""Policy routing for downloader selection."""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, Optional
from urllib.parse import urlparse

import yaml

VALID_DOWNLOADERS = {"standard", "browser", "auto"}
ROUTE_OPTION_KEYS = {
    "browser_timeout_ms",
    "browser_post_load_wait_ms",
    "browser_engine",
    "browser_headless",
    "browser_session_ttl_seconds",
}


def _normalize_downloader(value: Any, default: str = "standard") -> str:
    normalized = str(value or default).strip().lower()
    if normalized in {"default", "http"}:
        normalized = "standard"
    if normalized in {"browser_fallback"}:
        normalized = "auto"
    if normalized in {"browser_protected"}:
        normalized = "browser"
    if normalized not in VALID_DOWNLOADERS:
        raise ValueError(f"Unsupported downloader route: {value}")
    return normalized


@dataclass(frozen=True)
class DownloadPolicyMatch:
    domains: tuple[str, ...] = ()
    url_regex: Optional[re.Pattern[str]] = None

    def matches(self, url: str) -> bool:
        parsed = urlparse(url)
        hostname = (parsed.hostname or "").lower()
        if self.domains:
            matched_domain = any(
                hostname == domain or hostname.endswith(f".{domain}")
                for domain in self.domains
            )
            if not matched_domain:
                return False
        if self.url_regex and not self.url_regex.search(url):
            return False
        return True


@dataclass(frozen=True)
class DownloadPolicyRule:
    matcher: DownloadPolicyMatch
    downloader: str
    options: Dict[str, Any]

    def matches(self, url: str) -> bool:
        return self.matcher.matches(url)


@dataclass(frozen=True)
class DownloadPolicy:
    default_downloader: str = "standard"
    default_options: Dict[str, Any] | None = None
    rules: tuple[DownloadPolicyRule, ...] = ()

    def resolve(self, url: str) -> tuple[str, Dict[str, Any]]:
        for rule in self.rules:
            if rule.matches(url):
                return rule.downloader, dict(rule.options)
        return self.default_downloader, dict(self.default_options or {})


def _extract_route_options(data: Dict[str, Any]) -> Dict[str, Any]:
    return {key: value for key, value in data.items() if key in ROUTE_OPTION_KEYS}


def _build_matcher(raw: Dict[str, Any]) -> DownloadPolicyMatch:
    domains = tuple(str(item).strip().lower() for item in (raw.get("domains") or []) if str(item).strip())
    url_regex = raw.get("url_regex")
    compiled = re.compile(str(url_regex)) if url_regex else None
    return DownloadPolicyMatch(domains=domains, url_regex=compiled)


def build_download_policy(data: Dict[str, Any]) -> DownloadPolicy:
    default_block = dict(data.get("default") or {})
    default_downloader = _normalize_downloader(default_block.get("downloader"), default="standard")
    default_options = _extract_route_options(default_block)

    rules = []
    for raw_rule in data.get("rules") or []:
        raw_rule = dict(raw_rule or {})
        matcher = _build_matcher(dict(raw_rule.get("match") or {}))
        downloader = _normalize_downloader(raw_rule.get("downloader"), default=default_downloader)
        options = _extract_route_options(raw_rule)
        rules.append(DownloadPolicyRule(matcher=matcher, downloader=downloader, options=options))

    return DownloadPolicy(
        default_downloader=default_downloader,
        default_options=default_options,
        rules=tuple(rules),
    )


def load_download_policy(path: str | Path) -> DownloadPolicy:
    policy_path = Path(path).expanduser().resolve()
    payload = yaml.safe_load(policy_path.read_text(encoding="utf-8")) or {}
    if not isinstance(payload, dict):
        raise ValueError("Download policy file must define a mapping at the top level")
    return build_download_policy(payload)


__all__ = [
    "DownloadPolicy",
    "DownloadPolicyMatch",
    "DownloadPolicyRule",
    "VALID_DOWNLOADERS",
    "build_download_policy",
    "load_download_policy",
]
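The `resolve` semantics above (first matching rule wins, otherwise the default route) can be walked through with a condensed, dependency-free re-implementation; it is re-stated here only so the example runs without the package installed:

```python
import re
from urllib.parse import urlparse


def resolve_route(url: str, rules: list[dict], default: str = "standard") -> str:
    # First rule whose domain and/or regex constraints all pass wins;
    # otherwise fall back to the default downloader.
    hostname = (urlparse(url).hostname or "").lower()
    for rule in rules:
        domains = rule.get("domains", [])
        if domains and not any(hostname == d or hostname.endswith(f".{d}") for d in domains):
            continue
        pattern = rule.get("url_regex")
        if pattern and not re.search(pattern, url):
            continue
        return rule["downloader"]
    return default


rules = [
    {"domains": ["eur-lex.europa.eu"], "downloader": "browser"},
    {"url_regex": r"https://example\.org/protected/.*", "downloader": "auto"},
]
```

As in `DownloadPolicyMatch`, a rule with both `domains` and `url_regex` must satisfy both, and a rule with neither matches every URL.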
