eellak
diff --git a/.gitignore b/.gitignore
Lines changed: 4 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 18 additions & 23 deletions
diff --git a/dependency_setup/deepseek_gpu_smoke.py b/dependency_setup/deepseek_gpu_smoke.py
Lines changed: 15 additions & 18 deletions
diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml
Lines changed: 28 additions & 0 deletions
@@ -58,10 +58,13 @@ htmlcov/
 # OCR test outputs
 test_ocr_*_output/
 *_demo_output/
+artifacts/
 
 # OCR model weights (if downloaded locally)
 nanonets/
 ocr_models/
+deepseek-ocr-2-model/
+models/
 
 # Noise analysis reports
 glossapi_noise_analysis_report.md
@@ -78,4 +81,5 @@ dependency_setup/.venvs/
 deepseek-ocr/DeepSeek-OCR-empty/
 # Local DeepSeek checkout and repro scripts (keep out of master)
 deepseek-ocr/
+deepseek-ocr-2/
 repro_rapidocr_onnx/
@@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss.
 
 ## Why GlossAPI
 - Handles download → extraction → cleaning → sectioning in one pipeline.
-- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR.
+- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation.
 - Rust-powered cleaner/noise metrics keep Markdown quality predictable.
 - Greek-first metadata and section classification tuned for academic corpora.
 - Modular Corpus API lets you resume from any stage or plug into existing flows.
@@ -40,45 +40,40 @@ PY
 
 ## Automated Environment Profiles
 
-Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes:
+Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime:
 
 ```bash
-# Vanilla pipeline (no GPU OCR extras)
-./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests
+# Docling / main GlossAPI environment
+./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests
 
-# Docling + RapidOCR mode
-./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests
-
-# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR)
-./dependency_setup/setup_glossapi.sh \
-  --mode deepseek \
+# DeepSeek OCR runtime (uv-managed)
+./dependency_setup/setup_deepseek_uv.sh \
   --venv dependency_setup/.venvs/deepseek \
-  --weights-dir /path/to/deepseek-ocr \
+  --model-root /path/to/deepseek-ocr-2-model \
+  --download-model \
   --run-tests --smoke-test
 ```
 
-Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately.
+`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`.
 
 **DeepSeek runtime checklist**
-- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub.
-- Export these to force the real CLI and avoid silent stub output:
+- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR.
+- Export these to force the real runtime and avoid silent stub output:
   - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`
   - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`
-  - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py`
-  - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python`
-  - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR`
-  - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib`
-- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`.
-- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`.
-- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`).
-- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`.
+  - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python`
+  - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`
+  - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2`
+- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`.
+- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise.
 
 ## Choose Your Install Path
 
 | Scenario | Commands | Notes |
 | --- | --- | --- |
 | Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. |
-| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. |
+| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. |
+| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. |
 | Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. |
 | Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. |
 
 
@@ -3,9 +3,9 @@
 Minimal DeepSeek OCR integration smoke test.
 
 This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and
-verifies that real Markdown output is produced. It requires the DeepSeek-OCR
-weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to
-the repository root (override via ``DEEPSEEK_MODEL_DIR``).
+verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2
+weights to be available under ``deepseek-ocr-2-model/DeepSeek-OCR-2`` relative to the
+repository root (override via ``DEEPSEEK_MODEL_DIR`` or ``GLOSSAPI_DEEPSEEK_MODEL_DIR``).
 """
 from __future__ import annotations
 
@@ -20,15 +20,16 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
 SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs"
-DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." / "deepseek-ocr").resolve()
+DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve()
 
 
 def ensure_model_available(model_root: Path) -> None:
-    expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors"
+    direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2")
+    expected = direct_root / "model-00001-of-000001.safetensors"
     if not expected.exists() or expected.stat().st_size < 1_000_000:
         raise FileNotFoundError(
-            f"Expected DeepSeek-OCR weights at {expected}. "
-            "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) "
+            f"Expected DeepSeek-OCR-2 weights at {expected}. "
+            "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) "
             "or set DEEPSEEK_MODEL_DIR to the directory that contains them."
         )
 
@@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None:
     from glossapi import Corpus
 
     ensure_model_available(model_root)
-    sample_pdf = SAMPLES_DIR / "sample01_plain.pdf"
+    model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2")
+    sample_pdf = SAMPLES_DIR / "alpha.pdf"
     if not sample_pdf.exists():
         raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}")
 
@@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None:
         parquet_path = dl_dir / "download_results.parquet"
         df.to_parquet(parquet_path, index=False)
 
+        os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1")
         os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0")
         os.environ.setdefault(
-            "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT",
-            str(model_root / "run_pdf_ocr_vllm.py"),
+            "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT",
+            str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"),
         )
         os.environ.setdefault(
             "GLOSSAPI_DEEPSEEK_PYTHON",
             sys.executable,
         )
-        ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str(
-            model_root / "libjpeg-turbo" / "lib"
-        )
-        os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra
-        os.environ["LD_LIBRARY_PATH"] = (
-            f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":")
-        )
+        os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir))
 
         corpus = Corpus(input_dir=input_dir, output_dir=output_dir)
         corpus.ocr(
@@ -100,7 +97,7 @@ def run_smoke(model_root: Path) -> None:
 
 
 def main() -> None:
-    model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR")
+    model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR")
     if model_dir_env:
         model_root = Path(model_dir_env).expanduser().resolve()
     else:
 
@@ -0,0 +1,28 @@
+[project]
+name = "glossapi-deepseek-runtime"
+version = "0.1.0"
+description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution"
+requires-python = ">=3.11,<3.13"
+dependencies = [
+    "glossapi[docling,deepseek]",
+    "torch==2.6.0",
+    "torchvision==0.21.0",
+    "torchaudio==2.6.0",
+]
+
+[dependency-groups]
+test = [
+    "pytest",
+    "fpdf2",
+]
+
+[tool.uv.sources]
+glossapi = { path = "../..", editable = true }
+torch = { index = "pytorch-cu118" }
+torchvision = { index = "pytorch-cu118" }
+torchaudio = { index = "pytorch-cu118" }
+
+[[tool.uv.index]]
+name = "pytorch-cu118"
+url = "https://download.pytorch.org/whl/cu118"
+explicit = true