lanl · mikegros · Feb 4, 2026 · Feb 1, 2026 · Feb 1, 2026 · Feb 1, 2026
diff --git a/examples/single_agent_examples/hypothesizer_agent/README.md b/examples/single_agent_examples/hypothesizer_agent/README.md
@@ -0,0 +1,38 @@
+For the Sci-Fi Bill of Rights demo, see sci_fi_bill_of_rights_inputs/README.txt.
+It requires downloading some public TXT files.
+
+If you want to use this on PDFs, particularly ones that need OCR, then you
+need to install some additional tools and Python libraries.  At this time
+they are not required dependencies of URSA, so you must install them yourself.
+
+On a Mac you need:
+
+```
+brew update
+brew install ocrmypdf tesseract
+# NOTE: Feb 1, 2026 - gettext did not install on my Mac, so I had to
+#       build it from source. This is a LENGTHY process, but it 100%
+#       works:
+#       brew install --build-from-source gettext
+#       Once gettext is installed, you can go back to
+#       brew install ocrmypdf
+pip install pypdf # you need this too in your Python env.
+```
+
+Once these are installed, you should see something like this, if OCR is needed:
+
+```
+[READING]: your_doc.pdf
+[OCR]: mode=skip (441 chars, 22 pages) -> your_doc.pdf.ocr.skip.pdf
+[OCR]: still low after skip-text; retrying with force-ocr -> your_doc.pdf.ocr.force.pdf
+```
+
+Note that the first `[OCR]` line will only show up if the PDF reading fails and there
+are no text layers discovered (this `skips` some complex / lengthy OCR techniques
+and tries a quick and dirty one.).
-Note that the first `[OCR]` line will only show up if the PDF reading fails and there
-are no text layers discovered (this `skips` some complex / lengthy OCR techniques
-and tries a quick and dirty one.).
+Note that the first `[OCR]` line will show up only if the PDF reading fails and
+no text layers are discovered (this `skips` some complex / lengthy OCR
+techniques and tries a quick and dirty one.).
-Note that the first `[OCR]` line will only show up if the PDF reading fails and there
-are no text layers discovered (this `skips` some complex / lengthy OCR techniques
-and tries a quick and dirty one.).
+Note that the first `[OCR]` line will show up only if the PDF reading fails and
+no text layers are discovered (this `skips` some complex / lengthy OCR
+techniques and tries a quick and dirty one.).
+
+Note that the second `[OCR]` line will show up only if the `skip` version
+still produced too little readable text; this retry is the `force` version.
+
+Once a doc has been OCRed (either version), the reader will pick up the OCRed
+copy automatically in the future (i.e., OCR runs only the first time it is needed).
diff --git a/src/ursa/tools/read_file_tool.py b/src/ursa/tools/read_file_tool.py
@@ -1,29 +1,164 @@
+import os
+import subprocess
+from pathlib import Path
+
 from langchain.tools import ToolRuntime
 from langchain_core.tools import tool
 
 from ursa.agents.base import AgentContext
 from ursa.util.parse import read_pdf_text, read_text_file
 
 
+def _pdf_page_count(path: str) -> int:
+    try:
+        from pypdf import PdfReader
+
+        return len(PdfReader(path).pages)
+    except Exception:
+        return 0
-def _pdf_page_count(path: str) -> int:
-    try:
-        from pypdf import PdfReader
-
-        return len(PdfReader(path).pages)
-    except Exception:
-        return 0
+def _pdf_page_count(path: str) -> int:
+    """Return the number of pages in the PDF at ``path``.
+
+    ``pypdf`` is imported inside the function, so it is only needed when
+    this helper is actually called.  Nothing is caught here: an import
+    failure or a PDF that cannot be read propagates to the caller.
+    """
+    from pypdf import PdfReader
+    return len(PdfReader(path).pages)
-def _pdf_page_count(path: str) -> int:
-    try:
-        from pypdf import PdfReader
-
-        return len(PdfReader(path).pages)
-    except Exception:
-        return 0
+def _pdf_page_count(path: str) -> int:
+    """Return the number of pages in the PDF at ``path``.
+
+    ``pypdf`` is imported inside the function, so it is only needed when
+    this helper is actually called.  Nothing is caught here: an import
+    failure or a PDF that cannot be read propagates to the caller.
+    """
+    from pypdf import PdfReader
+    return len(PdfReader(path).pages)
+
+
+def _ocr_to_searchable_pdf(
+    src_pdf: str, out_pdf: str, *, mode: str = "skip"
+) -> None:
+    # mode:
+    #  - "skip":  only OCR pages that look like they need it (your current behavior)
+    #  - "force": rasterize + OCR everything (fixes vector/outlined “no images” PDFs)
+    cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]
-
-
-def _ocr_to_searchable_pdf(
-    src_pdf: str, out_pdf: str, *, mode: str = "skip"
-) -> None:
-    # mode:
-    #  - "skip":  only OCR pages that look like they need it (your current behavior)
-    #  - "force": rasterize + OCR everything (fixes vector/outlined “no images” PDFs)
-    cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]
+def ocrmypdf_is_installed() -> bool:
+    """Return True if the ``ocrmypdf`` executable can be found on ``PATH``."""
+    # Imported locally: this module's top-level imports (os, subprocess,
+    # pathlib.Path) do not include shutil, so a bare reference would raise
+    # NameError the first time this function is called.
+    import shutil
+
+    return shutil.which("ocrmypdf") is not None
+
+def _ocr_to_searchable_pdf(
+    src_pdf: str, out_pdf: str, *, mode: str = "skip"
+) -> None:
+    # mode:
+    #  - "skip":  only OCR pages that look like they need it (your current behavior)
+    #  - "force": rasterize + OCR everything (fixes vector/outlined “no images” PDFs)
+    if not ocrmypdf_is_installed():
+        raise ImportError(
+            "ocrmypdf was not found in your path. "
+            "See installation instructions:"
+            "https://github.com/ocrmypdf/OCRmyPDF?tab=readme-ov-file#installation"
+        )
+        
+    cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]
-
-
-def _ocr_to_searchable_pdf(
-    src_pdf: str, out_pdf: str, *, mode: str = "skip"
-) -> None:
-    # mode:
-    #  - "skip":  only OCR pages that look like they need it (your current behavior)
-    #  - "force": rasterize + OCR everything (fixes vector/outlined “no images” PDFs)
-    cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]
+def ocrmypdf_is_installed() -> bool:
+    """Return True if the ``ocrmypdf`` executable can be found on ``PATH``."""
+    # Imported locally: this module's top-level imports (os, subprocess,
+    # pathlib.Path) do not include shutil, so a bare reference would raise
+    # NameError the first time this function is called.
+    import shutil
+
+    return shutil.which("ocrmypdf") is not None
+
+def _ocr_to_searchable_pdf(
+    src_pdf: str, out_pdf: str, *, mode: str = "skip"
+) -> None:
+    """Run ``ocrmypdf`` on ``src_pdf`` and write the result to ``out_pdf``.
+
+    Args:
+        src_pdf: Path of the PDF to OCR.
+        out_pdf: Path the OCRed PDF is written to.
+        mode: ``"force"`` passes ``--force-ocr`` (rasterize + OCR everything,
+            which fixes vector/outlined "no images" PDFs).  Any other value
+            passes ``--skip-text`` (only OCR pages that look like they need
+            it).
+
+    Environment variables (truthy values are ``1``, ``true``, ``yes``):
+        READ_FILE_OCR_SIDECAR: also pass ``--sidecar`` with the path
+            ``out_pdf + ".txt"``.
+        READ_FILE_OCR_DEBUG: do not capture the subprocess's stdout/stderr.
+
+    Raises:
+        ImportError: ``ocrmypdf`` was not found on ``PATH``.
+        subprocess.CalledProcessError: ``ocrmypdf`` exited with a non-zero
+            status (``check=True``).
+    """
+    if not ocrmypdf_is_installed():
+        raise ImportError(
+            "ocrmypdf was not found in your path. "
+            "See installation instructions: "
+            "https://github.com/ocrmypdf/OCRmyPDF?tab=readme-ov-file#installation"
+        )
+
+    cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]
+
+    if mode == "force":
+        cmd += ["--force-ocr"]
+    else:
+        cmd += ["--skip-text"]
+
+    # Optional: dump a sidecar text file for debugging confidence
+    if os.getenv("READ_FILE_OCR_SIDECAR", "0").lower() in ("1", "true", "yes"):
+        cmd += ["--sidecar", out_pdf + ".txt"]
+
+    cmd += [src_pdf, out_pdf]
+
+    # Don't swallow stderr/stdout when debugging
+    debug = os.getenv("READ_FILE_OCR_DEBUG", "0").lower() in (
+        "1",
+        "true",
+        "yes",
+    )
+    subprocess.run(
+        cmd,
+        check=True,
+        stdout=None if debug else subprocess.PIPE,
+        stderr=None if debug else subprocess.PIPE,
+        text=True,
+    )
+
+
 @tool
 def read_file(filename: str, runtime: ToolRuntime[AgentContext]) -> str:
-    """
-    Reads in a file with a given filename into a string. Can read in PDF
-    or files that are text/ASCII. Uses a PDF parser if the filename ends
-    with .pdf (case-insensitive)
+    """Read a file from the workspace.
+
+    - If filename ends with .pdf, extract text from the PDF.
+    - If extracted text is very small (likely scanned), optionally run OCR to add a text layer.
+    - Otherwise read as UTF-8 text.
 
     Args:
-        filename: string filename to read in
+        filename: File name relative to the workspace directory.
+
+    Returns:
+        Extracted text content.
     """
     full_filename = runtime.context.workspace.joinpath(filename)
 
-    print("[READING]: ", full_filename)
+    print("[READING]:", full_filename)
+
     try:
-        if full_filename.suffix.lower() == ".pdf":
-            file_contents = read_pdf_text(full_filename)
-        else:
-            file_contents = read_text_file(full_filename)
+        if full_filename.suffix.lower() != ".pdf":
+            return read_text_file(full_filename)
+
+        # 1) normal extraction
+        text = read_pdf_text(full_filename) or ""
+
+        # 2) OCR fallback.  Everything from here on is best-effort: any
+        #    failure (page count error, non-integer env value, missing or
+        #    failing ocrmypdf, unreadable OCR output) is reported and the
+        #    best text obtained so far is returned instead of an error string.
+        #
+        #    Environment knobs read below:
+        #      READ_FILE_OCR                    enable the fallback (default "1")
+        #      READ_FILE_OCR_MIN_PAGES          only OCR PDFs with at least this many pages
+        #      READ_FILE_OCR_MIN_CHARS          only OCR when fewer chars were extracted
+        #      READ_FILE_OCR_MODE               "force" skips the skip-text pass
+        #      READ_FILE_OCR_FORCE_IF_STILL_LOW retry with force-ocr if still low
+        try:
+            pages = _pdf_page_count(full_filename)
+            ocr_enabled = os.getenv("READ_FILE_OCR", "1").lower() in (
+                "1",
+                "true",
+                "yes",
+            )
+            min_pages = int(os.getenv("READ_FILE_OCR_MIN_PAGES", "3"))
+            min_chars = int(os.getenv("READ_FILE_OCR_MIN_CHARS", "3000"))
+
+            if ocr_enabled and pages >= min_pages and len(text) < min_chars:
+                src = Path(full_filename)
+
+                mode_env = os.getenv("READ_FILE_OCR_MODE", "auto").lower()
+                force_if_still_low = os.getenv(
+                    "READ_FILE_OCR_FORCE_IF_STILL_LOW", "1"
+                ).lower() in ("1", "true", "yes")
+
+                # First pass (skip-text) unless user forces always-force
+                first_mode = "force" if mode_env == "force" else "skip"
+                ocr_pdf = str(
+                    src.with_suffix(src.suffix + f".ocr.{first_mode}.pdf")
+                )
+
+                # Re-run OCR only when there is no cached output or the
+                # cached output is older than the source PDF.
+                if not os.path.exists(ocr_pdf) or os.path.getmtime(
+                    ocr_pdf
+                ) < os.path.getmtime(full_filename):
+                    print(
+                        f"[OCR]: mode={first_mode} ({len(text)} chars, {pages} pages) -> {ocr_pdf}"
+                    )
+                    _ocr_to_searchable_pdf(
+                        full_filename, ocr_pdf, mode=first_mode
+                    )
+                else:
+                    print(f"[OCR]: using cached OCR PDF -> {ocr_pdf}")
+
+                # Keep whichever extraction produced more text.
+                text2 = read_pdf_text(ocr_pdf) or ""
+                if len(text2) > len(text):
+                    text = text2
+
+                # Second pass: if still low and we were not already forcing, try force-ocr
+                if (
+                    force_if_still_low
+                    and mode_env != "force"
+                    and len(text) < min_chars
+                ):
+                    force_pdf = str(
+                        src.with_suffix(src.suffix + ".ocr.force.pdf")
+                    )
+                    if not os.path.exists(force_pdf) or os.path.getmtime(
+                        force_pdf
+                    ) < os.path.getmtime(full_filename):
+                        print(
+                            f"[OCR]: still low after skip-text; retrying with force-ocr -> {force_pdf}"
+                        )
+                        _ocr_to_searchable_pdf(
+                            full_filename, force_pdf, mode="force"
+                        )
+                    else:
+                        print(
+                            f"[OCR]: using cached force OCR PDF -> {force_pdf}"
+                        )
+
+                    text3 = read_pdf_text(force_pdf) or ""
+                    if len(text3) > len(text):
+                        text = text3
+
+        except Exception as e:
+            # Deliberately broad: the OCR fallback must never turn a
+            # successful plain extraction into an error result.
+            print(f"[OCR Error]: {e}")
+
+        return text
+
     except Exception as e:
         print(f"[Error]: {e}")
-        file_contents = f"[Error]: {e}"
-    return file_contents
+        return f"[Error]: {e}"
diff --git a/tests/tools/test_read_file_tool_ocr.py b/tests/tools/test_read_file_tool_ocr.py
@@ -0,0 +1,167 @@
+import os
+import shutil
+import time
+from pathlib import Path
+
+import pytest
+
+import ursa.tools.read_file_tool as rft
+
+# import the module (not just the symbol) so monkeypatch works cleanly
+from tests.tools.utils import make_runtime
+
+
+def _touch(p: Path, content: bytes = b"%PDF-1.4\n%fake\n") -> None:
+    """Write ``content`` to ``p`` and set its access/modification times to now."""
+    p.write_bytes(content)
+    # ensure mtime changes if needed
+    os.utime(p, None)
+
+
+# def _call_tool(filename: str, workspace: Path) -> str:
+#     # If @tool produced a Tool object, it should have .invoke
+#     # InjectedState usually flows via state; passing state directly works in practice for unit tests.
+#     return rft.read_file.func(
+#         filename=filename, state={"workspace": str(workspace)}
+#     )
+
+
+def _call_tool(filename: str, workspace: Path) -> str:
+    """Call the ``read_file`` tool on ``filename`` with a runtime built for ``workspace``."""
+    tool_obj = rft.read_file
+
+    runtime = make_runtime(
+        workspace=workspace,
+        llm=None,
+        tool_call_id="read-file-call",
+    )
+    # Prefer the stable tool interface across langchain_core versions
+    if hasattr(tool_obj, "invoke"):
+        return tool_obj.invoke({"filename": filename, "runtime": runtime})
+
+    # Fallback (older behavior)
+    return tool_obj.func(filename=filename, runtime=runtime)
+
+
+def test_no_ocr_when_text_is_sufficient(tmp_path, monkeypatch):
+    """The OCR helper is not called when plain extraction already exceeds the char threshold."""
+    pdf = tmp_path / "doc.pdf"
+    _touch(pdf)
+
+    monkeypatch.setenv("READ_FILE_OCR", "1")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_PAGES", "3")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_CHARS", "3000")
+
+    # Pretend this PDF already has plenty of text
+    monkeypatch.setattr(rft, "read_pdf_text", lambda path: "X" * 5000)
+    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 10)
+
+    # Count invocations of the (stubbed) OCR helper.
+    called = {"ocr": 0}
+    monkeypatch.setattr(
+        rft,
+        "_ocr_to_searchable_pdf",
+        lambda src, dst, **kwargs: called.__setitem__("ocr", called["ocr"] + 1),
+    )
+
+    out = _call_tool("doc.pdf", tmp_path)
+    print("EXTRACTED_LEN:", len(out))
+    print("EXTRACTED_PREVIEW:", out[:300])
+
+    assert len(out) == 5000
+    assert called["ocr"] == 0
+
+
+def test_ocr_runs_and_uses_ocr_pdf(tmp_path, monkeypatch):
+    """With tiny extracted text, OCR runs and the text read from the OCR output is returned."""
+    pdf = tmp_path / "scan.pdf"
+    _touch(pdf)
+
+    monkeypatch.setenv("READ_FILE_OCR", "1")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_PAGES", "3")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_CHARS", "3000")
+
+    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 22)
+
+    # Make read_pdf_text return tiny text for original, large for *.ocr.pdf
+    def fake_read_pdf_text(path: Path) -> str:
+        if ".ocr." in str(path) and str(path).endswith(".pdf"):
+            return "OCR_TEXT_" + ("Y" * 4000)
+        return "tiny"
+
+    monkeypatch.setattr(rft, "read_pdf_text", fake_read_pdf_text)
+
+    # Stand-in for ocrmypdf: just create the destination file.
+    def fake_ocr(src: str, dst: str, *, mode: str = "skip") -> None:
+        Path(dst).write_bytes(b"%PDF-1.4\n%ocr\n")
+
+    monkeypatch.setattr(rft, "_ocr_to_searchable_pdf", fake_ocr)
+
+    out = _call_tool("scan.pdf", tmp_path)
+    print("EXTRACTED_LEN:", len(out))
+    print("EXTRACTED_PREVIEW:", out[:300])
+
+    assert out.startswith("OCR_TEXT_")
+    assert len(out) > 3000
+    assert (tmp_path / "scan.pdf.ocr.skip.pdf").exists()
+
+
+def test_real_ocr_if_available(tmp_path):
+    """Placeholder for an end-to-end test against a real ``ocrmypdf`` install."""
+    if not shutil.which("ocrmypdf"):
+        pytest.skip("ocrmypdf not installed")
+    # TODO: generate an image-only PDF here, then call the tool and assert
+    # the output is non-trivial.  Until that exists, skip explicitly so this
+    # stub is not reported as a passing test when ocrmypdf is installed.
+    pytest.skip("real OCR end-to-end test not implemented yet")
+
+
+def test_ocr_cache_skips_second_run(tmp_path, monkeypatch):
+    """An existing OCR output that is not older than the source is reused without re-running OCR."""
+    pdf = tmp_path / "scan.pdf"
+    _touch(pdf)
+
+    ocr_pdf = tmp_path / "scan.pdf.ocr.skip.pdf"
+    _touch(ocr_pdf, content=b"%PDF-1.4\n%cached\n")
+
+    # Make cached OCR newer than source
+    time.sleep(0.01)
+    os.utime(ocr_pdf, None)
+
+    monkeypatch.setenv("READ_FILE_OCR", "1")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_PAGES", "3")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_CHARS", "3000")
+
+    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 22)
+
+    # Original tiny, OCR big
+    def fake_read_pdf_text(path: Path) -> str:
+        return "tiny" if ".ocr." not in str(path) else "Z" * 5000
+
+    monkeypatch.setattr(rft, "read_pdf_text", fake_read_pdf_text)
+
+    # Count invocations of the (stubbed) OCR helper.
+    called = {"ocr": 0}
+    monkeypatch.setattr(
+        rft,
+        "_ocr_to_searchable_pdf",
+        lambda src, dst, **kwargs: called.__setitem__("ocr", called["ocr"] + 1),
+    )
+
+    out = _call_tool("scan.pdf", tmp_path)
+    print("EXTRACTED_LEN:", len(out))
+    print("EXTRACTED_PREVIEW:", out[:300])
+
+    assert len(out) == 5000
+    assert called["ocr"] == 0
+
+
+def test_ocr_failure_returns_original_text(tmp_path, monkeypatch):
+    """If the OCR helper raises, the tool still returns the originally extracted text."""
+    pdf = tmp_path / "scan.pdf"
+    _touch(pdf)
+
+    monkeypatch.setenv("READ_FILE_OCR", "1")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_PAGES", "3")
+    monkeypatch.setenv("READ_FILE_OCR_MIN_CHARS", "3000")
+
+    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 22)
+    monkeypatch.setattr(rft, "read_pdf_text", lambda path: "tiny")
+
+    # OCR stub that always fails.
+    def fail_ocr(src: str, dst: str, *, mode: str = "skip") -> None:
+        raise RuntimeError("ocr failed")
+
+    monkeypatch.setattr(rft, "_ocr_to_searchable_pdf", fail_ocr)
+
+    out = _call_tool("scan.pdf", tmp_path)
+    print("EXTRACTED_LEN:", len(out))
+    print("EXTRACTED_PREVIEW:", out[:300])
+
+    assert out == "tiny"