Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions examples/single_agent_examples/hypothesizer_agent/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
For the Sci-Fi Bill of Rights demo, see sci_fi_bill_of_rights_inputs/README.txt.
It requires downloading some public TXT files.

If you want to use this on PDFs, particularly ones that need OCR, you need to
install some additional tools (the `ocrmypdf` and `tesseract` programs, plus the
`pypdf` Python library). At this time these are not bundled with URSA as
required dependencies.

On a Mac you need:

```
brew update
brew install ocrmypdf tesseract
# NOTE: Feb 1, 2026 - gettext did not install on my Mac, so I had to
# build it from source. This is a LENGTHY process, but it
# works 100%:
# brew install --build-from-source gettext
# Once gettext is installed, you can go back to
# brew install ocrmypdf
pip install pypdf # you need this too in your Python env.
```

Once these are installed, you should see something like this, if OCR is needed:

```
[READING]: your_doc.pdf
[OCR]: mode=skip (441 chars, 22 pages) -> your_doc.pdf.ocr.skip.pdf
[OCR]: still low after skip-text; retrying with force-ocr -> your_doc.pdf.ocr.force.pdf
```

Note that the first `[OCR]` line will show up only if the PDF reading fails and
no text layers are discovered (this mode `skips` some complex / lengthy OCR
techniques and tries a quick and dirty one).
Comment on lines +30 to +32
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Note that the first `[OCR]` line will only show up if the PDF reading fails and there
are no text layers discovered (this `skips` some complex / lengthy OCR techniques
and tries a quick and dirty one.).
Note that the first `[OCR]` line will show up only if the PDF reading fails and
no text layers are discovered (this `skips` some complex / lengthy OCR
techniques and tries a quick and dirty one.).


Note that the second `[OCR]` line will show up only if the `skip` version
still produced no good data to read; this second attempt is called the `force` version.

Once a doc has been OCRed (either version), the reader will pick up the OCRed copy
automatically in the future (i.e., OCR runs only the first time it is needed).
159 changes: 147 additions & 12 deletions src/ursa/tools/read_file_tool.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,164 @@
import os
import subprocess
from pathlib import Path

from langchain.tools import ToolRuntime
from langchain_core.tools import tool

from ursa.agents.base import AgentContext
from ursa.util.parse import read_pdf_text, read_text_file


def _pdf_page_count(path: str) -> int:
    """Return the number of pages in the PDF at ``path``, or 0 on any failure.

    The pypdf import lives inside the ``try`` so that an import error is
    treated the same way as an error opening or parsing the file: the
    function reports 0 pages instead of raising.
    """
    try:
        from pypdf import PdfReader

        reader = PdfReader(path)
        page_total = len(reader.pages)
    except Exception:
        # Best effort: callers use the count only to decide whether OCR is
        # worth attempting, so an unreadable PDF simply counts as empty.
        page_total = 0
    return page_total
Comment on lines +12 to +18
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _pdf_page_count(path: str) -> int:
try:
from pypdf import PdfReader
return len(PdfReader(path).pages)
except Exception:
return 0
def _pdf_page_count(path: str) -> int:
from pypdf import PdfReader
return len(PdfReader(path).pages)

@ndebard pypdf is an existing dependency, so we don't need this check. Also, unless this import is slow, let's move this import to the top of the file.



def _ocr_to_searchable_pdf(
    src_pdf: str, out_pdf: str, *, mode: str = "skip"
) -> None:
    """Run ``ocrmypdf`` on ``src_pdf`` and write the result to ``out_pdf``.

    Args:
        src_pdf: Path of the PDF to OCR.
        out_pdf: Path the OCR'd PDF is written to.
        mode: ``"force"`` passes ``--force-ocr`` (rasterize + OCR everything;
            fixes vector/outlined "no images" PDFs). Any other value passes
            ``--skip-text`` (only OCR pages that look like they need it).

    Environment:
        READ_FILE_OCR_SIDECAR: when 1/true/yes, also write a sidecar text
            file at ``<out_pdf>.txt`` for debugging confidence.
        READ_FILE_OCR_DEBUG: when 1/true/yes, let ocrmypdf's stdout/stderr
            pass through instead of capturing them.

    Raises:
        FileNotFoundError: ``ocrmypdf`` is not on the PATH.
        subprocess.CalledProcessError: ``ocrmypdf`` exited with a non-zero
            status (captured output is on the exception unless debugging).
    """
    # Local import keeps this block self-contained.
    import shutil

    truthy = ("1", "true", "yes")

    # Fail early with an actionable message. FileNotFoundError is the same
    # exception type subprocess would raise for a missing executable, so
    # existing callers that catch it keep working.
    if shutil.which("ocrmypdf") is None:
        raise FileNotFoundError(
            "ocrmypdf was not found in your path. "
            "See installation instructions: "
            "https://github.com/ocrmypdf/OCRmyPDF?tab=readme-ov-file#installation"
        )

    cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]
    cmd.append("--force-ocr" if mode == "force" else "--skip-text")

    # Optional: dump a sidecar text file for debugging confidence.
    if os.getenv("READ_FILE_OCR_SIDECAR", "0").lower() in truthy:
        # f-string instead of ``+`` so a Path-like out_pdf also works.
        cmd += ["--sidecar", f"{out_pdf}.txt"]

    cmd += [str(src_pdf), str(out_pdf)]

    # Don't swallow stderr/stdout when debugging.
    debug = os.getenv("READ_FILE_OCR_DEBUG", "0").lower() in truthy
    subprocess.run(
        cmd,
        check=True,
        stdout=None if debug else subprocess.PIPE,
        stderr=None if debug else subprocess.PIPE,
        text=True,
    )


def _env_flag(name: str, default: str) -> bool:
    """Return True when ``os.getenv(name, default)`` is 1/true/yes (any case)."""
    return os.getenv(name, default).lower() in ("1", "true", "yes")


def _env_int(name: str, default: int) -> int:
    """Return environment variable ``name`` as an int.

    Falls back to ``default`` when the variable is unset or is not a valid
    integer, so a typo in a tuning knob cannot discard text that was already
    extracted.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"[OCR]: ignoring invalid {name}={raw!r}; using {default}")
        return default


def _ocr_target(source: Path, mode: str) -> tuple[str, bool]:
    """Locate the OCR'd sibling of ``source`` for the given ``mode``.

    Returns:
        ``(ocr_pdf, stale)`` where ``ocr_pdf`` is ``<source>.ocr.<mode>.pdf``
        and ``stale`` is True when that file is missing or older than
        ``source``, i.e. OCR has to be (re)run.
    """
    ocr_pdf = str(source.with_suffix(source.suffix + f".ocr.{mode}.pdf"))
    stale = not os.path.exists(ocr_pdf) or os.path.getmtime(
        ocr_pdf
    ) < os.path.getmtime(source)
    return ocr_pdf, stale


@tool
def read_file(filename: str, runtime: ToolRuntime[AgentContext]) -> str:
    """Read a file from the workspace.

    - If filename ends with .pdf, extract text from the PDF.
    - If extracted text is very small (likely scanned), optionally run OCR to add a text layer.
    - Otherwise read as UTF-8 text.

    Args:
        filename: File name relative to the workspace directory.

    Returns:
        Extracted text content.
    """
    # OCR fallback is tuned through environment variables:
    #   READ_FILE_OCR                     enable the fallback (default "1")
    #   READ_FILE_OCR_MIN_PAGES           only OCR PDFs with >= this many pages (3)
    #   READ_FILE_OCR_MIN_CHARS           only OCR when extracted text is shorter (3000)
    #   READ_FILE_OCR_MODE                "force" skips the skip-text pass ("auto")
    #   READ_FILE_OCR_FORCE_IF_STILL_LOW  retry with force-ocr if still short ("1")
    full_filename = runtime.context.workspace.joinpath(filename)

    print("[READING]:", full_filename)

    try:
        if full_filename.suffix.lower() != ".pdf":
            return read_text_file(full_filename)

        # 1) normal extraction
        text = read_pdf_text(full_filename) or ""

        # 2) decide if OCR fallback is needed
        pages = _pdf_page_count(full_filename)
        min_pages = _env_int("READ_FILE_OCR_MIN_PAGES", 3)
        min_chars = _env_int("READ_FILE_OCR_MIN_CHARS", 3000)
        needs_ocr = (
            _env_flag("READ_FILE_OCR", "1")
            and pages >= min_pages
            and len(text) < min_chars
        )
        if not needs_ocr:
            return text

        src = Path(full_filename)
        mode_env = os.getenv("READ_FILE_OCR_MODE", "auto").lower()
        force_if_still_low = _env_flag("READ_FILE_OCR_FORCE_IF_STILL_LOW", "1")

        # Everything OCR-related is best effort: on any failure we keep the
        # longest text obtained so far rather than turning it into an error.
        try:
            # First pass (skip-text) unless the user asked to always force.
            first_mode = "force" if mode_env == "force" else "skip"
            ocr_pdf, stale = _ocr_target(src, first_mode)
            if stale:
                print(
                    f"[OCR]: mode={first_mode} ({len(text)} chars, {pages} pages) -> {ocr_pdf}"
                )
                _ocr_to_searchable_pdf(full_filename, ocr_pdf, mode=first_mode)
            else:
                print(f"[OCR]: using cached OCR PDF -> {ocr_pdf}")

            ocr_text = read_pdf_text(ocr_pdf) or ""
            if len(ocr_text) > len(text):
                text = ocr_text

            # Second pass: if still low and we weren't already forcing,
            # try force-ocr.
            if (
                force_if_still_low
                and mode_env != "force"
                and len(text) < min_chars
            ):
                force_pdf, stale = _ocr_target(src, "force")
                if stale:
                    print(
                        f"[OCR]: still low after skip-text; retrying with force-ocr -> {force_pdf}"
                    )
                    _ocr_to_searchable_pdf(
                        full_filename, force_pdf, mode="force"
                    )
                else:
                    print(f"[OCR]: using cached force OCR PDF -> {force_pdf}")

                forced_text = read_pdf_text(force_pdf) or ""
                if len(forced_text) > len(text):
                    text = forced_text

        except subprocess.CalledProcessError as e:
            # str(e) omits the tool's output; include a short excerpt of the
            # captured stderr (None when READ_FILE_OCR_DEBUG passes it through).
            detail = (e.stderr or "").strip()[:500]
            print(f"[OCR Error]: {e}" + (f": {detail}" if detail else ""))
        except Exception as e:
            # Missing ocrmypdf or any other OCR-related failure:
            # keep the original extraction.
            print(f"[OCR Error]: {e}")

        return text

    except Exception as e:
        print(f"[Error]: {e}")
        return f"[Error]: {e}"
167 changes: 167 additions & 0 deletions tests/tools/test_read_file_tool_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import os
import shutil
import time
from pathlib import Path

import pytest

import ursa.tools.read_file_tool as rft

# import the module (not just the symbol) so monkeypatch works cleanly
from tests.tools.utils import make_runtime


def _touch(p: Path, content: bytes = b"%PDF-1.4\n%fake\n") -> None:
    """Write ``content`` to ``p`` and reset its timestamps to the current time."""
    with p.open("wb") as handle:
        handle.write(content)
    # Passing no times sets atime/mtime to "now", so the mtime changes if needed.
    os.utime(p)


# def _call_tool(filename: str, workspace: Path) -> str:
# # If @tool produced a Tool object, it should have .invoke
# # InjectedState usually flows via state; passing state directly works in practice for unit tests.
# return rft.read_file.func(
# filename=filename, state={"workspace": str(workspace)}
# )


def _call_tool(filename: str, workspace: Path) -> str:
    """Invoke the ``read_file`` tool on ``filename`` inside ``workspace``.

    A runtime is built with ``make_runtime`` and handed to the tool. The
    tool's ``invoke`` interface is preferred; the raw ``func`` is only used
    when the tool object has no ``invoke`` attribute.
    """
    runtime = make_runtime(
        workspace=workspace,
        llm=None,
        tool_call_id="read-file-call",
    )
    read_tool = rft.read_file

    if not hasattr(read_tool, "invoke"):
        # Fallback (older behavior)
        return read_tool.func(filename=filename, runtime=runtime)

    # Prefer the stable tool interface across langchain_core versions
    payload = {"filename": filename, "runtime": runtime}
    return read_tool.invoke(payload)


def test_no_ocr_when_text_is_sufficient(tmp_path, monkeypatch):
    """OCR must not run when normal extraction already yields enough text."""
    source_pdf = tmp_path / "doc.pdf"
    _touch(source_pdf)

    for key, value in (
        ("READ_FILE_OCR", "1"),
        ("READ_FILE_OCR_MIN_PAGES", "3"),
        ("READ_FILE_OCR_MIN_CHARS", "3000"),
    ):
        monkeypatch.setenv(key, value)

    ocr_calls = []

    def record_ocr(src, dst, **kwargs):
        ocr_calls.append(dst)

    # Pretend this PDF already has plenty of text
    monkeypatch.setattr(rft, "read_pdf_text", lambda path: "X" * 5000)
    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 10)
    monkeypatch.setattr(rft, "_ocr_to_searchable_pdf", record_ocr)

    out = _call_tool("doc.pdf", tmp_path)
    print("EXTRACTED_LEN:", len(out))
    print("EXTRACTED_PREVIEW:", out[:300])

    assert len(out) == 5000
    assert len(ocr_calls) == 0


def test_ocr_runs_and_uses_ocr_pdf(tmp_path, monkeypatch):
    """A low-text PDF triggers OCR and the OCR'd PDF's text is returned."""
    source_pdf = tmp_path / "scan.pdf"
    _touch(source_pdf)

    for key, value in (
        ("READ_FILE_OCR", "1"),
        ("READ_FILE_OCR_MIN_PAGES", "3"),
        ("READ_FILE_OCR_MIN_CHARS", "3000"),
    ):
        monkeypatch.setenv(key, value)

    # Tiny text for the original, large text for any *.ocr.*.pdf output
    def fake_read_pdf_text(path: Path) -> str:
        name = str(path)
        is_ocr_output = ".ocr." in name and name.endswith(".pdf")
        return "OCR_TEXT_" + ("Y" * 4000) if is_ocr_output else "tiny"

    # Stand-in for ocrmypdf: just create the destination file
    def fake_ocr(src: str, dst: str, *, mode: str = "skip") -> None:
        Path(dst).write_bytes(b"%PDF-1.4\n%ocr\n")

    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 22)
    monkeypatch.setattr(rft, "read_pdf_text", fake_read_pdf_text)
    monkeypatch.setattr(rft, "_ocr_to_searchable_pdf", fake_ocr)

    out = _call_tool("scan.pdf", tmp_path)
    print("EXTRACTED_LEN:", len(out))
    print("EXTRACTED_PREVIEW:", out[:300])

    assert out.startswith("OCR_TEXT_")
    assert len(out) > 3000
    assert (tmp_path / "scan.pdf.ocr.skip.pdf").exists()


def test_real_ocr_if_available(tmp_path):
    """Placeholder for an end-to-end OCR test; skipped without ``ocrmypdf``."""
    ocrmypdf_path = shutil.which("ocrmypdf")
    if not ocrmypdf_path:
        pytest.skip("ocrmypdf not installed")
    # TODO: generate an image-only PDF here, then call the tool and assert
    # the output is non-trivial.


def test_ocr_cache_skips_second_run(tmp_path, monkeypatch):
    """An up-to-date cached OCR PDF is reused instead of running OCR again."""
    source_pdf = tmp_path / "scan.pdf"
    cached_pdf = tmp_path / "scan.pdf.ocr.skip.pdf"
    _touch(source_pdf)
    _touch(cached_pdf, content=b"%PDF-1.4\n%cached\n")

    # Make cached OCR newer than source
    time.sleep(0.01)
    os.utime(cached_pdf, None)

    for key, value in (
        ("READ_FILE_OCR", "1"),
        ("READ_FILE_OCR_MIN_PAGES", "3"),
        ("READ_FILE_OCR_MIN_CHARS", "3000"),
    ):
        monkeypatch.setenv(key, value)

    # Original tiny, OCR big
    def fake_read_pdf_text(path: Path) -> str:
        if ".ocr." in str(path):
            return "Z" * 5000
        return "tiny"

    ocr_calls = []

    def record_ocr(src, dst, **kwargs):
        ocr_calls.append(dst)

    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 22)
    monkeypatch.setattr(rft, "read_pdf_text", fake_read_pdf_text)
    monkeypatch.setattr(rft, "_ocr_to_searchable_pdf", record_ocr)

    out = _call_tool("scan.pdf", tmp_path)
    print("EXTRACTED_LEN:", len(out))
    print("EXTRACTED_PREVIEW:", out[:300])

    assert len(out) == 5000
    assert len(ocr_calls) == 0


def test_ocr_failure_returns_original_text(tmp_path, monkeypatch):
    """If OCR raises, the tool still returns the originally extracted text."""
    source_pdf = tmp_path / "scan.pdf"
    _touch(source_pdf)

    for key, value in (
        ("READ_FILE_OCR", "1"),
        ("READ_FILE_OCR_MIN_PAGES", "3"),
        ("READ_FILE_OCR_MIN_CHARS", "3000"),
    ):
        monkeypatch.setenv(key, value)

    # OCR stand-in that always blows up
    def fail_ocr(src: str, dst: str, *, mode: str = "skip") -> None:
        raise RuntimeError("ocr failed")

    monkeypatch.setattr(rft, "_pdf_page_count", lambda path: 22)
    monkeypatch.setattr(rft, "read_pdf_text", lambda path: "tiny")
    monkeypatch.setattr(rft, "_ocr_to_searchable_pdf", fail_ocr)

    out = _call_tool("scan.pdf", tmp_path)
    print("EXTRACTED_LEN:", len(out))
    print("EXTRACTED_PREVIEW:", out[:300])

    assert out == "tiny"
Loading
Loading