Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions examples/single_agent_examples/hypothesizer_agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ On a mac you need:
```
brew update
brew install ocrmypdf tesseract
# NOTE: Feb 1, 2026 - gettext did not install on my mac so had to
# build from source, this is LENGTHY process, but 100%
# works:
# NOTE: Feb 1, 2026 - gettext did not install on my Mac so had to
# build from source. This is a LENGTHY but reliable process:
# brew install --build-from-source gettext
# once gettext is installed, you can go back to
# brew install ocrmypdf
Expand All @@ -31,8 +30,9 @@ Note that the first `[OCR]` line will only show up if the PDF reading fails and
are no text layers discovered (this `skips` some complex / lengthy OCR techniques
and tries a quick-and-dirty one).

Note that the second `[OCR]` line will only show up if the `skip` version
still produced no good data to read, this is called the `force` version.
Note that the second `[OCR]` line will show up only if the `skip` version
still produced no good data to read. This is called the `force` version.

Once a doc has been OCRed (either version) the reader will pick this up automatically
in the future (ie it will only run this the first time it needs to).
Once a doc has been OCRed (either version) the reader will automatically
remember this for the future (i.e. it will run this only the first time it
needs to).
20 changes: 16 additions & 4 deletions src/ursa/tools/read_file_tool.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,41 @@
import os
import shutil
import subprocess
from pathlib import Path

from langchain.tools import ToolRuntime
from langchain_core.tools import tool
from pypdf import PdfReader

from ursa.agents.base import AgentContext
from ursa.util.parse import read_pdf_text, read_text_file


def _pdf_page_count(path: str) -> int:
def _pdf_page_count(path: Path) -> int:
try:
from pypdf import PdfReader

return len(PdfReader(path).pages)
except Exception:
except Exception as e:
print("[Error]: ", e)
return 0


def ocrmypdf_is_installed() -> bool:
    """Report whether an ``ocrmypdf`` executable can be located on PATH.

    Returns:
        True when ``shutil.which`` resolves ``ocrmypdf`` to a path,
        False when the lookup yields ``None``.
    """
    located = shutil.which("ocrmypdf")
    if located is None:
        return False
    return True


def _ocr_to_searchable_pdf(
src_pdf: str, out_pdf: str, *, mode: str = "skip"
) -> None:
# mode:
# - "skip": only OCR pages that look like they need it (your current behavior)
# - "force": rasterize + OCR everything (fixes vector/outlined “no images” PDFs)
if not ocrmypdf_is_installed():
raise ImportError(
"ocrmypdf was not found in your path. "
"See installation instructions:"
"https://github.com/ocrmypdf/OCRmyPDF?tab=readme-ov-file#installation"
)

cmd = ["ocrmypdf", "--rotate-pages", "--deskew", "--clean"]

if mode == "force":
Expand Down