Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pdf_craft/functions.py
Original file line number Diff line number Diff line change
@@ -43,6 +43,7 @@ def transform_markdown(
max_ocr_tokens: int | None = None,
max_ocr_output_tokens: int | None = None,
on_ocr_event: Callable[[OCREvent], None] = lambda _: None,
page_indexes: range | None = None,
) -> OCRTokensMetering:

return Transform(
@@ -67,6 +68,7 @@ def transform_markdown(
max_ocr_tokens=max_ocr_tokens,
max_ocr_output_tokens=max_ocr_output_tokens,
on_ocr_event=on_ocr_event,
page_indexes=page_indexes,
)


@@ -95,6 +97,7 @@ def transform_epub(
max_ocr_tokens: int | None = None,
max_ocr_output_tokens: int | None = None,
on_ocr_event: Callable[[OCREvent], None] = lambda _: None,
page_indexes: range | None = None,
) -> OCRTokensMetering:

return Transform(
@@ -123,4 +126,5 @@ def transform_epub(
max_ocr_tokens=max_ocr_tokens,
max_ocr_output_tokens=max_ocr_output_tokens,
on_ocr_event=on_ocr_event,
page_indexes=page_indexes,
)
41 changes: 25 additions & 16 deletions pdf_craft/transform.py
Original file line number Diff line number Diff line change
@@ -55,6 +55,7 @@ def transform_markdown(
max_ocr_tokens: int | None = None,
max_ocr_output_tokens: int | None = None,
on_ocr_event: Callable[[OCREvent], None] = lambda _: None,
page_indexes: range | None = None,
) -> OCRTokensMetering: # pyright: ignore[reportReturnType]

if markdown_assets_path is None:
@@ -81,6 +82,7 @@ def transform_markdown(
max_tokens=max_ocr_tokens,
max_output_tokens=max_ocr_output_tokens,
on_ocr_event=on_ocr_event,
page_indexes=page_indexes,
)
render_markdown_file(
chapters_path=chapters_path,
@@ -124,6 +126,7 @@ def transform_epub(
max_ocr_tokens: int | None = None,
max_ocr_output_tokens: int | None = None,
on_ocr_event: Callable[[OCREvent], None] = lambda _: None,
page_indexes: range | None = None,
) -> OCRTokensMetering: # pyright: ignore[reportReturnType]
try:
with EnsureFolder(
@@ -146,6 +149,7 @@ def transform_epub(
max_tokens=max_ocr_tokens,
max_output_tokens=max_ocr_output_tokens,
on_ocr_event=on_ocr_event,
page_indexes=page_indexes,
)
book_meta = book_meta or self._extract_book_meta(pdf_path)

@@ -190,6 +194,7 @@ def _extract_from_pdf(
max_tokens: int | None,
max_output_tokens: int | None,
on_ocr_event: Callable[[OCREvent], None],
page_indexes: range | None = None,
):

asserts_path = analysing_path / "assets"
@@ -208,22 +213,26 @@ def _extract_from_pdf(
input_tokens=0,
output_tokens=0,
)
for event in self._ocr.recognize(
pdf_path=pdf_path,
asset_path=asserts_path,
ocr_path=pages_path,
ocr_size=ocr_size,
dpi=dpi,
max_page_image_file_size=max_page_image_file_size,
includes_footnotes=includes_footnotes,
ignore_pdf_errors=ignore_pdf_errors,
ignore_ocr_errors=ignore_ocr_errors,
plot_path=plot_path,
cover_path=cover_path,
aborted=aborted,
max_tokens=max_tokens,
max_output_tokens=max_output_tokens,
):
recognize_kwargs = {
"pdf_path": pdf_path,
"asset_path": asserts_path,
"ocr_path": pages_path,
"ocr_size": ocr_size,
"dpi": dpi,
"max_page_image_file_size": max_page_image_file_size,
"includes_footnotes": includes_footnotes,
"ignore_pdf_errors": ignore_pdf_errors,
"ignore_ocr_errors": ignore_ocr_errors,
"plot_path": plot_path,
"cover_path": cover_path,
"aborted": aborted,
"max_tokens": max_tokens,
"max_output_tokens": max_output_tokens,
}
if page_indexes is not None:
recognize_kwargs["page_indexes"] = page_indexes

for event in self._ocr.recognize(**recognize_kwargs):
on_ocr_event(event)
metering.input_tokens += event.input_tokens
metering.output_tokens += event.output_tokens
Expand Down