Skip to content

Commit 5bb0fa1

Browse files
authored
feat: pass all toc pages to LLM (#338)
1 parent f7450cf commit 5bb0fa1

File tree

6 files changed: +315 -59 lines changed

6 files changed: +315 -59 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
/.venv
44
/analysing
55
/models-cache
6+
/format.json
67

78
# Byte-compiled / optimized / DLL files
89
__pycache__/

format.template.json

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
{
2+
"key": "<YOUR_API_KEY>",
3+
"url": "https://sample-llm.com/v1",
4+
"model": "sample-llm-model",
5+
"token_encoding": "o200k_base",
6+
"timeout": 360.0,
7+
"retry_times": 10,
8+
"retry_interval_seconds": 0.75,
9+
"temperature": 0.6,
10+
"top_p": 0.6
11+
}

pdf_craft/common/reader.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import re
22
from pathlib import Path
3-
from typing import Callable, Generator, Generic, TypeVar
3+
from typing import Callable, Container, Generator, Generic, TypeVar
44
from xml.etree.ElementTree import Element
55

66
from .xml import read_xml
@@ -25,11 +25,15 @@ def __init__(
2525
indexed_files.append((idx, p))
2626

2727
indexed_files.sort(key=lambda t: t[0])
28-
self._file_paths: list[Path] = [path for _, path in indexed_files]
28+
self._indexed_files: list[tuple[int, Path]] = indexed_files
2929
self._decode: Callable[[Element], T] = decode
3030

31-
def read(self) -> Generator[T, None, None]:
32-
for xml_path in self._file_paths:
31+
def read(
32+
self, page_indexes: Container[int] | None = None
33+
) -> Generator[T, None, None]:
34+
for page_index, xml_path in self._indexed_files:
35+
if page_indexes is not None and page_index not in page_indexes:
36+
continue
3337
root = read_xml(xml_path)
3438
try:
3539
yield self._decode(root)

pdf_craft/toc/analysing.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,10 +88,16 @@ def _do_analyse_toc(
8888

8989
try:
9090
ref2level = analyse_toc_levels_by_llm(
91-
toc_pages=toc_pages,
9291
llm=llm,
92+
toc_page_refs=toc_pages,
93+
toc_page_contents=list(
94+
pages.read(
95+
page_indexes={toc_page.page_index for toc_page in toc_pages},
96+
)
97+
),
9398
)
9499
toc_page_indexes.extend(ref.page_index for ref in toc_pages)
100+
95101
except LLMAnalysisError as e:
96102
print(f"LLM analysis failed, falling back to statistical method: {e}")
97103
ref2level = analyse_toc_levels(

0 commit comments

Comments
 (0)