Skip to content

Commit 3877b36

Browse files
committed
Update processor.py
1 parent 7841c0e commit 3877b36

File tree

1 file changed

+37
-24
lines changed

1 file changed

+37
-24
lines changed

images/exporter-build/scripts/processor.py

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,35 +22,48 @@ def __init__(self, book_json_path, output_dir="build"):
2222

2323
def extract_precise_toc(self, doc, offset):
2424
"""
25-
使用 get_text("dict") 提取标题精确位置,并同步到 TOC
25+
根据 get_toc() 返回的初步目录,在对应页码进行文本定位,获取 y 坐标并偏移
2626
"""
27+
# PyMuPDF get_toc() 可能返回 3 或 4 个元素的列表: [lvl, title, page, (dest_dict)]
2728
raw_toc = doc.get_toc()
28-
# 获取所有文本块,识别可能的标题 (font size > 14)
29-
headings_map = {}
30-
for page_num in range(len(doc)):
31-
page = doc[page_num]
32-
blocks = page.get_text("dict")["blocks"]
33-
for b in blocks:
34-
if "lines" in b:
35-
for line in b["lines"]:
36-
for s in line["spans"]:
37-
# 粗放式匹配:字体大且粗的可能是标题
38-
if s["size"] > 12:
39-
text = s["text"].strip()
40-
# 存入映射,对 key 进行标准化处理(去除空格、处理罕见字符等)
41-
if text:
42-
headings_map[text.lower()] = (page_num, s["bbox"][1])
43-
4429
refined_toc = []
30+
4531
for entry in raw_toc:
46-
lvl, title, page, dest = entry
47-
# 尝试匹配文本高度,使用小写标准化匹配
48-
match_title = title.strip().lower()
49-
if match_title in headings_map:
50-
p_idx, y_coord = headings_map[match_title]
51-
dest = {"kind": fitz.LINK_GOTO, "to": fitz.Point(0, y_coord)}
32+
lvl = entry[0]
33+
title = entry[1]
34+
page_1 = entry[2] # 1-based page number in the current doc
35+
36+
# 默认目标 (整页跳转)
37+
# PyMuPDF set_toc 期待 dest 为字典,或者 None (默认跳转到页顶)
38+
new_page_1 = page_1 + offset
39+
dest = {"kind": fitz.LINK_GOTO, "page": new_page_1 - 1, "to": fitz.Point(0, 0)}
40+
41+
# 尝试在特定页面查找标题以获取精确 Y 坐标
42+
page_0 = page_1 - 1
43+
if 0 <= page_0 < len(doc):
44+
found_y = None
45+
page_obj = doc[page_0]
46+
# get_text("dict") 包含了文本块的边界框
47+
blocks = page_obj.get_text("dict")["blocks"]
48+
target_title_norm = title.strip().lower()
5249

53-
refined_toc.append([lvl, title, page + offset, dest])
50+
for b in blocks:
51+
if "lines" in b:
52+
for line in b["lines"]:
53+
for s in line["spans"]:
54+
if s["text"].strip().lower() == target_title_norm:
55+
found_y = s["bbox"][1] # y0 (top coordinate)
56+
break
57+
if found_y is not None: break
58+
if found_y is not None: break
59+
60+
if found_y is not None:
61+
dest["to"] = fitz.Point(0, found_y)
62+
else:
63+
print(f" Note: Could not find precise position for '{title}' on page {page_1}, using page top.")
64+
65+
refined_toc.append([lvl, title, new_page_1, dest])
66+
5467
return refined_toc
5568

5669
def process(self):

0 commit comments

Comments
 (0)