@@ -22,35 +22,48 @@ def __init__(self, book_json_path, output_dir="build"):
2222
def extract_precise_toc(self, doc, offset):
    """
    Build a TOC whose entries jump to the exact position of each heading.

    Starts from the coarse outline returned by ``doc.get_toc()``. For each
    entry, the page it points at is searched for text equal to the entry
    title (case-insensitive, surrounding whitespace ignored); the top edge
    of the matching text becomes the jump target. When nothing matches, the
    entry falls back to the top of the page and a note is printed.

    Args:
        doc: Document object providing ``get_toc()``, ``len()`` and page
            indexing (a PyMuPDF document).
        offset: Value added to every valid 1-based page number.

    Returns:
        A list of ``[level, title, page, dest]`` rows. For entries with a
        valid page, ``page`` is the shifted 1-based page number and ``dest``
        is a ``LINK_GOTO`` dict holding the 0-based target page and the
        target point. Entries whose source page number is below 1 keep that
        page number and receive no GOTO destination.
    """
    # get_toc() rows may carry 3 or 4 items: [lvl, title, page, (dest_dict)],
    # so fields are read by index instead of being unpacked.
    raw_toc = doc.get_toc()
    page_count = len(doc)

    # 0-based page index -> list of text lines. Several TOC entries often
    # share a page, so each page's text is extracted only once.
    lines_by_page = {}

    def page_lines(page_index):
        """Return the text lines of a page, extracting them on first use."""
        if page_index not in lines_by_page:
            blocks = doc[page_index].get_text("dict")["blocks"]
            lines_by_page[page_index] = [
                line
                for block in blocks
                if "lines" in block  # image blocks carry no "lines" key
                for line in block["lines"]
            ]
        return lines_by_page[page_index]

    def find_title_y(page_index, wanted):
        """Return the top y coordinate of the text equal to *wanted*, or None."""
        lines = page_lines(page_index)

        # Pass 1: a single span whose text is exactly the title.
        for line in lines:
            for span in line["spans"]:
                if span["text"].strip().lower() == wanted:
                    return span["bbox"][1]  # y0, the top edge

        # Pass 2: the title is split over several spans of one line (e.g. a
        # bold number followed by regular text). Only reached when pass 1
        # found nothing, so every single-span match keeps its result.
        if not wanted:
            return None
        for line in lines:
            joined = "".join(span["text"] for span in line["spans"])
            if joined.strip().lower() == wanted:
                return line["bbox"][1]
        return None

    refined_toc = []

    for entry in raw_toc:
        lvl = entry[0]
        title = entry[1]
        page_1 = entry[2]  # 1-based page number in the current doc

        if page_1 < 1:
            # Page numbers below 1 mark entries with no target inside this
            # document. Adding the offset would silently point them at an
            # unrelated real page, so they are passed through untouched.
            passthrough = entry[3] if len(entry) > 3 else {"kind": fitz.LINK_NONE}
            refined_toc.append([lvl, title, page_1, passthrough])
            continue

        # Default destination: top of the (shifted) target page.
        new_page_1 = page_1 + offset
        dest = {
            "kind": fitz.LINK_GOTO,
            "page": new_page_1 - 1,
            "to": fitz.Point(0, 0),
        }

        # Look the title up on its page in the current doc to refine "to".
        page_0 = page_1 - 1
        if page_0 < page_count:
            found_y = find_title_y(page_0, title.strip().lower())
            if found_y is not None:
                dest["to"] = fitz.Point(0, found_y)
            else:
                print(f" Note: Could not find precise position for '{title} ' on page {page_1} , using page top.")

        refined_toc.append([lvl, title, new_page_1, dest])

    return refined_toc
5568
5669 def process (self ):
0 commit comments