Skip to content

Commit f84386b

Browse files
committed
fix: 处理某些pdf中不包括目录和内部链接不能完整导入的问题
(cherry picked from commit fb8b967)
1 parent 5082ab9 commit f84386b

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

apps/common/handle/impl/pdf_split_handle.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@
3131
max_kb = logging.getLogger("max_kb")
3232

3333

34+
def check_links_in_pdf(doc):
    """Tell whether any page of ``doc`` holds a link whose ``kind`` equals 1.

    Pages are visited in order by index, and the scan stops at the first
    matching link.

    NOTE(review): kind 1 is presumably PyMuPDF's ``LINK_GOTO`` (a jump to
    another page of the same document) -- confirm against the library
    version in use.

    Args:
        doc: A sized, index-addressable collection of pages, each exposing
            ``get_links()`` that returns a sequence of dicts carrying a
            ``'kind'`` key (or a falsy value when the page has no links).

    Returns:
        ``True`` if at least one link with ``kind == 1`` is found, otherwise
        ``False`` (including for a document with no pages).
    """
    page_count = len(doc)
    return any(
        entry['kind'] == 1
        for index in range(page_count)
        # A falsy result (None / empty list) means there is nothing to scan.
        for entry in (doc[index].get_links() or ())
    )
43+
3444
class PdfSplitHandle(BaseSplitHandle):
3545
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
3646
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
@@ -175,6 +185,9 @@ def handle_toc(doc, limit):
175185

176186
@staticmethod
177187
def handle_links(doc, pattern_list, with_filter, limit):
188+
# 检查文档是否包含内部链接
189+
if not check_links_in_pdf(doc):
190+
return
178191
# 创建存储章节内容的数组
179192
chapters = []
180193
toc_start_page = -1

0 commit comments

Comments
 (0)