Skip to content

Commit 9255089

Browse files
committed
feat: enhance PDF content extraction with font size analysis
1 parent c76f514 commit 9255089

File tree

1 file changed

+43
-26
lines changed

1 file changed

+43
-26
lines changed

apps/common/handle/impl/text/pdf_split_handle.py

Lines changed: 43 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -91,42 +91,59 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
9191

9292
@staticmethod
def handle_pdf_content(file, pdf_document):
    """Extract a PDF's content as Markdown-like text, using font size to spot headings.

    The document is scanned twice. The first pass gathers the font size of
    every text span and takes the most frequent one as the body font size
    (12 when the document has no text spans). The second pass emits each
    text line as a ``##`` heading, a ``###`` heading, or a plain line,
    depending on how much larger than the body font its first span is.
    Image blocks become ``![image](image_<page>_<block number>)`` placeholders.

    :param file: object whose ``name`` attribute is used in the debug log.
    :param pdf_document: document supporting ``len()`` and ``load_page(i)``,
        whose pages provide ``get_text("dict")`` (presumably a PyMuPDF
        document -- confirm against the caller).
    :return: the extracted text with NUL characters removed.
    """
    # Kept as a function-scope import, as in the original block.
    from collections import Counter

    page_count = len(pdf_document)

    # Step 1: collect the font size of every text span.
    # Sizes are rounded to 0.1pt so float noise (e.g. 11.99 vs 12.0) does not
    # split what is really one font size into several buckets.
    font_sizes = []
    for page_num in range(page_count):
        page = pdf_document.load_page(page_num)
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if span["size"] > 0:
                        font_sizes.append(round(span["size"], 1))

    # Body font size is the mode of the collected sizes; fall back to 12.
    if font_sizes:
        body_font_size = Counter(font_sizes).most_common(1)[0][0]
    else:
        body_font_size = 12

    # Step 2: extract the content.
    # NOTE(review): the previous implementation fell back to an image-based
    # loader for pages without a text layer; such pages now yield only image
    # placeholders -- confirm this is intended.
    parts = []
    for page_num in range(page_count):
        start_time = time.time()
        page = pdf_document.load_page(page_num)

        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    spans = line["spans"]
                    if not spans:
                        continue

                    text = "".join(span["text"] for span in spans)
                    # The first span's size stands in for the whole line.
                    size_diff = round(spans[0]["size"], 1) - body_font_size
                    # A whitespace-only line must not become an empty heading.
                    has_text = bool(text.strip())

                    if has_text and size_diff > 2:  # clearly larger than body text
                        parts.append(f"## {text}\n\n")
                    elif has_text and size_diff > 0.5:  # slightly larger than body text
                        parts.append(f"### {text}\n\n")
                    else:  # body text
                        parts.append(f"{text}\n")

            elif block["type"] == 1:  # image block
                parts.append(f"![image](image_{page_num}_{block['number']})\n\n")

        elapsed_time = time.time() - start_time
        maxkb_logger.debug(
            f"File: {file.name}, Page: {page_num + 1}, Time: {elapsed_time:.3f}s")

    # Null characters are not allowed; strip them once from the joined result
    # instead of rescanning the whole accumulated string after every page.
    return "".join(parts).replace('\0', '')
132149

0 commit comments

Comments
 (0)