Replies: 2 comments 1 reply
-
No, this won't work. Extract the text via get_text("dict") and any drawings (yes, some do exist) via get_drawings(), and write both out to another PDF. All the fonts used are serifed, so you should be able to use the respective "CJK" font for output. The info in the spans should be sufficient to create adequate output. |
Beta Was this translation helpful? Give feedback.
1 reply
-
I used the code below to convert it to a clean PDF. import fitz # PyMuPDF
# Rebuild a "clean" copy of a PDF: every text span and every vector drawing of
# the source is extracted and re-emitted into a brand-new document, with all
# text set in a single replacement font.

# Input and output PDF paths.
input_pdf_path = r'F:\29-1.pdf'
output_pdf_path = r'F:\29-1output.pdf'

# Replacement font used for all re-inserted text.
font_path = r'F:\fonts\NotoOldShapeNormal.TTF'
font = fitz.Font(fontfile=font_path)

with fitz.open(input_pdf_path) as doc, fitz.open() as output_doc:
    for page in doc:
        # Give the output page the same dimensions as the source page so that
        # the extracted coordinates land in the same place.
        output_page = output_doc.new_page(
            width=page.rect.width,
            height=page.rect.height,
        )
        output_page.insert_font(fontname="F0", fontbuffer=font.buffer)

        # --- Text -----------------------------------------------------------
        for block in page.get_text("dict")["blocks"]:
            # Image blocks have no "lines" key; only text blocks are copied.
            for line in block.get("lines", ()):
                # Write every span, not just the first one, so that no text on
                # the line is lost.
                for span in line["spans"]:
                    if not span["text"]:
                        continue
                    # insert_text() expects the baseline start point of the
                    # first glyph, which is the span's "origin" (the bbox
                    # top-left would place the text too high).
                    output_page.insert_text(
                        span["origin"],
                        span["text"],
                        fontname="F0",
                        fontsize=span["size"],
                    )

        # --- Vector drawings --------------------------------------------------
        # Each dict returned by get_drawings() describes one path; its "items"
        # list holds the draw commands, which are replayed on a Shape.
        shape = output_page.new_shape()
        for drawing in page.get_drawings():
            for item in drawing["items"]:
                op = item[0]
                if op == "l":  # line
                    shape.draw_line(item[1], item[2])
                elif op == "re":  # rectangle
                    shape.draw_rect(item[1])
                elif op == "qu":  # quad
                    shape.draw_quad(item[1])
                elif op == "c":  # cubic Bezier curve
                    shape.draw_bezier(item[1], item[2], item[3], item[4])
                else:
                    raise ValueError(f"unhandled drawing item: {item!r}")

            # Properties that do not apply to a path type (e.g. stroke
            # properties of a fill-only path) may be None, so fall back to
            # neutral defaults instead of passing None through.
            line_cap = drawing.get("lineCap")
            width = drawing.get("width")
            stroke_opacity = drawing.get("stroke_opacity")
            fill_opacity = drawing.get("fill_opacity")
            shape.finish(
                fill=drawing.get("fill"),
                color=drawing.get("color"),
                dashes=drawing.get("dashes"),
                even_odd=bool(drawing.get("even_odd")),
                closePath=bool(drawing.get("closePath")),
                lineJoin=drawing.get("lineJoin") or 0,
                lineCap=max(line_cap) if line_cap else 0,
                width=1 if width is None else width,
                stroke_opacity=1 if stroke_opacity is None else stroke_opacity,
                fill_opacity=1 if fill_opacity is None else fill_opacity,
            )
        shape.commit()

    # Save the output PDF; garbage collection and stream compression keep the
    # file small.
    output_doc.save(output_pdf_path, garbage=3, deflate=True)
print(f"成功将内容写入 {output_pdf_path}") |
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
29-1.pdf
Beta Was this translation helpful? Give feedback.
All reactions