@@ -41,17 +41,18 @@ def do_parse(
4141 f_dump_model_output = True , # Whether to dump model output files
4242 f_dump_orig_pdf = True , # Whether to dump original PDF files
4343 f_dump_content_list = True , # Whether to dump content list files
44- f_dump_md_html = True , # Whether to convert markdown to HTML
45- f_dump_md_docx = True , # Whether to convert markdown to docx (via Pandoc)
44+ f_dump_md_html = False , # Whether to convert markdown to HTML
45+ f_dump_md_docx = False , # Whether to convert markdown to docx (via Pandoc)
4646 f_make_md_mode = MakeMode .MM_MD , # The mode for making markdown content, default is MM_MD
4747 start_page_id = 0 , # Start page ID for parsing, default is 0
4848 end_page_id = None , # End page ID for parsing, default is None (parse all pages until the end of the document)
4949):
5050 layout_config = {
51- # "model_type": LayoutModelType.PP_DOCLAYOUT_PLUS_L ,
51+ # "model_type": LayoutModelType.PP_DOCLAYOUTV2 ,
5252 # "conf_thresh": 0.4,
5353 # "batch_num": 1,
54- # "model_dir_or_path": r"C:\ocr\models\ppmodel\layout\PP-DocLayout_plus-L\pp_doclayout_plus_l.onnx",
54+ # "model_dir_or_path": r"C:\ocr\models\ppmodel\layout\PP-DocLayoutV2\pp_doclayoutv2.onnx",
55+ # "markdown_ignore_labels": ["number", "footnote", "header", "header_image", "footer", "footer_image", "aside_text",]
5556 }
5657
5758 ocr_config = {
@@ -85,7 +86,7 @@ def do_parse(
8586
8687 table_config = {
8788 # "force_ocr": False, # 表格文字,是否强制使用ocr;默认 False,即根据 parse_method 判断是使用ocr还是从pdf中直接提取文本
88- # 注:文字版pdf可以使用pypdfium2提取到表格内图片,扫描版或图片需要使用PP_DOCLAYOUT_PLUS_L版面识别模型 ,才能识别到表格内的图片
89+ # 注:文字版pdf可以使用pypdfium2提取到表格内图片,扫描版或图片需要使用PP_DOCLAYOUT_PLUS_L/PP_DOCLAYOUTV2版面识别模型,才能识别到表格内的图片
8990 # "skip_text_in_image": True, # 是否跳过表格里图片中的文字(如表格单元格中嵌入的图片、图标、扫描底图等)
9091 # "use_img2table": False, # 是否优先使用img2table库提取表格,需要手动安装(pip install img2table),基于opencv识别准确度不如使用模型,但是速度很快,默认关闭
9192
0 commit comments