添加表格force_ocr参数

hzkitty · hzkitty · commit 2a0dc6244645 · 2025-09-17T00:00:13.000+08:00
diff --git a/demo.py b/demo.py
@@ -81,6 +81,7 @@ def do_parse(
         # os.environ['MINERU_MODEL_SOURCE'] = 'local'
 
         table_config = {
+            # "force_ocr": False, # 表格文字是否强制使用ocr。默认 False：根据 parse_method 判断使用ocr还是从pdf中直接提取文本；True：跳过pdf文本提取，始终使用ocr
             # "model_type": TableModelType.UNET_SLANET_PLUS,  # （默认） 有线表格使用unet，无线表格使用slanet_plus
             # "model_type": TableModelType.UNET_UNITABLE, # 有线表格使用unet，无线表格使用unitable
             # "model_type": TableModelType.SLANEXT,  # 有线表格使用slanext_wired，无线表格使用slanext_wireless
@@ -232,11 +233,11 @@ def parse_doc(
 
     doc_path_list = [
         # r"D:\CodeProjects\doc\KittyDoc\github\KittyDoc\tests\checkbox_test.png",
+        # "D:\\file\\text-pdf\\示例1-论文模板.pdf",
         # "D:\\file\\text-pdf\\比亚迪财报.pdf",
-        "D:\\file\\text-pdf\\GBT3620.1-2016.pdf",
+        # "D:\\file\\text-pdf\\GBT3620.1-2016.pdf",
         # r'C:\ocr\img\table\3766ae2b506b8f345fcc9eee39b31ac8.png'
         # r'D:\file\text-pdf\img\文字文稿123.pdf',
-        # "D:\\file\\text-pdf\\示例1-论文模板.pdf",
         # r'D:\file\text-pdf\img\table_test.pdf'
         # "D:\\file\\text-pdf\\示例1-论文模板.pdf",
         # "D:\\file\\text-pdf\\示例7-研究报告.pdf",
diff --git a/demo/demo.py b/demo/demo.py
@@ -77,6 +77,7 @@ def do_parse(
         # os.environ['MINERU_MODEL_SOURCE'] = 'local'
 
         table_config = {
+            # "force_ocr": False, # 表格文字是否强制使用ocr。默认 False：根据 parse_method 判断使用ocr还是从pdf中直接提取文本；True：跳过pdf文本提取，始终使用ocr
             # "model_type": TableModelType.UNET_SLANET_PLUS,  # （默认） 有线表格使用unet，无线表格使用slanet_plus
             # "model_type": TableModelType.UNET_UNITABLE, # 有线表格使用unet，无线表格使用unitable
             # "model_type": TableModelType.SLANEXT,  # 有线表格使用slanext_wired，无线表格使用slanext_wireless
diff --git a/docs/analyze_param.md b/docs/analyze_param.md
@@ -108,6 +108,7 @@ formula_config = {
 
 |               参数名                |           说明           |         默认值          | 备注 |
 |:--------------------------------:|:----------------------:|:--------------------:|:--:|
+|            force_ocr            |           表格文字是否强制使用ocr           | False | 为 False 时根据 parse_method 判断使用ocr还是从pdf中直接提取文本；为 True 时始终使用ocr |
 |            model_type            |           模型           | UNET_SLANET_PLUS | 有线表格使用unet，无线表格使用slanet_plus |
 |        model_dir_or_path         |          模型地址          |          None           | 单个模型使用。如SLANET_PLUS、UNITABLE |
 |      cls.model_dir_or_path       |        表格分类模型地址        |         None           |  |
@@ -123,6 +124,7 @@ formula_config = {
 from kitty_doc.model.table.rapid_table_self import ModelType as TableModelType
 
 table_config = {
+    "force_ocr": False, # 表格文字是否强制使用ocr。默认 False：根据 parse_method 判断使用ocr还是从pdf中直接提取文本；True：跳过pdf文本提取，始终使用ocr
     "model_type": TableModelType.UNET_SLANET_PLUS,  # （默认） 有线表格使用unet，无线表格使用slanet_plus
     #"model_type": TableModelType.UNET_UNITABLE, # 有线表格使用unet，无线表格使用unitable
     #"model_type": TableModelType.SLANEXT,  # 有线表格使用slanext_wired，无线表格使用slanext_wireless
diff --git a/kitty_doc/backend/pipeline/batch_analyze.py b/kitty_doc/backend/pipeline/batch_analyze.py
@@ -37,6 +37,7 @@ def __init__(self, model_manager, batch_ratio: int, formula_enable, table_enable
         self.formula_enable = get_formula_enable(formula_enable)
         self.formula_level = formula_config.get("formula_level", 0) if formula_config else 0
         self.table_enable = get_table_enable(table_enable)
+        self.table_force_ocr = table_config.get("force_ocr", False) if table_config else False
         self.checkbox_enable = checkbox_config.get("checkbox_enable", False) if checkbox_config else False
         self.layout_config = layout_config
         self.ocr_config = ocr_config
@@ -325,7 +326,7 @@ def __call__(self, images_with_extra_info: list) -> list:
                     for table_res_dict in table_list:
                         _lang = table_res_dict['lang']
                         ocr_result = None
-                        if not table_res_dict['ocr_enable']:
+                        if not self.table_force_ocr and not table_res_dict['ocr_enable']:
                             # RapidTable非OCR文本提取 OcrText
                             pdf_doc = table_res_dict['pdf_doc']
                             # 进行 OCR-det 识别文字框
diff --git a/kitty_doc/model/table/rapid_table_self/main.py b/kitty_doc/model/table/rapid_table_self/main.py
@@ -10,7 +10,7 @@
 import numpy as np
 from .wired_table_rec.main import WiredTableInput, WiredTableRecognition
 
-from .table_matcher.table_match_pipeline import TableMatchPipeline
+# from .table_matcher.table_match_pipeline import TableMatchPipeline
 from .model_processor.main import ModelProcessor
 from .table_matcher import TableMatch
 from .utils import (
@@ -43,7 +43,7 @@ def __init__(self, cfg: Optional[RapidTableInput] = None):
             self.ocr_engine = self._init_ocr_engine(self.cfg.ocr_params)
 
         self.table_matcher = TableMatch()
-        self.table_matcher_pipeline = TableMatchPipeline()
+        # self.table_matcher_pipeline = TableMatchPipeline()
         self.load_img = LoadImage()
 
     def _init_ocr_engine(self, params: Dict[Any, Any]):
diff --git a/kitty_doc/version.py b/kitty_doc/version.py
@@ -1,2 +1,2 @@
-__version__ = "0.0.1"
-__mineru_version__ = "2.2.1"
+__version__ = "0.1.0"
+__mineru_version__ = "2.2.2"
diff --git a/requirements.txt b/requirements.txt
@@ -20,9 +20,6 @@ onnxruntime>=1.18.1
 openvino>=2024.6.0
 ftfy>=6.3.1,<7
 matplotlib>=3.10,<4
-json_repair
-#scikit-learn
-#img2table
-# wired_table_rec相关包
-scipy
-Shapely
+shapely>=2.0.7,<3
+scikit-image>=0.25.0,<1.0.0
+beautifulsoup4>=4.13.5,<5