Skip to content

Commit 2a0dc62

Browse files
committed
添加表格force_ocr参数
1 parent f66cb3d commit 2a0dc62

File tree

7 files changed

+15
-13
lines changed

7 files changed

+15
-13
lines changed

demo.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def do_parse(
8181
# os.environ['MINERU_MODEL_SOURCE'] = 'local'
8282

8383
table_config = {
84+
# "force_ocr": False, # 表格文字是否强制使用 ocr,默认 False;为 False 时根据 parse_method 判断使用 ocr 还是从 pdf 中直接提取文本
8485
# "model_type": TableModelType.UNET_SLANET_PLUS, # (默认) 有线表格使用unet,无线表格使用slanet_plus
8586
# "model_type": TableModelType.UNET_UNITABLE, # 有线表格使用unet,无线表格使用unitable
8687
# "model_type": TableModelType.SLANEXT, # 有线表格使用slanext_wired,无线表格使用slanext_wireless
@@ -232,11 +233,11 @@ def parse_doc(
232233

233234
doc_path_list = [
234235
# r"D:\CodeProjects\doc\KittyDoc\github\KittyDoc\tests\checkbox_test.png",
236+
# "D:\\file\\text-pdf\\示例1-论文模板.pdf",
235237
# "D:\\file\\text-pdf\\比亚迪财报.pdf",
236-
"D:\\file\\text-pdf\\GBT3620.1-2016.pdf",
238+
# "D:\\file\\text-pdf\\GBT3620.1-2016.pdf",
237239
# r'C:\ocr\img\table\3766ae2b506b8f345fcc9eee39b31ac8.png'
238240
# r'D:\file\text-pdf\img\文字文稿123.pdf',
239-
# "D:\\file\\text-pdf\\示例1-论文模板.pdf",
240241
# r'D:\file\text-pdf\img\table_test.pdf'
241242
# "D:\\file\\text-pdf\\示例1-论文模板.pdf",
242243
# "D:\\file\\text-pdf\\示例7-研究报告.pdf",

demo/demo.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def do_parse(
7777
# os.environ['MINERU_MODEL_SOURCE'] = 'local'
7878

7979
table_config = {
80+
# "force_ocr": False, # 表格文字是否强制使用 ocr,默认 False;为 False 时根据 parse_method 判断使用 ocr 还是从 pdf 中直接提取文本
8081
# "model_type": TableModelType.UNET_SLANET_PLUS, # (默认) 有线表格使用unet,无线表格使用slanet_plus
8182
# "model_type": TableModelType.UNET_UNITABLE, # 有线表格使用unet,无线表格使用unitable
8283
# "model_type": TableModelType.SLANEXT, # 有线表格使用slanext_wired,无线表格使用slanext_wireless

docs/analyze_param.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ formula_config = {
108108

109109
| 参数名 | 说明 | 默认值 | 备注 |
110110
|:--------------------------------:|:----------------------:|:--------------------:|:--:|
111+
| force_ocr | 表格文字是否强制使用 ocr | False | 为 True 时表格文字始终使用 ocr;为 False 时根据 parse_method 判断使用 ocr 还是从 pdf 中直接提取文本 |
111112
| model_type | 模型 | UNET_SLANET_PLUS | 有线表格使用unet,无线表格使用slanet_plus |
112113
| model_dir_or_path | 模型地址 | None | 单个模型使用。如SLANET_PLUS、UNITABLE |
113114
| cls.model_dir_or_path | 表格分类模型地址 | None | |
@@ -123,6 +124,7 @@ formula_config = {
123124
from kitty_doc.model.table.rapid_table_self import ModelType as TableModelType
124125

125126
table_config = {
127+
"force_ocr": False, # 表格文字是否强制使用 ocr,默认 False;为 False 时根据 parse_method 判断使用 ocr 还是从 pdf 中直接提取文本
126128
"model_type": TableModelType.UNET_SLANET_PLUS, # (默认) 有线表格使用unet,无线表格使用slanet_plus
127129
#"model_type": TableModelType.UNET_UNITABLE, # 有线表格使用unet,无线表格使用unitable
128130
#"model_type": TableModelType.SLANEXT, # 有线表格使用slanext_wired,无线表格使用slanext_wireless

kitty_doc/backend/pipeline/batch_analyze.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(self, model_manager, batch_ratio: int, formula_enable, table_enable
3737
self.formula_enable = get_formula_enable(formula_enable)
3838
self.formula_level = formula_config.get("formula_level", 0) if formula_config else 0
3939
self.table_enable = get_table_enable(table_enable)
40+
self.table_force_ocr = table_config.get("force_ocr", False) if table_config else False
4041
self.checkbox_enable = checkbox_config.get("checkbox_enable", False) if checkbox_config else False
4142
self.layout_config = layout_config
4243
self.ocr_config = ocr_config
@@ -325,7 +326,7 @@ def __call__(self, images_with_extra_info: list) -> list:
325326
for table_res_dict in table_list:
326327
_lang = table_res_dict['lang']
327328
ocr_result = None
328-
if not table_res_dict['ocr_enable']:
329+
if not self.table_force_ocr and not table_res_dict['ocr_enable']:
329330
# RapidTable非OCR文本提取 OcrText
330331
pdf_doc = table_res_dict['pdf_doc']
331332
# 进行 OCR-det 识别文字框

kitty_doc/model/table/rapid_table_self/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import numpy as np
1111
from .wired_table_rec.main import WiredTableInput, WiredTableRecognition
1212

13-
from .table_matcher.table_match_pipeline import TableMatchPipeline
13+
# from .table_matcher.table_match_pipeline import TableMatchPipeline
1414
from .model_processor.main import ModelProcessor
1515
from .table_matcher import TableMatch
1616
from .utils import (
@@ -43,7 +43,7 @@ def __init__(self, cfg: Optional[RapidTableInput] = None):
4343
self.ocr_engine = self._init_ocr_engine(self.cfg.ocr_params)
4444

4545
self.table_matcher = TableMatch()
46-
self.table_matcher_pipeline = TableMatchPipeline()
46+
# self.table_matcher_pipeline = TableMatchPipeline()
4747
self.load_img = LoadImage()
4848

4949
def _init_ocr_engine(self, params: Dict[Any, Any]):

kitty_doc/version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version__ = "0.0.1"
2-
__mineru_version__ = "2.2.1"
1+
__version__ = "0.1.0"
2+
__mineru_version__ = "2.2.2"

requirements.txt

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ onnxruntime>=1.18.1
2020
openvino>=2024.6.0
2121
ftfy>=6.3.1,<7
2222
matplotlib>=3.10,<4
23-
json_repair
24-
#scikit-learn
25-
#img2table
26-
# wired_table_rec相关包
27-
scipy
28-
Shapely
23+
shapely>=2.0.7,<3
24+
scikit-image>=0.25.0,<1.0.0
25+
beautifulsoup4>=4.13.5,<5

0 commit comments

Comments
 (0)