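refactor: add use_ocr option, pass OCR results as [boxes, txts, scores] via ocr_results, and move cell-bbox rescaling/filtering into the table-structure module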

SWHL · SWHL · commit fc4c1b95ff48 · 2025-06-22T15:10:09.000+08:00
diff --git a/README.md b/README.md
@@ -78,12 +78,12 @@ unitable是来源unitable的transformer模型，精度最高，暂仅支持pytor
 
     table_engine = RapidTable(input_args)
 
-    img_path = "<https://raw.githubusercontent.com/RapidAI/RapidTable/refs/heads/main/tests/test_files/table.jpg>"
-    rapid_ocr_output = ocr_engine(img_path)
-    ocr_result = list(
-        zip(rapid_ocr_output.boxes, rapid_ocr_output.txts, rapid_ocr_output.scores)
-    )
-    results = table_engine(img_path, ocr_result)
+    img_path = "https://raw.githubusercontent.com/RapidAI/RapidTable/refs/heads/main/tests/test_files/table.jpg"
+
+    ori_ocr_res = ocr_engine(img_path)
+    ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
+
+    results = table_engine(img_path, ocr_results=ocr_results)
     results.vis(save_dir="outputs", save_name="vis")
     ```
 
@@ -162,19 +162,17 @@ table_engine = RapidTable(input_args)
 
 img_path = "https://raw.githubusercontent.com/RapidAI/RapidTable/refs/heads/main/tests/test_files/table.jpg"
 
-# 使用单字识别
-# rapid_ocr_output = ocr_engine(img_path, return_word_box=True)
-# word_results = rapid_ocr_output.word_results
-# ocr_result = [
+# # 使用单字识别
+# ori_ocr_res = ocr_engine(img_path, return_word_box=True)
+# ocr_results = [
 #     [word_result[0][2], word_result[0][0], word_result[0][1]]
-#     for word_result in word_results
+#     for word_result in ori_ocr_res.word_results
 # ]
+# ocr_results = list(zip(*ocr_results))
 
-rapid_ocr_output = ocr_engine(img_path)
-ocr_result = list(
-    zip(rapid_ocr_output.boxes, rapid_ocr_output.txts, rapid_ocr_output.scores)
-)
-results = table_engine(img_path, ocr_result)
+ori_ocr_res = ocr_engine(img_path)
+ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
+results = table_engine(img_path, ocr_results=ocr_results)
 results.vis(save_dir="outputs", save_name="vis")
 ```
 
@@ -201,11 +199,11 @@ input_args = RapidTableInput(
 table_engine = RapidTable(input_args)
 
 img_path = "https://raw.githubusercontent.com/RapidAI/RapidTable/refs/heads/main/tests/test_files/table.jpg"
-rapid_ocr_output = ocr_engine(img_path)
-ocr_result = list(
-    zip(rapid_ocr_output.boxes, rapid_ocr_output.txts, rapid_ocr_output.scores)
-)
-results = table_engine(img_path, ocr_result)
+
+ori_ocr_res = ocr_engine(img_path)
+ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
+
+results = table_engine(img_path, ocr_results=ocr_results)
 results.vis(save_dir="outputs", save_name="vis")
 ```
 
diff --git a/demo.py b/demo.py
@@ -10,19 +10,18 @@
 input_args = RapidTableInput(model_type=ModelType.UNITABLE)
 table_engine = RapidTable(input_args)
 
-img_path = "https://raw.githubusercontent.com/RapidAI/RapidTable/refs/heads/main/tests/test_files/table.jpg"
+img_path = "tests/test_files/table_without_txt.jpg"
+# img_path = "https://raw.githubusercontent.com/RapidAI/RapidTable/refs/heads/main/tests/test_files/table.jpg"
 
-# 使用单字识别
-# rapid_ocr_output = ocr_engine(img_path, return_word_box=True)
-# word_results = rapid_ocr_output.word_results
-# ocr_result = [
+# # 使用单字识别
+# ori_ocr_res = ocr_engine(img_path, return_word_box=True)
+# ocr_results = [
 #     [word_result[0][2], word_result[0][0], word_result[0][1]]
-#     for word_result in word_results
+#     for word_result in ori_ocr_res.word_results
 # ]
+# ocr_results = list(zip(*ocr_results))
 
-rapid_ocr_output = ocr_engine(img_path)
-ocr_result = list(
-    zip(rapid_ocr_output.boxes, rapid_ocr_output.txts, rapid_ocr_output.scores)
-)
-results = table_engine(img_path, ocr_result)
+ori_ocr_res = ocr_engine(img_path)
+ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
+results = table_engine(img_path)
 results.vis(save_dir="outputs", save_name="vis")
diff --git a/rapid_table/main.py b/rapid_table/main.py
@@ -17,6 +17,7 @@
     ModelType,
     RapidTableInput,
     RapidTableOutput,
+    get_boxes_recs,
     import_package,
 )
 
@@ -34,7 +35,11 @@ def __init__(self, cfg: Optional[RapidTableInput] = None):
 
         self.cfg = cfg
         self.table_structure = self._init_table_structer()
-        self.ocr_engine = self._init_ocr_engine()
+
+        self.ocr_engine = None
+        if cfg.use_ocr:
+            self.ocr_engine = self._init_ocr_engine()
+
         self.table_matcher = TableMatch()
         self.load_img = LoadImage()
 
@@ -58,72 +63,48 @@ def _init_table_structer(self):
     def __call__(
         self,
         img_content: Union[str, np.ndarray, bytes, Path],
-        ocr_result: Optional[List[Union[List[List[float]], str, str]]] = None,
+        ocr_results: Optional[Tuple[np.ndarray, Tuple[str], Tuple[float]]] = None,
     ) -> RapidTableOutput:
-        if self.ocr_engine is None and ocr_result is None:
-            raise ValueError(
-                "One of two conditions must be met: ocr_result is not empty, or rapidocr is installed."
-            )
+        s = time.perf_counter()
 
         img = self.load_img(img_content)
 
-        s = time.perf_counter()
-        h, w = img.shape[:2]
-
-        if ocr_result is None:
-            ocr_result = self.ocr_engine(img)
-            ocr_result = list(
-                zip(
-                    ocr_result.boxes,
-                    ocr_result.txts,
-                    ocr_result.scores,
-                )
-            )
-        dt_boxes, rec_res = self.get_boxes_recs(ocr_result, h, w)
+        dt_boxes, rec_res = self.get_ocr_results(img, ocr_results)
+        pred_structures, cell_bboxes, logic_points = self.get_table_rec_results(img)
+        pred_html = self.get_table_matcher(
+            pred_structures, cell_bboxes, dt_boxes, rec_res
+        )
 
-        pred_structures, cell_bboxes, _ = self.table_structure(img)
+        elapse = time.perf_counter() - s
+        return RapidTableOutput(img, pred_html, cell_bboxes, logic_points, elapse)
 
-        # 适配slanet-plus模型输出的box缩放还原
-        if self.cfg.model_type == ModelType.SLANETPLUS:
-            cell_bboxes = self.adapt_slanet_plus(img, cell_bboxes)
+    def get_ocr_results(
+        self, img: np.ndarray, ocr_results: Tuple[np.ndarray, Tuple[str], Tuple[float]]
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
+        if ocr_results is not None:
+            return get_boxes_recs(ocr_results, img.shape[:2])
 
-        pred_html = self.table_matcher(pred_structures, cell_bboxes, dt_boxes, rec_res)
+        if not self.cfg.use_ocr:
+            return None, None
 
-        # 过滤掉占位的bbox
-        mask = ~np.all(cell_bboxes == 0, axis=1)
-        cell_bboxes = cell_bboxes[mask]
+        ori_ocr_res = self.ocr_engine(img)
+        if ori_ocr_res.boxes is None:
+            logger.warning("OCR Result is empty")
+            return None, None
 
+        ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
+        return get_boxes_recs(ocr_results, img.shape[:2])
+
+    def get_table_rec_results(self, img: np.ndarray):
+        pred_structures, cell_bboxes, _ = self.table_structure(img)
         logic_points = self.table_matcher.decode_logic_points(pred_structures)
-        elapse = time.perf_counter() - s
-        return RapidTableOutput(img, pred_html, cell_bboxes, logic_points, elapse)
+        return pred_structures, cell_bboxes, logic_points
 
-    def get_boxes_recs(
-        self, ocr_result: List[Union[List[List[float]], str, str]], h: int, w: int
-    ) -> Tuple[np.ndarray, Tuple[str, str]]:
-        dt_boxes, rec_res, scores = list(zip(*ocr_result))
-        rec_res = list(zip(rec_res, scores))
-
-        r_boxes = []
-        for box in dt_boxes:
-            box = np.array(box)
-            x_min = max(0, box[:, 0].min() - 1)
-            x_max = min(w, box[:, 0].max() + 1)
-            y_min = max(0, box[:, 1].min() - 1)
-            y_max = min(h, box[:, 1].max() + 1)
-            box = [x_min, y_min, x_max, y_max]
-            r_boxes.append(box)
-        dt_boxes = np.array(r_boxes)
-        return dt_boxes, rec_res
-
-    def adapt_slanet_plus(self, img: np.ndarray, cell_bboxes: np.ndarray) -> np.ndarray:
-        h, w = img.shape[:2]
-        resized = 488
-        ratio = min(resized / h, resized / w)
-        w_ratio = resized / (w * ratio)
-        h_ratio = resized / (h * ratio)
-        cell_bboxes[:, 0::2] *= w_ratio
-        cell_bboxes[:, 1::2] *= h_ratio
-        return cell_bboxes
+    def get_table_matcher(self, pred_structures, cell_bboxes, dt_boxes, rec_res):
+        if dt_boxes is None and rec_res is None:
+            return None
+
+        return self.table_matcher(pred_structures, cell_bboxes, dt_boxes, rec_res)
 
 
 def parse_args(arg_list: Optional[List[str]] = None):
@@ -158,11 +139,9 @@ def main(arg_list: Optional[List[str]] = None):
     if table_engine.ocr_engine is None:
         raise ValueError("ocr engine is None")
 
-    rapid_ocr_output = table_engine.ocr_engine(img_path)
-    ocr_result = list(
-        zip(rapid_ocr_output.boxes, rapid_ocr_output.txts, rapid_ocr_output.scores)
-    )
-    table_results = table_engine(img_path, ocr_result)
+    ori_ocr_res = table_engine.ocr_engine(img_path)
+    ocr_results = [ori_ocr_res.boxes, ori_ocr_res.txts, ori_ocr_res.scores]
+    table_results = table_engine(img_path, ocr_results=ocr_results)
     print(table_results.pred_html)
 
     if args.vis:
diff --git a/rapid_table/table_matcher/main.py b/rapid_table/table_matcher/main.py
@@ -25,6 +25,7 @@ def __init__(self, filter_ocr_result=True, use_master=False):
     def __call__(self, pred_structures, cell_bboxes, dt_boxes, rec_res):
         if self.filter_ocr_result:
             dt_boxes, rec_res = self._filter_ocr_result(cell_bboxes, dt_boxes, rec_res)
+
         matched_index = self.match_result(dt_boxes, cell_bboxes)
         pred_html, pred = self.get_pred_html(pred_structures, matched_index, rec_res)
         return pred_html
diff --git a/rapid_table/table_structure/pp_structure/main.py b/rapid_table/table_structure/pp_structure/main.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 
-from rapid_table.utils.typings import EngineType
+from rapid_table.utils.typings import EngineType, ModelType
 
 from ...inference_engine.base import get_engine
 from ..utils import get_struct_str
@@ -29,22 +29,46 @@ def __init__(self, cfg: Dict[str, Any]):
         if cfg["engine_type"] is None:
             cfg["engine_type"] = EngineType.ONNXRUNTIME
         self.session = get_engine(cfg["engine_type"])(cfg)
+        self.cfg = cfg
 
         self.preprocess_op = TablePreprocess()
 
         self.character = self.session.get_character_list()
         self.postprocess_op = TableLabelDecode(self.character)
 
-    def __call__(self, img: np.ndarray) -> Tuple[List[str], np.ndarray, float]:
+    def __call__(self, ori_img: np.ndarray) -> Tuple[List[str], np.ndarray, float]:
         s = time.perf_counter()
 
-        img, shape_list = self.preprocess_op(img)
+        img, shape_list = self.preprocess_op(ori_img)
 
         bbox_preds, struct_probs = self.session(img.copy())
 
         post_result = self.postprocess_op(bbox_preds, struct_probs, [shape_list])
+
         table_struct_str = get_struct_str(post_result["structure_batch_list"][0][0])
-        bbox_list = post_result["bbox_batch_list"][0]
+        cell_bboxes = post_result["bbox_batch_list"][0]
+
+        if self.cfg["model_type"] == ModelType.SLANETPLUS:
+            cell_bboxes = self.rescale_cell_bboxes(ori_img, cell_bboxes)
+        cell_bboxes = self.filter_blank_bbox(cell_bboxes)
 
         elapse = time.perf_counter() - s
-        return table_struct_str, bbox_list, elapse
+        return table_struct_str, cell_bboxes, elapse
+
+    def rescale_cell_bboxes(
+        self, img: np.ndarray, cell_bboxes: np.ndarray
+    ) -> np.ndarray:
+        h, w = img.shape[:2]
+        resized = 488
+        ratio = min(resized / h, resized / w)
+        w_ratio = resized / (w * ratio)
+        h_ratio = resized / (h * ratio)
+        cell_bboxes[:, 0::2] *= w_ratio
+        cell_bboxes[:, 1::2] *= h_ratio
+        return cell_bboxes
+
+    @staticmethod
+    def filter_blank_bbox(cell_bboxes: np.ndarray) -> np.ndarray:
+        # 过滤掉占位的bbox
+        mask = ~np.all(cell_bboxes == 0, axis=1)
+        return cell_bboxes[mask]
diff --git a/rapid_table/utils/__init__.py b/rapid_table/utils/__init__.py
@@ -5,5 +5,5 @@
 from .load_image import LoadImage
 from .logger import Logger
 from .typings import EngineType, ModelType, RapidTableInput, RapidTableOutput
-from .utils import import_package, is_url, mkdir, read_yaml
+from .utils import get_boxes_recs, import_package, is_url, mkdir, read_yaml
 from .vis import VisTable
diff --git a/rapid_table/utils/typings.py b/rapid_table/utils/typings.py
@@ -29,6 +29,8 @@ class RapidTableInput:
     model_type: Optional[ModelType] = ModelType.SLANETPLUS
     model_dir_or_path: Union[str, Path, None, Dict[str, str]] = None
 
+    use_ocr: bool = True
+
     engine_type: Optional[EngineType] = None
     engine_cfg: dict = field(default_factory=dict)
 
diff --git a/rapid_table/utils/utils.py b/rapid_table/utils/utils.py
@@ -4,14 +4,33 @@
 import hashlib
 import importlib
 from pathlib import Path
-from typing import Union
+from typing import Tuple, Union
 from urllib.parse import urlparse
 
 import cv2
 import numpy as np
 from omegaconf import DictConfig, OmegaConf
 
 
+def get_boxes_recs(
+    ocr_results: Tuple[np.ndarray, Tuple[str], Tuple[float]],
+    img_shape: Tuple[int, int],
+) -> Tuple[np.ndarray, Tuple[str, str]]:
+    rec_res = list(zip(ocr_results[1], ocr_results[2]))
+
+    h, w = img_shape
+    dt_boxes = []
+    for box in ocr_results[0]:
+        box = np.array(box)
+        x_min = max(0, box[:, 0].min() - 1)
+        x_max = min(w, box[:, 0].max() + 1)
+        y_min = max(0, box[:, 1].min() - 1)
+        y_max = min(h, box[:, 1].max() + 1)
+        box = [x_min, y_min, x_max, y_max]
+        dt_boxes.append(box)
+    return np.array(dt_boxes), rec_res
+
+
 def save_img(save_path: Union[str, Path], img: np.ndarray):
     cv2.imwrite(str(save_path), img)
 
diff --git a/rapid_table/utils/vis.py b/rapid_table/utils/vis.py
@@ -24,7 +24,7 @@ def __call__(
         save_drawed_path: Optional[str] = None,
         save_logic_path: Optional[str] = None,
     ):
-        if save_html_path:
+        if pred_html and save_html_path:
             html_with_border = self.insert_border_style(pred_html)
             save_txt(save_html_path, html_with_border)
             self.logger.info(f"Save HTML to {save_html_path}")
diff --git a/tests/test_files/table_without_txt.jpg b/tests/test_files/table_without_txt.jpg
diff --git a/tests/test_main.py b/tests/test_main.py