From c43db703add42e6d47a97abd8876a2c0135ea995 Mon Sep 17 00:00:00 2001
From: Yizhan <yizhanhuang2002@gmail.com>
Date: Tue, 11 Nov 2025 22:20:52 -0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8DTableRecognitionPipeli?=
 =?UTF-8?q?neV2=E4=B8=ADrec=5Ftexts=E5=92=8Crec=5Fscores=E9=95=BF=E5=BA=A6?=
 =?UTF-8?q?=E4=B8=8D=E4=B8=80=E8=87=B4=E7=9A=84=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

问题描述：
当OCR文本框跨越多个表格单元格需要拆分时，split_ocr_bboxes_by_table_cells方法
只更新了rec_texts列表，但没有同步更新rec_scores列表，导致两者长度不一致。

修复内容：
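- 在split_ocr_bboxes_by_table_cells方法中添加对rec_scores的处理：rec_scores缺失或为None时按空列表处理，带有tolist方法的对象（如numpy数组）先转换为list
- 拆分OCR框时，从每个拆分框的OCR结果（ocr_result）中提取rec_score作为其置信度分数；若结果中没有rec_score，或拆分框过小（宽或高不大于1像素）而未产生识别结果，则回退为原始框的分数
- 未拆分时，保留原始置信度分数；原始分数缺失时统一记为0.0
- 确保rec_texts和rec_scores始终保持相同长度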

修复文件：
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py

Fixes: OCR#17051
---
 .../table_recognition/pipeline_v2.py          | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/paddlex/inference/pipelines/table_recognition/pipeline_v2.py b/paddlex/inference/pipelines/table_recognition/pipeline_v2.py
index c7615d9a34..c9a510be18 100644
--- a/paddlex/inference/pipelines/table_recognition/pipeline_v2.py
+++ b/paddlex/inference/pipelines/table_recognition/pipeline_v2.py
@@ -663,16 +663,24 @@ def split_box_by_cells(ocr_box, cell_indices, cells):
         else:
             ocr_det_results = overall_ocr_res["rec_boxes"]
         ocr_texts = overall_ocr_res["rec_texts"]
+        # Get rec_scores if it exists
+        ocr_scores = overall_ocr_res.get("rec_scores", [])
+        if ocr_scores is not None and hasattr(ocr_scores, "tolist"):
+            ocr_scores = ocr_scores.tolist()
+        elif ocr_scores is None:
+            ocr_scores = []
 
         # Make copies to modify
         new_boxes = []
         new_texts = []
+        new_scores = []
 
         # Process each OCR box
         i = 0
         while i < len(ocr_det_results):
             ocr_box = ocr_det_results[i]
             text = ocr_texts[i]
+            score = ocr_scores[i] if i < len(ocr_scores) else None
             # Find cells that significantly overlap with this OCR box
             overlapping_cells = get_overlapping_cells(ocr_box, cells_det_results)
             # Check if we need to split (spans >= k cells)
@@ -683,6 +691,7 @@ def split_box_by_cells(ocr_box, cell_indices, cells):
                 )
                 # Process each split box
                 split_texts = []
+                split_scores = []
                 for box in split_boxes:
                     x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
                     if y2 - y1 > 1 and x2 - x1 > 1:
@@ -698,21 +707,32 @@ def split_box_by_cells(ocr_box, cell_indices, cells):
                             ]  # Assumes "rec_texts" contains a single string
                         else:
                             result = ""
+                        # Extract the score from the OCR result
+                        if "rec_score" in ocr_result:
+                            result_score = ocr_result["rec_score"]
+                        else:
+                            result_score = score if score is not None else 0.0
                     else:
                         result = ""
+                        result_score = score if score is not None else 0.0
                     split_texts.append(result)
-                # Add split boxes and texts to results
+                    split_scores.append(result_score)
+                # Add split boxes, texts, and scores to results
                 new_boxes.extend(split_boxes)
                 new_texts.extend(split_texts)
+                new_scores.extend(split_scores)
             else:
-                # Keep original box and text
+                # Keep original box, text, and score
                 new_boxes.append(ocr_box)
                 new_texts.append(text)
+                new_scores.append(score if score is not None else 0.0)
             i += 1
 
         # Update the results dictionary
         overall_ocr_res["rec_boxes"] = new_boxes
         overall_ocr_res["rec_texts"] = new_texts
+        if "rec_scores" in overall_ocr_res or len(new_scores) > 0:
+            overall_ocr_res["rec_scores"] = new_scores
 
         return overall_ocr_res