fix: Table model - optimizing align_table_cells_to_pdf in matching_post_cessor (#93)

maxmnemonic · Maksym Lysak · web-flow · commit 6b7b0363c680 · 2025-04-23T09:44:17.000+02:00
Signed-off-by: Maksym Lysak &lt;mly@zurich.ibm.com&gt;
Co-authored-by: Maksym Lysak &lt;mly@zurich.ibm.com&gt;
diff --git a/docling_ibm_models/tableformer/data_management/matching_post_processor.py b/docling_ibm_models/tableformer/data_management/matching_post_processor.py
@@ -468,112 +468,77 @@ def find_overlapping_pairs_indexes(bboxes):
         return table_cells
 
     def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
-        r"""
-        USED in 8.a step
-        NOT USED in 6. step
-
-        Align table cell bboxes with good matches
-        to encapsulate matching pdf cells
-
-        Parameters
-        ----------
-        table_cells : list of dict
-            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
-        pdf_cells : list of dict
-            List of PDF cells as defined by Docling
-        matches : dictionary of lists of table_cells
-            A dictionary which is indexed by the pdf_cell_id as key and the value is a list
-            of the table_cells that fall inside that pdf cell
-
-        Returns
-        -------
-        clean_table_cells : list of dict
-            Aligned and cleaned table cells
         """
-        # 6
-        # align table cells with matching pdf cells
-        new_table_cells = []
-
-        for pdf_cell_id in matches:
-            match_list = matches[pdf_cell_id]
-            one_table_cells = []
-            for i in range(len(match_list)):
-                otc = int(match_list[i]["table_cell_id"])
-                if otc not in one_table_cells:
-                    one_table_cells.append(otc)
-
-            # Get bbox of pdf_cell:
-            pdf_cell_bbox = []
-            for pdf_cell in pdf_cells:
-                if pdf_cell["id"] == int(pdf_cell_id):
-                    pdf_cell_bbox = pdf_cell["bbox"]
-
-            # Get bbox of pdf_cell:
-            for table_cell in table_cells:
-                if table_cell["cell_id"] in one_table_cells:
-                    # Align bbox vertically to cover PDF cell
-                    new_bbox = [
-                        pdf_cell_bbox[0],
-                        pdf_cell_bbox[1],
-                        pdf_cell_bbox[2],
-                        pdf_cell_bbox[3],
-                    ]
-                    # We are sure cell is not empty,
-                    # because we assign PDF cell to it
-                    new_table_cell_class = "2"
-
-                    if "cell_class" in table_cell:
-                        new_table_cell_class = table_cell["cell_class"]
-
-                    new_table_cell = {
-                        "bbox": new_bbox,
-                        "cell_id": table_cell["cell_id"],
-                        "column_id": table_cell["column_id"],
-                        "label": table_cell["label"],
-                        "row_id": table_cell["row_id"],
-                        "cell_class": new_table_cell_class,
-                    }
+        Align table cell bboxes with good matches to encapsulate matching pdf cells
+        """
+        pdf_cell_dict = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
+        table_cell_dict = {cell["cell_id"]: cell for cell in table_cells}
 
-                    if "colspan_val" in table_cell:
-                        new_table_cell["colspan_val"] = table_cell["colspan_val"]
-                    if "rowspan_val" in table_cell:
-                        new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
-                    new_table_cells.append(new_table_cell)
+        # Track unique cells we're going to add
+        processed_cells = set()
 
-        # Rebuild table_cells list deduplicating repeating cells,
-        # encapsulating all duplicate cells dimensions
+        # First pass - create initial new_table_cells with aligned bboxes
+        new_table_cells = []
 
-        for new_table_cell in new_table_cells:
-            cell_id_to_find = new_table_cell["cell_id"]
+        for pdf_cell_id, match_list in matches.items():
+            # Extract unique table cell ids from match_list
+            table_cell_ids = set(int(match["table_cell_id"]) for match in match_list)
 
-            x1s = []
-            y1s = []
-            x2s = []
-            y2s = []
+            # Get bbox of pdf_cell
+            pdf_cell_bbox = pdf_cell_dict.get(int(pdf_cell_id))
+            if not pdf_cell_bbox:
+                continue
 
-            found = 0
+            # Process each unique table cell
+            for cell_id in table_cell_ids:
+                if cell_id in processed_cells:
+                    continue
+
+                table_cell = table_cell_dict.get(cell_id)
+                if not table_cell:
+                    continue
+
+                # Create new table cell with aligned bbox
+                new_table_cell = table_cell.copy()
+                new_table_cell["bbox"] = list(pdf_cell_bbox)
+
+                # Set cell class
+                if "cell_class" not in new_table_cell:
+                    new_table_cell["cell_class"] = "2"
+
+                new_table_cells.append(new_table_cell)
+                processed_cells.add(cell_id)
+
+        # Second pass - aggregate bboxes for duplicate cells
+        cell_to_bboxes = {}
+        for cell in new_table_cells:
+            cell_id = cell["cell_id"]
+            if cell_id not in cell_to_bboxes:
+                cell_to_bboxes[cell_id] = []
+            cell_to_bboxes[cell_id].append(cell["bbox"])
+
+        # Create final clean table cells
+        clean_table_cells = []
+        processed_ids = set()
+
+        for cell in new_table_cells:
+            cell_id = cell["cell_id"]
+            if cell_id in processed_ids:
+                continue
 
-            for found_cell in new_table_cells:
-                if found_cell["cell_id"] == cell_id_to_find:
-                    found += 1
-                    x1s.append(found_cell["bbox"][0])
-                    y1s.append(found_cell["bbox"][1])
-                    x2s.append(found_cell["bbox"][2])
-                    y2s.append(found_cell["bbox"][3])
+            bboxes = cell_to_bboxes[cell_id]
+            if len(bboxes) > 1:
+                # Merge bboxes
+                x1s = [bbox[0] for bbox in bboxes]
+                y1s = [bbox[1] for bbox in bboxes]
+                x2s = [bbox[2] for bbox in bboxes]
+                y2s = [bbox[3] for bbox in bboxes]
 
-            min_x1 = min(x1s)
-            min_y1 = min(y1s)
-            max_x2 = max(x2s)
-            max_y2 = max(y2s)
+                cell["bbox"] = [min(x1s), min(y1s), max(x2s), max(y2s)]
 
-            if found > 1:
-                new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]
+            clean_table_cells.append(cell)
+            processed_ids.add(cell_id)
 
-        clean_table_cells = [
-            i
-            for n, i in enumerate(new_table_cells)
-            if i not in new_table_cells[n + 1 :]
-        ]
         return clean_table_cells
 
     def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):