Skip to content

Commit 6b7b036

Browse files
maxmnemonic and Maksym Lysak authored
fix: Table model - optimizing align_table_cells_to_pdf in matching_post_processor (#93)
Signed-off-by: Maksym Lysak <[email protected]> Co-authored-by: Maksym Lysak <[email protected]>
1 parent 55b273c commit 6b7b036

File tree

1 file changed

+61
-96
lines changed

1 file changed

+61
-96
lines changed

docling_ibm_models/tableformer/data_management/matching_post_processor.py

Lines changed: 61 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -468,112 +468,77 @@ def find_overlapping_pairs_indexes(bboxes):
468468
return table_cells
469469

470470
def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
    """
    Align table cell bboxes with good matches to encapsulate matching pdf cells

    Every table cell referenced in ``matches`` is copied and its bbox is
    replaced by the bbox of the PDF cell it matched. When the same table
    cell is matched by several PDF cells, its bbox becomes the smallest
    rectangle that contains all of those PDF cell bboxes.

    Parameters
    ----------
    table_cells : list of dict
        Table cells; each dict must carry a "cell_id" key. All other keys
        are copied to the output unchanged (except "bbox", which is
        replaced, and "cell_class", which is defaulted when absent).
    pdf_cells : list of dict
        PDF cells; each dict must carry "id" and "bbox" ([x1, y1, x2, y2]).
    matches : dict
        Maps a pdf cell id (anything ``int()`` accepts) to a list of dicts,
        each with a "table_cell_id" key.

    Returns
    -------
    clean_table_cells : list of dict
        One aligned copy per matched table cell, ordered by first
        appearance while iterating ``matches``. Table cells without any
        match are not returned. The input dicts are not modified.
    """
    pdf_bbox_by_id = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
    table_cell_by_id = {cell["cell_id"]: cell for cell in table_cells}

    # cell_id -> aligned copy. A dict keeps insertion order, which makes the
    # output order deterministic (iterating a set of ids would not be).
    aligned_cells = {}

    for pdf_cell_id, match_list in matches.items():
        pdf_cell_bbox = pdf_bbox_by_id.get(int(pdf_cell_id))
        if not pdf_cell_bbox:
            # Match refers to a PDF cell we do not know (or one without a
            # usable bbox): nothing to align against.
            continue

        # dict.fromkeys de-duplicates ids while keeping match_list order
        unique_ids = dict.fromkeys(
            int(match["table_cell_id"]) for match in match_list
        )

        for cell_id in unique_ids:
            aligned = aligned_cells.get(cell_id)
            if aligned is not None:
                # Table cell already matched by another PDF cell:
                # grow its bbox so that it covers this PDF cell as well.
                x1, y1, x2, y2 = aligned["bbox"]
                aligned["bbox"] = [
                    min(x1, pdf_cell_bbox[0]),
                    min(y1, pdf_cell_bbox[1]),
                    max(x2, pdf_cell_bbox[2]),
                    max(y2, pdf_cell_bbox[3]),
                ]
                continue

            table_cell = table_cell_by_id.get(cell_id)
            if table_cell is None:
                continue

            # Shallow copy so the caller's table cell keeps its own bbox
            aligned = table_cell.copy()
            aligned["bbox"] = list(pdf_cell_bbox)
            # A PDF cell is assigned to this cell, so it cannot be empty:
            # default the class to "2" unless one is already present.
            aligned.setdefault("cell_class", "2")
            aligned_cells[cell_id] = aligned

    clean_table_cells = list(aligned_cells.values())
    return clean_table_cells
578543

579544
def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):

0 commit comments

Comments
 (0)