@@ -468,112 +468,77 @@ def find_overlapping_pairs_indexes(bboxes):
468468 return table_cells
469469
470470 def _align_table_cells_to_pdf (self , table_cells , pdf_cells , matches ):
471- r"""
472- USED in 8.a step
473- NOT USED in 6. step
474-
475- Align table cell bboxes with good matches
476- to encapsulate matching pdf cells
477-
478- Parameters
479- ----------
480- table_cells : list of dict
481- Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
482- pdf_cells : list of dict
483- List of PDF cells as defined by Docling
484- matches : dictionary of lists of table_cells
485- A dictionary which is indexed by the pdf_cell_id as key and the value is a list
486- of the table_cells that fall inside that pdf cell
487-
488- Returns
489- -------
490- clean_table_cells : list of dict
491- Aligned and cleaned table cells
492471 """
493- # 6
494- # align table cells with matching pdf cells
495- new_table_cells = []
496-
497- for pdf_cell_id in matches :
498- match_list = matches [pdf_cell_id ]
499- one_table_cells = []
500- for i in range (len (match_list )):
501- otc = int (match_list [i ]["table_cell_id" ])
502- if otc not in one_table_cells :
503- one_table_cells .append (otc )
504-
505- # Get bbox of pdf_cell:
506- pdf_cell_bbox = []
507- for pdf_cell in pdf_cells :
508- if pdf_cell ["id" ] == int (pdf_cell_id ):
509- pdf_cell_bbox = pdf_cell ["bbox" ]
510-
511- # Get bbox of pdf_cell:
512- for table_cell in table_cells :
513- if table_cell ["cell_id" ] in one_table_cells :
514- # Align bbox vertically to cover PDF cell
515- new_bbox = [
516- pdf_cell_bbox [0 ],
517- pdf_cell_bbox [1 ],
518- pdf_cell_bbox [2 ],
519- pdf_cell_bbox [3 ],
520- ]
521- # We are sure cell is not empty,
522- # because we assign PDF cell to it
523- new_table_cell_class = "2"
524-
525- if "cell_class" in table_cell :
526- new_table_cell_class = table_cell ["cell_class" ]
527-
528- new_table_cell = {
529- "bbox" : new_bbox ,
530- "cell_id" : table_cell ["cell_id" ],
531- "column_id" : table_cell ["column_id" ],
532- "label" : table_cell ["label" ],
533- "row_id" : table_cell ["row_id" ],
534- "cell_class" : new_table_cell_class ,
535- }
472+ Align table cell bboxes with good matches to encapsulate matching pdf cells
473+ """
474+ pdf_cell_dict = {pdf_cell ["id" ]: pdf_cell ["bbox" ] for pdf_cell in pdf_cells }
475+ table_cell_dict = {cell ["cell_id" ]: cell for cell in table_cells }
536476
537- if "colspan_val" in table_cell :
538- new_table_cell ["colspan_val" ] = table_cell ["colspan_val" ]
539- if "rowspan_val" in table_cell :
540- new_table_cell ["rowspan_val" ] = table_cell ["rowspan_val" ]
541- new_table_cells .append (new_table_cell )
477+ # Track unique cells we're going to add
478+ processed_cells = set ()
542479
543- # Rebuild table_cells list deduplicating repeating cells,
544- # encapsulating all duplicate cells dimensions
480+ # First pass - create initial new_table_cells with aligned bboxes
481+ new_table_cells = []
545482
546- for new_table_cell in new_table_cells :
547- cell_id_to_find = new_table_cell ["cell_id" ]
483+ for pdf_cell_id , match_list in matches .items ():
484+ # Extract unique table cell ids from match_list
485+ table_cell_ids = set (int (match ["table_cell_id" ]) for match in match_list )
548486
549- x1s = []
550- y1s = []
551- x2s = []
552- y2s = []
487+ # Get bbox of pdf_cell
488+ pdf_cell_bbox = pdf_cell_dict . get ( int ( pdf_cell_id ))
489+ if not pdf_cell_bbox :
490+ continue
553491
554- found = 0
492+ # Process each unique table cell
493+ for cell_id in table_cell_ids :
494+ if cell_id in processed_cells :
495+ continue
496+
497+ table_cell = table_cell_dict .get (cell_id )
498+ if not table_cell :
499+ continue
500+
501+ # Create new table cell with aligned bbox
502+ new_table_cell = table_cell .copy ()
503+ new_table_cell ["bbox" ] = list (pdf_cell_bbox )
504+
505+ # Set cell class
506+ if "cell_class" not in new_table_cell :
507+ new_table_cell ["cell_class" ] = "2"
508+
509+ new_table_cells .append (new_table_cell )
510+ processed_cells .add (cell_id )
511+
512+ # Second pass - aggregate bboxes for duplicate cells
513+ cell_to_bboxes = {}
514+ for cell in new_table_cells :
515+ cell_id = cell ["cell_id" ]
516+ if cell_id not in cell_to_bboxes :
517+ cell_to_bboxes [cell_id ] = []
518+ cell_to_bboxes [cell_id ].append (cell ["bbox" ])
519+
520+ # Create final clean table cells
521+ clean_table_cells = []
522+ processed_ids = set ()
523+
524+ for cell in new_table_cells :
525+ cell_id = cell ["cell_id" ]
526+ if cell_id in processed_ids :
527+ continue
555528
556- for found_cell in new_table_cells :
557- if found_cell [ "cell_id" ] == cell_id_to_find :
558- found += 1
559- x1s . append ( found_cell [ " bbox" ] [0 ])
560- y1s . append ( found_cell [ " bbox" ] [1 ])
561- x2s . append ( found_cell [ " bbox" ] [2 ])
562- y2s . append ( found_cell [ " bbox" ] [3 ])
529+ bboxes = cell_to_bboxes [ cell_id ]
530+ if len ( bboxes ) > 1 :
531+ # Merge bboxes
532+ x1s = [ bbox [0 ] for bbox in bboxes ]
533+ y1s = [ bbox [1 ] for bbox in bboxes ]
534+ x2s = [ bbox [2 ] for bbox in bboxes ]
535+ y2s = [ bbox [3 ] for bbox in bboxes ]
563536
564- min_x1 = min (x1s )
565- min_y1 = min (y1s )
566- max_x2 = max (x2s )
567- max_y2 = max (y2s )
537+ cell ["bbox" ] = [min (x1s ), min (y1s ), max (x2s ), max (y2s )]
568538
569- if found > 1 :
570- new_table_cell [ "bbox" ] = [ min_x1 , min_y1 , max_x2 , max_y2 ]
539+ clean_table_cells . append ( cell )
540+ processed_ids . add ( cell_id )
571541
572- clean_table_cells = [
573- i
574- for n , i in enumerate (new_table_cells )
575- if i not in new_table_cells [n + 1 :]
576- ]
577542 return clean_table_cells
578543
579544 def _deduplicate_cells (self , tab_columns , table_cells , iou_matches , ioc_matches ):
0 commit comments