@@ -383,6 +383,71 @@ def _run_intersection_match(self, cell_matcher, table_cells, pdf_cells):
383383 clean_matches = json .loads (clean_matches_string )
384384 return clean_matches
385385
386+ def _find_overlapping (self , table_cells ):
387+
388+ def correct_overlap (box1 , box2 ):
389+ # Extract coordinates from the bounding boxes
390+ x1_min , y1_min , x1_max , y1_max = box1 ["bbox" ]
391+ x2_min , y2_min , x2_max , y2_max = box2 ["bbox" ]
392+
393+ # Calculate the overlap in both x and y directions
394+ overlap_x = min (x1_max , x2_max ) - max (x1_min , x2_min )
395+ overlap_y = min (y1_max , y2_max ) - max (y1_min , y2_min )
396+
397+ # If there is no overlap, return the original boxes
398+ if overlap_x <= 0 or overlap_y <= 0 :
399+ return box1 , box2
400+
401+ # Decide how to push the boxes apart
402+ if overlap_x < overlap_y :
403+ # Push horizontally
404+ if x1_min < x2_min :
405+ # Move box1 to the left and box2 to the right
406+ box1 ["bbox" ][2 ] -= overlap_x
407+ box2 ["bbox" ][0 ] += overlap_x
408+ else :
409+ # Move box2 to the left and box1 to the right
410+ box2 ["bbox" ][2 ] -= overlap_x
411+ box1 ["bbox" ][0 ] += overlap_x
412+ else :
413+ # Push vertically
414+ if y1_min < y2_min :
415+ # Move box1 up and box2 down
416+ box1 ["bbox" ][3 ] -= overlap_y
417+ box2 ["bbox" ][1 ] += overlap_y
418+ else :
419+ # Move box2 up and box1 down
420+ box2 ["bbox" ][3 ] -= overlap_y
421+ box1 ["bbox" ][1 ] += overlap_y
422+
423+ return box1 , box2
424+
425+ def do_boxes_overlap (box1 , box2 ):
426+ # print("{} - {}".format(box1["bbox"], box2["bbox"]))
427+ # Extract coordinates from the bounding boxes
428+ x1_min , y1_min , x1_max , y1_max = box1 ["bbox" ]
429+ x2_min , y2_min , x2_max , y2_max = box2 ["bbox" ]
430+ # Check if one box is to the left of the other
431+ if x1_max < x2_min or x2_max < x1_min :
432+ return False
433+ # Check if one box is above the other
434+ if y1_max < y2_min or y2_max < y1_min :
435+ return False
436+ return True
437+
438+ def find_overlapping_pairs_indexes (bboxes ):
439+ overlapping_indexes = []
440+ # Compare each box with every other box (combinations)
441+ for i in range (len (bboxes )):
442+ for j in range (i + 1 , len (bboxes )):
443+ if do_boxes_overlap (bboxes [i ], bboxes [j ]):
444+ bboxes [i ], bboxes [j ] = correct_overlap (bboxes [i ], bboxes [j ])
445+
446+ return overlapping_indexes , bboxes
447+
448+ overlapping_indexes , table_cells = find_overlapping_pairs_indexes (table_cells )
449+ return table_cells
450+
386451 def _align_table_cells_to_pdf (self , table_cells , pdf_cells , matches ):
387452 r"""
388453 USED in 8.a step
@@ -1261,7 +1326,9 @@ def process(self, matching_details):
12611326 dedupl_table_cells , key = lambda k : k ["cell_id" ]
12621327 )
12631328
1264- if len (pdf_cells ) > 300 :
1329+ if (
1330+ len (pdf_cells ) > 300
1331+ ): # For performance, skip this step if there are too many pdf_cells
12651332 aligned_table_cells2 = dedupl_table_cells_sorted
12661333 else :
12671334 aligned_table_cells2 = self ._align_table_cells_to_pdf (
@@ -1281,6 +1348,10 @@ def process(self, matching_details):
12811348 table_cells_wo = po2
12821349 max_cell_id = po3
12831350
1351+ # As the last step - correct cell bboxes in a way that they don't overlap:
1352+ if len (table_cells_wo ) <= 300 : # For performance reasons
1353+ table_cells_wo = self ._find_overlapping (table_cells_wo )
1354+
12841355 self ._log ().debug ("*** final_matches_wo" )
12851356 self ._log ().debug (final_matches_wo )
12861357 self ._log ().debug ("*** table_cells_wo" )
0 commit comments