|
4 | 4 | # |
5 | 5 | import json |
6 | 6 | import logging |
| 7 | +import math |
7 | 8 | import statistics |
8 | 9 |
|
9 | 10 | import docling_ibm_models.tableformer.settings as s |
@@ -403,45 +404,63 @@ def correct_overlap(box1, box2): |
403 | 404 | # Push horizontally |
404 | 405 | if x1_min < x2_min: |
405 | 406 | # Move box1 to the left and box2 to the right |
406 | | - box1["bbox"][2] -= overlap_x |
407 | | - box2["bbox"][0] += overlap_x |
| 407 | + box1["bbox"][2] -= math.ceil(overlap_x / 2) + 2 |
| 408 | + box2["bbox"][0] += math.floor(overlap_x / 2) |
408 | 409 | else: |
409 | 410 | # Move box2 to the left and box1 to the right |
410 | | - box2["bbox"][2] -= overlap_x |
411 | | - box1["bbox"][0] += overlap_x |
| 411 | + box2["bbox"][2] -= math.ceil(overlap_x / 2) + 2 |
| 412 | + box1["bbox"][0] += math.floor(overlap_x / 2) |
412 | 413 | else: |
413 | 414 | # Push vertically |
414 | 415 | if y1_min < y2_min: |
415 | 416 | # Move box1 up and box2 down |
416 | | - box1["bbox"][3] -= overlap_y |
417 | | - box2["bbox"][1] += overlap_y |
| 417 | + box1["bbox"][3] -= math.ceil(overlap_y / 2) + 2 |
| 418 | + box2["bbox"][1] += math.floor(overlap_y / 2) |
418 | 419 | else: |
419 | 420 | # Move box2 up and box1 down |
420 | | - box2["bbox"][3] -= overlap_y |
421 | | - box1["bbox"][1] += overlap_y |
| 421 | + box2["bbox"][3] -= math.ceil(overlap_y / 2) + 2 |
| 422 | + box1["bbox"][1] += math.floor(overlap_y / 2) |
| 423 | + |
| 424 | + # Restore the (min, max) coordinate order in case the adjustments above inverted a box |
| 425 | + box1["bbox"] = [ |
| 426 | + min(box1["bbox"][0], box1["bbox"][2]), |
| 427 | + min(box1["bbox"][1], box1["bbox"][3]), |
| 428 | + max(box1["bbox"][0], box1["bbox"][2]), |
| 429 | + max(box1["bbox"][1], box1["bbox"][3]), |
| 430 | + ] |
| 431 | + box2["bbox"] = [ |
| 432 | + min(box2["bbox"][0], box2["bbox"][2]), |
| 433 | + min(box2["bbox"][1], box2["bbox"][3]), |
| 434 | + max(box2["bbox"][0], box2["bbox"][2]), |
| 435 | + max(box2["bbox"][1], box2["bbox"][3]), |
| 436 | + ] |
422 | 437 |
|
423 | 438 | return box1, box2 |
424 | 439 |
|
def do_boxes_overlap(box1, box2):
    r"""
    Check whether two cell bounding boxes share a non-empty area

    Parameters
    ----------
    box1 : dict
        Cell with a "bbox" entry indexed as [x_min, y_min, x_max, y_max]
    box2 : dict
        Cell with a "bbox" entry indexed as [x_min, y_min, x_max, y_max]

    Returns
    -------
    bool
        True if the boxes intersect with a non-zero area.
        False if they are disjoint or merely touch along an edge or a corner
    """
    x1_min, y1_min, x1_max, y1_max = box1["bbox"][:4]
    x2_min, y2_min, x2_max, y2_max = box2["bbox"][:4]

    # Non-strict comparisons: boxes that only share a border are NOT overlapping
    separated_horizontally = x1_min >= x2_max or x1_max <= x2_min
    separated_vertically = y1_max <= y2_min or y1_min >= y2_max

    return not (separated_horizontally or separated_vertically)
437 | 452 |
|
def find_overlapping_pairs_indexes(bboxes):
    r"""
    Walk over every unordered pair of cells and push overlapping ones apart

    Parameters
    ----------
    bboxes : list of dict
        Cells, each with a "bbox" entry. The list is updated in place: both
        members of an overlapping pair are replaced by the result of
        correct_overlap

    Returns
    -------
    overlapping_indexes : list
        Always empty - nothing is ever recorded in it. It is kept so that the
        returned tuple keeps its shape for existing callers
    bboxes : list of dict
        The same list object that was passed in, after the corrections
    """
    overlapping_indexes = []
    box_count = len(bboxes)

    # j always starts after i, so each pair is visited once and i never equals j
    for i in range(box_count):
        for j in range(i + 1, box_count):
            # Cells that compare equal are left untouched; the equality test runs
            # first so the overlap check is skipped for them
            if bboxes[i] != bboxes[j] and do_boxes_overlap(bboxes[i], bboxes[j]):
                bboxes[i], bboxes[j] = correct_overlap(bboxes[i], bboxes[j])

    return overlapping_indexes, bboxes
447 | 466 |
|
@@ -1144,7 +1163,7 @@ def _clear_pdf_cells(self, pdf_cells): |
1144 | 1163 | new_pdf_cells.append(pdf_cells[i]) |
1145 | 1164 | return new_pdf_cells |
1146 | 1165 |
|
1147 | | - def process(self, matching_details): |
| 1166 | + def process(self, matching_details, correct_overlapping_cells=False): |
1148 | 1167 | r""" |
1149 | 1168 | Do post processing, see details in the comments below |
1150 | 1169 |
|
@@ -1348,9 +1367,10 @@ def process(self, matching_details): |
1348 | 1367 | table_cells_wo = po2 |
1349 | 1368 | max_cell_id = po3 |
1350 | 1369 |
|
1351 | | - # As the last step - correct cell bboxes in a way that they don't overlap: |
1352 | | - if len(table_cells_wo) <= 300: # For performance reasons |
1353 | | - table_cells_wo = self._find_overlapping(table_cells_wo) |
| 1370 | + if correct_overlapping_cells: |
| 1371 | + # As the last step - correct cell bboxes in a way that they don't overlap: |
| 1372 | + if len(table_cells_wo) <= 300: # For performance reasons |
| 1373 | + table_cells_wo = self._find_overlapping(table_cells_wo) |
1354 | 1374 |
|
1355 | 1375 | self._log().debug("*** final_matches_wo") |
1356 | 1376 | self._log().debug(final_matches_wo) |
|
0 commit comments