Skip to content

Commit bd3a7b3

Browse files
authored
Merge pull request #1757 from myhloli/dev
feat(pre_proc): add block type compatibility check for span allocation
2 parents 9fd10b6 + 1991685 commit bd3a7b3

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

magic_pdf/pre_proc/ocr_dict_merge.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,19 @@ def merge_spans_to_line(spans, threshold=0.6):
6060
return lines
6161

6262

63+
def span_block_type_compatible(span_type, block_type):
    """Tell whether a span of ``span_type`` may be assigned to a block of ``block_type``.

    Compatibility rules:
      * text and inline-equation spans fit text, title, and image/table
        caption or footnote blocks;
      * interline-equation spans fit only interline-equation blocks;
      * image spans fit only image-body blocks;
      * table spans fit only table-body blocks;
      * any other span type fits nothing.

    Args:
        span_type: content type of the span (compared against ``ContentType`` members).
        block_type: type of the candidate block (compared against ``BlockType`` members).

    Returns:
        bool: True when the pairing is allowed, False otherwise.
    """
    # Pick the block types this kind of span is allowed to land in.
    if span_type in (ContentType.Text, ContentType.InlineEquation):
        allowed_block_types = (
            BlockType.Text,
            BlockType.Title,
            BlockType.ImageCaption,
            BlockType.ImageFootnote,
            BlockType.TableCaption,
            BlockType.TableFootnote,
        )
    elif span_type == ContentType.InterlineEquation:
        allowed_block_types = (BlockType.InterlineEquation,)
    elif span_type == ContentType.Image:
        allowed_block_types = (BlockType.ImageBody,)
    elif span_type == ContentType.Table:
        allowed_block_types = (BlockType.TableBody,)
    else:
        # Unrecognised span types are never compatible with any block.
        allowed_block_types = ()

    return block_type in allowed_block_types
74+
75+
6376
def fill_spans_in_blocks(blocks, spans, radio):
6477
"""将allspans中的span按位置关系,放入blocks中."""
6578
block_with_spans = []
@@ -78,8 +91,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
7891
block_spans = []
7992
for span in spans:
8093
span_bbox = span['bbox']
81-
if calculate_overlap_area_in_bbox1_area_ratio(
82-
span_bbox, block_bbox) > radio:
94+
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
8395
block_spans.append(span)
8496

8597
block_dict['spans'] = block_spans

0 commit comments

Comments
 (0)