Skip to content

Commit 5d053f6

Browse files
authored
[DocParser] Add bbox check (#5132)
* check bbox * Update
1 parent fa71eee commit 5d053f6

File tree

1 file changed

+12
-0
lines changed

1 file changed

+12
-0
lines changed

paddlenlp/utils/doc_parser.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,12 @@ def _get_box(box):
101101
]
102102
return box
103103

104+
def _normal_box(box):
105+
# Ensure the height and width of bbox are greater than zero
106+
if box[3] - box[1] < 0 or box[2] - box[0] < 0:
107+
return False
108+
return True
109+
104110
def _is_ch(s):
105111
for ch in s:
106112
if "\u4e00" <= ch <= "\u9fff":
@@ -120,6 +126,8 @@ def _is_ch(s):
120126
for segment in ocr_result:
121127
box = segment[0]
122128
box = _get_box(box)
129+
if not _normal_box(box):
130+
continue
123131
text = segment[1][0]
124132
layout.append((box, text))
125133
else:
@@ -130,6 +138,8 @@ def _is_ch(s):
130138
for segment in ocr_result:
131139
box = segment["text_region"]
132140
box = _get_box(box)
141+
if not _normal_box(box):
142+
continue
133143
text = segment["text"]
134144
layout.append((box, text, region["type"]))
135145
else:
@@ -156,6 +166,8 @@ def _is_ch(s):
156166
bbox[0] + cell_box[2],
157167
bbox[1] + cell_box[3],
158168
]
169+
if not _normal_box(box):
170+
continue
159171
if _is_ch(text):
160172
text = text.replace(" ", "")
161173
layout.append((box, text, region["type"]))

0 commit comments

Comments
 (0)