Skip to content

Commit 52b8e52

Browse files
committed
Use slimmed down rectangle containment checks
1 parent a208fec commit 52b8e52

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

src/table.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,17 @@
110110
white_spaces = set(string.whitespace) # for checking white space only cells
111111

112112

113+
def rect_in_rect(inner, outer) -> bool:
    """Check whether rectangle 'inner' is fully inside rectangle 'outer'.

    Both arguments only need to support integer indexing for items 0..3.
    Items 0 and 1 are treated as the lower bounds and items 2 and 3 as the
    upper bounds of the rectangle (presumably (x0, y0, x1, y1) as used by
    pymupdf.Rect -- confirm against the callers).

    Edges are inclusive: a rectangle is considered to be inside itself, and
    an inner rectangle touching the border of 'outer' still counts as inside.

    No normalization is performed; the comparison is done on the raw
    coordinates exactly as given.
    """
    # Plain short-circuiting comparison chain: stops at the first violated
    # bound and always yields the result of a comparison.
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )
122+
123+
113124
def _iou(r1, r2):
114125
"""Compute intersection over union of two rectangles."""
115126
ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
@@ -126,7 +137,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126137
"""Check whether any of the words in bbox are cut through by
127138
horizontal line y.
128139
"""
129-
return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
140+
return any(r.y0 < y < r.y1 for r in word_rects if rect_in_rect(r, bbox))
130141

131142

132143
def get_table_dict_from_rect(textpage, rect):
@@ -182,7 +193,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182193
for i in range(len(nypos) - 1):
183194
row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
184195
# Sub-select words in this row and sort them by left coordinate
185-
row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
196+
row_words = sorted(
197+
[r for r in word_rects if rect_in_rect(r, row_box)], key=lambda r: r.x0
198+
)
186199
# Sub-select x values that do not cut through words
187200
this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
188201
for j in range(len(this_xpos) - 1):

0 commit comments

Comments
 (0)