110110white_spaces = set (string .whitespace ) # for checking white space only cells
111111
112112
def rect_in_rect(inner, outer) -> bool:
    """Check whether rectangle 'inner' is fully inside rectangle 'outer'.

    Both rectangles are read by index only: items 0 and 1 are compared as
    lower bounds, items 2 and 3 as upper bounds (presumably the
    (x0, y0, x1, y1) layout used for rectangles elsewhere in this module).
    All comparisons are inclusive, so a rectangle that touches an edge of
    'outer', or is identical to it, counts as being inside.

    Args:
        inner: Rectangle-like object supporting indexing with 0..3.
        outer: Rectangle-like object supporting indexing with 0..3.

    Returns:
        True if no side of 'inner' extends beyond the matching side of
        'outer', otherwise False.
    """
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )
122+
123+
113124def _iou (r1 , r2 ):
114125 """Compute intersection over union of two rectangles."""
115126 ix = max (0 , min (r1 [2 ], r2 [2 ]) - max (r1 [0 ], r2 [0 ]))
@@ -126,7 +137,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126137 """Check whether any of the words in bbox are cut through by
127138 horizontal line y.
128139 """
129- return any (r .y0 < y < r .y1 for r in word_rects if r in bbox )
140+ return any (r .y0 < y < r .y1 for r in word_rects if rect_in_rect ( r , bbox ) )
130141
131142
132143def get_table_dict_from_rect (textpage , rect ):
@@ -182,7 +193,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182193 for i in range (len (nypos ) - 1 ):
183194 row_box = pymupdf .Rect (bbox .x0 , nypos [i ], bbox .x1 , nypos [i + 1 ])
184195 # Sub-select words in this row and sort them by left coordinate
185- row_words = sorted ([r for r in word_rects if r in row_box ], key = lambda r : r .x0 )
196+ row_words = sorted (
197+ [r for r in word_rects if rect_in_rect (r , row_box )], key = lambda r : r .x0
198+ )
186199 # Sub-select x values that do not cut through words
187200 this_xpos = [x for x in nxpos if not any (r .x0 < x < r .x1 for r in row_words )]
188201 for j in range (len (this_xpos ) - 1 ):
0 commit comments