110110white_spaces = set (string .whitespace ) # for checking white space only cells
111111
112112
def rect_in_rect(inner, outer):
    """Tell whether rectangle 'inner' lies completely within rectangle 'outer'.

    Both arguments are accessed by index as (x0, y0, x1, y1). All
    comparisons are inclusive, so coinciding borders count as "inside".
    """
    # the x0 / y0 borders of 'inner' must not lie before those of 'outer'
    if not inner[0] >= outer[0]:
        return False
    if not inner[1] >= outer[1]:
        return False
    # the x1 / y1 borders of 'inner' must not lie beyond those of 'outer'
    if not inner[2] <= outer[2]:
        return False
    return inner[3] <= outer[3]
122+
123+
def chars_in_rect(CHARS, rect):
    """Tell whether at least one character in CHARS lies inside rectangle 'rect'.

    Every item of CHARS is read as a dict with the keys "x0", "y0", "x1"
    and "y1"; 'rect' is accessed by index as (x0, y0, x1, y1). Comparisons
    are inclusive. Returns False when CHARS is empty.
    """
    for char in CHARS:
        if (
            rect[0] <= char["x0"]
            and char["x1"] <= rect[2]
            and rect[1] <= char["y0"]
            and rect[3] >= char["y1"]
        ):
            # one fully contained character is sufficient
            return True
    return False
134+
135+
113136def _iou (r1 , r2 ):
114137 """Compute intersection over union of two rectangles."""
115138 ix = max (0 , min (r1 [2 ], r2 [2 ]) - max (r1 [0 ], r2 [0 ]))
@@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126149 """Check whether any of the words in bbox are cut through by
127150 horizontal line y.
128151 """
129- return any (r .y0 < y < r .y1 for r in word_rects if r in bbox )
152+ return any (r .y0 < y < r .y1 for r in word_rects if rect_in_rect ( r , bbox ) )
130153
131154
132155def get_table_dict_from_rect (textpage , rect ):
@@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182205 for i in range (len (nypos ) - 1 ):
183206 row_box = pymupdf .Rect (bbox .x0 , nypos [i ], bbox .x1 , nypos [i + 1 ])
184207 # Sub-select words in this row and sort them by left coordinate
185- row_words = sorted ([r for r in word_rects if r in row_box ], key = lambda r : r .x0 )
208+ row_words = sorted (
209+ [r for r in word_rects if rect_in_rect (r , row_box )], key = lambda r : r .x0
210+ )
186211 # Sub-select x values that do not cut through words
187212 this_xpos = [x for x in nxpos if not any (r .x0 < x < r .x1 for r in row_words )]
188213 for j in range (len (this_xpos ) - 1 ):
@@ -1721,12 +1746,12 @@ def row_has_bold(bbox):
17211746
17221747 Returns True if any spans are bold else False.
17231748 """
1724- blocks = page . get_text ( "dict" , flags = pymupdf . TEXTFLAGS_TEXT , clip = bbox )[
1725- "blocks"
1726- ]
1727- spans = [ s for b in blocks for l in b [ "lines" ] for s in l [ "spans" ]]
1728-
1729- return any ( s [ "flags" ] & pymupdf . TEXT_FONT_BOLD for s in spans )
1749+ global CHARS
1750+ return any (
1751+ c [ "bold" ]
1752+ for c in CHARS
1753+ if rect_in_rect (( c [ "x0" ], c [ "y0" ], c [ "x1" ], c [ "y1" ]), bbox )
1754+ )
17301755
17311756 try :
17321757 row = self .rows [0 ]
@@ -2171,6 +2196,9 @@ def make_chars(page, clip=None):
21712196 for span in sorted (line ["spans" ], key = lambda s : s ["bbox" ][0 ]):
21722197 fontname = span ["font" ]
21732198 fontsize = span ["size" ]
2199+ span_bold = bool (
2200+ span ["flags" ] & pymupdf .TEXT_FONT_BOLD or span ["char_flags" ] & 8
2201+ )
21742202 color = pymupdf .sRGB_to_pdf (span ["color" ])
21752203 for char in sorted (span ["chars" ], key = lambda c : c ["bbox" ][0 ]):
21762204 bbox = pymupdf .Rect (char ["bbox" ])
@@ -2194,6 +2222,7 @@ def make_chars(page, clip=None):
21942222 "size" : fontsize if upright else bbox .y1 - bbox .y0 ,
21952223 "stroking_color" : color ,
21962224 "stroking_pattern" : None ,
2225+ "bold" : span_bold ,
21972226 "text" : text ,
21982227 "top" : bbox .y0 ,
21992228 "upright" : upright ,
@@ -2212,6 +2241,7 @@ def make_chars(page, clip=None):
22122241# else to lines.
22132242# ------------------------------------------------------------------------
22142243def make_edges (page , clip = None , tset = None , paths = None , add_lines = None , add_boxes = None ):
2244+ global CHARS
22152245 snap_x = tset .snap_x_tolerance
22162246 snap_y = tset .snap_y_tolerance
22172247 min_length = tset .edge_min_length
@@ -2303,7 +2333,7 @@ def clean_graphics(npaths=None):
23032333 repeat = True # keep checking the rest
23042334
23052335 # move rect 0 over to result list if there is some text in it
2306- if not white_spaces . issuperset ( page . get_textbox ( prect0 , textpage = TEXTPAGE ) ):
2336+ if chars_in_rect ( CHARS , prect0 ):
23072337 # contains text, so accept it as a table bbox candidate
23082338 new_rects .append (prect0 )
23092339 del prects [0 ] # remove from rect list
0 commit comments