110110white_spaces = set (string .whitespace ) # for checking white space only cells
111111
112112
def rect_in_rect(inner, outer):
    """Tell whether rectangle 'inner' lies completely within rectangle 'outer'.

    Both arguments are read by index only: positions 0 and 1 are compared
    as lower bounds, positions 2 and 3 as upper bounds. Borders that
    coincide count as "inside".

    Returns True if all four borders of 'inner' are within 'outer',
    else False.
    """
    # Lower bounds (indices 0 and 1) of inner must not lie before outer's.
    if not (inner[0] >= outer[0] and inner[1] >= outer[1]):
        return False
    # Upper bounds (indices 2 and 3) of inner must not extend beyond outer's.
    return inner[2] <= outer[2] and inner[3] <= outer[3]
122+
123+
def chars_in_rect(CHARS, rect):
    """Tell whether at least one char in CHARS lies fully inside 'rect'.

    Each item of CHARS is accessed via the keys "x0", "y0", "x1" and "y1";
    'rect' is read by index (0..3). A char touching the border of 'rect'
    still counts as inside.

    Returns True as soon as one contained char is found, False if there
    is none (including when CHARS is empty).
    """
    for char in CHARS:
        inside_horizontally = rect[0] <= char["x0"] and char["x1"] <= rect[2]
        if not inside_horizontally:
            continue
        if rect[1] <= char["y0"] and rect[3] >= char["y1"]:
            return True
    return False
134+
135+
113136def _iou (r1 , r2 ):
114137 """Compute intersection over union of two rectangles."""
115138 ix = max (0 , min (r1 [2 ], r2 [2 ]) - max (r1 [0 ], r2 [0 ]))
@@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126149 """Check whether any of the words in bbox are cut through by
127150 horizontal line y.
128151 """
129- return any (r .y0 < y < r .y1 for r in word_rects if r in bbox )
152+ return any (r .y0 < y < r .y1 for r in word_rects if rect_in_rect ( r , bbox ) )
130153
131154
132155def get_table_dict_from_rect (textpage , rect ):
@@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182205 for i in range (len (nypos ) - 1 ):
183206 row_box = pymupdf .Rect (bbox .x0 , nypos [i ], bbox .x1 , nypos [i + 1 ])
184207 # Sub-select words in this row and sort them by left coordinate
185- row_words = sorted ([r for r in word_rects if r in row_box ], key = lambda r : r .x0 )
208+ row_words = sorted (
209+ [r for r in word_rects if rect_in_rect (r , row_box )], key = lambda r : r .x0
210+ )
186211 # Sub-select x values that do not cut through words
187212 this_xpos = [x for x in nxpos if not any (r .x0 < x < r .x1 for r in row_words )]
188213 for j in range (len (this_xpos ) - 1 ):
@@ -1721,6 +1746,12 @@ def row_has_bold(bbox):
17211746
17221747 Returns True if any spans are bold else False.
17231748 """
1749+ #global CHARS
1750+ #return any(
1751+ # c["bold"]
1752+ # for c in CHARS
1753+ # if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
1754+ #)
17241755 blocks = page .get_text ("dict" , flags = pymupdf .TEXTFLAGS_TEXT , clip = bbox )[
17251756 "blocks"
17261757 ]
@@ -2171,6 +2202,9 @@ def make_chars(page, clip=None):
21712202 for span in sorted (line ["spans" ], key = lambda s : s ["bbox" ][0 ]):
21722203 fontname = span ["font" ]
21732204 fontsize = span ["size" ]
2205+ span_bold = bool (
2206+ span ["flags" ] & pymupdf .TEXT_FONT_BOLD or span ["char_flags" ] & 8
2207+ )
21742208 color = pymupdf .sRGB_to_pdf (span ["color" ])
21752209 for char in sorted (span ["chars" ], key = lambda c : c ["bbox" ][0 ]):
21762210 bbox = pymupdf .Rect (char ["bbox" ])
@@ -2194,6 +2228,7 @@ def make_chars(page, clip=None):
21942228 "size" : fontsize if upright else bbox .y1 - bbox .y0 ,
21952229 "stroking_color" : color ,
21962230 "stroking_pattern" : None ,
2231+ "bold" : span_bold ,
21972232 "text" : text ,
21982233 "top" : bbox .y0 ,
21992234 "upright" : upright ,
@@ -2212,6 +2247,7 @@ def make_chars(page, clip=None):
22122247# else to lines.
22132248# ------------------------------------------------------------------------
22142249def make_edges (page , clip = None , tset = None , paths = None , add_lines = None , add_boxes = None ):
2250+ global CHARS
22152251 snap_x = tset .snap_x_tolerance
22162252 snap_y = tset .snap_y_tolerance
22172253 min_length = tset .edge_min_length
@@ -2303,6 +2339,7 @@ def clean_graphics(npaths=None):
23032339 repeat = True # keep checking the rest
23042340
23052341 # move rect 0 over to result list if there is some text in it
2342+ #if chars_in_rect(CHARS, prect0):
23062343 if not white_spaces .issuperset (page .get_textbox (prect0 , textpage = TEXTPAGE )):
23072344 # contains text, so accept it as a table bbox candidate
23082345 new_rects .append (prect0 )
0 commit comments