8989
9090EDGES = [] # vector graphics from PyMuPDF
9191CHARS = [] # text characters from PyMuPDF
92- TEXTPAGE = None
92+ TEXTPAGE = None # textpage for cell text extraction
9393TEXT_BOLD = mupdf .FZ_STEXT_BOLD
9494TEXT_STRIKEOUT = mupdf .FZ_STEXT_STRIKEOUT
9595FLAGS = (
@@ -121,6 +121,18 @@ def rect_in_rect(inner, outer):
121121 )
122122
123123
def chars_in_rect(CHARS, rect):
    """Check whether any of the chars in CHARS lies completely inside 'rect'.

    Args:
        CHARS: iterable of character dicts, each providing the numeric
            keys "x0", "y0", "x1" and "y1" describing the character box.
        rect: rectangle as an indexable sequence (x0, y0, x1, y1).

    Returns:
        True if at least one character box is fully contained in 'rect',
        else False. Boxes touching the rectangle border count as inside.
        An empty CHARS always yields False.
    """
    return any(
        # containment test: left, right, top, bottom (short-circuits per char)
        rect[0] <= c["x0"]
        and c["x1"] <= rect[2]
        and rect[1] <= c["y0"]
        and c["y1"] <= rect[3]
        for c in CHARS
    )
134+
135+
124136def _iou (r1 , r2 ):
125137 """Compute intersection over union of two rectangles."""
126138 ix = max (0 , min (r1 [2 ], r2 [2 ]) - max (r1 [0 ], r2 [0 ]))
@@ -1509,6 +1521,7 @@ def __init__(self, bbox, cells, names, above):
15091521class Table :
15101522 def __init__ (self , page , cells ):
15111523 self .page = page
1524+ self .textpage = None
15121525 self .cells = cells
15131526 self .header = self ._get_header () # PyMuPDF extension
15141527
@@ -1601,7 +1614,7 @@ def to_markdown(self, clean=False, fill_empty=True):
16011614 for j , cell in enumerate (row ):
16021615 if cell is not None :
16031616 cells [i ][j ] = extract_cells (
1604- TEXTPAGE , cell_boxes [i ][j ], markdown = True
1617+ self . textpage , cell_boxes [i ][j ], markdown = True
16051618 )
16061619
16071620 if fill_empty : # fill "None" cells where possible
@@ -1734,12 +1747,11 @@ def row_has_bold(bbox):
17341747
17351748 Returns True if any spans are bold else False.
17361749 """
1737- blocks = page .get_text ("dict" , flags = pymupdf .TEXTFLAGS_TEXT , clip = bbox )[
1738- "blocks"
1739- ]
1740- spans = [s for b in blocks for l in b ["lines" ] for s in l ["spans" ]]
1741-
1742- return any (s ["flags" ] & pymupdf .TEXT_FONT_BOLD for s in spans )
1750+ return any (
1751+ c ["bold" ]
1752+ for c in CHARS
1753+ if rect_in_rect ((c ["x0" ], c ["y0" ], c ["x1" ], c ["y1" ]), bbox )
1754+ )
17431755
17441756 try :
17451757 row = self .rows [0 ]
@@ -2021,6 +2033,7 @@ class TableFinder:
20212033
20222034 def __init__ (self , page , settings = None ):
20232035 self .page = weakref .proxy (page )
2036+ self .textpage = None
20242037 self .settings = TableSettings .resolve (settings )
20252038 self .edges = self .get_edges ()
20262039 self .intersections = edges_to_intersections (
@@ -2165,7 +2178,6 @@ def __getitem__(self, i):
21652178# -----------------------------------------------------------------------------
21662179def make_chars (page , clip = None ):
21672180 """Extract text as "rawdict" to fill CHARS."""
2168- global TEXTPAGE
21692181 page_number = page .number + 1
21702182 page_height = page .rect .height
21712183 ctm = page .transformation_matrix
@@ -2184,6 +2196,9 @@ def make_chars(page, clip=None):
21842196 for span in sorted (line ["spans" ], key = lambda s : s ["bbox" ][0 ]):
21852197 fontname = span ["font" ]
21862198 fontsize = span ["size" ]
2199+ span_bold = bool (
2200+ span ["flags" ] & pymupdf .TEXT_FONT_BOLD or span ["char_flags" ] & 8
2201+ )
21872202 color = pymupdf .sRGB_to_pdf (span ["color" ])
21882203 for char in sorted (span ["chars" ], key = lambda c : c ["bbox" ][0 ]):
21892204 bbox = pymupdf .Rect (char ["bbox" ])
@@ -2207,6 +2222,7 @@ def make_chars(page, clip=None):
22072222 "size" : fontsize if upright else bbox .y1 - bbox .y0 ,
22082223 "stroking_color" : color ,
22092224 "stroking_pattern" : None ,
2225+ "bold" : span_bold ,
22102226 "text" : text ,
22112227 "top" : bbox .y0 ,
22122228 "upright" : upright ,
@@ -2217,6 +2233,7 @@ def make_chars(page, clip=None):
22172233 "y1" : bbox_ctm .y1 ,
22182234 }
22192235 CHARS .append (char_dict )
2236+ return TEXTPAGE
22202237
22212238
22222239# ------------------------------------------------------------------------
@@ -2316,7 +2333,7 @@ def clean_graphics(npaths=None):
23162333 repeat = True # keep checking the rest
23172334
23182335 # move rect 0 over to result list if there is some text in it
2319- if not white_spaces . issuperset ( page . get_textbox ( prect0 , textpage = TEXTPAGE ) ):
2336+ if chars_in_rect ( CHARS , prect0 ):
23202337 # contains text, so accept it as a table bbox candidate
23212338 new_rects .append (prect0 )
23222339 del prects [0 ] # remove from rect list
@@ -2599,9 +2616,9 @@ def find_tables(
25992616 paths = None , # accept vector graphics as parameter
26002617):
26012618 pymupdf ._warn_layout_once ()
2602- global CHARS , EDGES
2603- CHARS = []
2604- EDGES = []
2619+ CHARS . clear ()
2620+ EDGES . clear ()
2621+ TEXTPAGE = None
26052622 old_small = bool (pymupdf .TOOLS .set_small_glyph_heights ()) # save old value
26062623 pymupdf .TOOLS .set_small_glyph_heights (True ) # we need minimum bboxes
26072624 if page .rotation != 0 :
@@ -2669,7 +2686,7 @@ def find_tables(
26692686 tset = TableSettings .resolve (settings = settings )
26702687 page .table_settings = tset
26712688
2672- make_chars (page , clip = clip ) # create character list of page
2689+ TEXTPAGE = make_chars (page , clip = clip ) # create character list of page
26732690 make_edges (
26742691 page ,
26752692 clip = clip ,
@@ -2680,7 +2697,7 @@ def find_tables(
26802697 ) # create lines and curves
26812698
26822699 tbf = TableFinder (page , settings = tset )
2683-
2700+ tbf . textpage = TEXTPAGE # store textpage for later use
26842701 if boxes :
26852702 # only keep Finder tables that match a layout box
26862703 tbf .tables = [
@@ -2706,5 +2723,6 @@ def find_tables(
27062723 if old_xref is not None :
27072724 page = page_rotation_reset (page , old_xref , old_rot , old_mediabox )
27082725 pymupdf .TOOLS .unset_quad_corrections (old_quad_corrections )
2709-
2726+ for table in tbf .tables :
2727+ table .textpage = TEXTPAGE
27102728 return tbf
0 commit comments