Skip to content

Commit af475a7

Browse files
committed
Avoid expensive text extractions
Adjust test script: test_2979() no longer expects the trailing part of the MuPDF warning messages, because the text extractions that produced those warnings are now avoided.
1 parent 52b8e52 commit af475a7

File tree

2 files changed

+35
-22
lines changed

2 files changed

+35
-22
lines changed

src/table.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989

9090
EDGES = [] # vector graphics from PyMuPDF
9191
CHARS = [] # text characters from PyMuPDF
92-
TEXTPAGE = None
92+
TEXTPAGE = None # textpage for cell text extraction
9393
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
9494
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
9595
FLAGS = (
@@ -121,6 +121,18 @@ def rect_in_rect(inner, outer):
121121
)
122122

123123

124+
def chars_in_rect(CHARS, rect):
125+
"""Check whether any of the chars in CHARS lie entirely inside rectangle 'rect'."""
126+
return any(
127+
1
128+
and rect[0] <= c["x0"]
129+
and c["x1"] <= rect[2]
130+
and rect[1] <= c["y0"]
131+
and rect[3] >= c["y1"]
132+
for c in CHARS
133+
)
134+
135+
124136
def _iou(r1, r2):
125137
"""Compute intersection over union of two rectangles."""
126138
ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
@@ -1509,6 +1521,7 @@ def __init__(self, bbox, cells, names, above):
15091521
class Table:
15101522
def __init__(self, page, cells):
15111523
self.page = page
1524+
self.textpage = None
15121525
self.cells = cells
15131526
self.header = self._get_header() # PyMuPDF extension
15141527

@@ -1601,7 +1614,7 @@ def to_markdown(self, clean=False, fill_empty=True):
16011614
for j, cell in enumerate(row):
16021615
if cell is not None:
16031616
cells[i][j] = extract_cells(
1604-
TEXTPAGE, cell_boxes[i][j], markdown=True
1617+
self.textpage, cell_boxes[i][j], markdown=True
16051618
)
16061619

16071620
if fill_empty: # fill "None" cells where possible
@@ -1734,12 +1747,11 @@ def row_has_bold(bbox):
17341747
17351748
Returns True if any spans are bold else False.
17361749
"""
1737-
blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
1738-
"blocks"
1739-
]
1740-
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
1741-
1742-
return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans)
1750+
return any(
1751+
c["bold"]
1752+
for c in CHARS
1753+
if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
1754+
)
17431755

17441756
try:
17451757
row = self.rows[0]
@@ -2021,6 +2033,7 @@ class TableFinder:
20212033

20222034
def __init__(self, page, settings=None):
20232035
self.page = weakref.proxy(page)
2036+
self.textpage = None
20242037
self.settings = TableSettings.resolve(settings)
20252038
self.edges = self.get_edges()
20262039
self.intersections = edges_to_intersections(
@@ -2165,7 +2178,6 @@ def __getitem__(self, i):
21652178
# -----------------------------------------------------------------------------
21662179
def make_chars(page, clip=None):
21672180
"""Extract text as "rawdict" to fill CHARS."""
2168-
global TEXTPAGE
21692181
page_number = page.number + 1
21702182
page_height = page.rect.height
21712183
ctm = page.transformation_matrix
@@ -2184,6 +2196,9 @@ def make_chars(page, clip=None):
21842196
for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
21852197
fontname = span["font"]
21862198
fontsize = span["size"]
2199+
span_bold = bool(
2200+
span["flags"] & pymupdf.TEXT_FONT_BOLD or span["char_flags"] & 8
2201+
)
21872202
color = pymupdf.sRGB_to_pdf(span["color"])
21882203
for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
21892204
bbox = pymupdf.Rect(char["bbox"])
@@ -2207,6 +2222,7 @@ def make_chars(page, clip=None):
22072222
"size": fontsize if upright else bbox.y1 - bbox.y0,
22082223
"stroking_color": color,
22092224
"stroking_pattern": None,
2225+
"bold": span_bold,
22102226
"text": text,
22112227
"top": bbox.y0,
22122228
"upright": upright,
@@ -2217,6 +2233,7 @@ def make_chars(page, clip=None):
22172233
"y1": bbox_ctm.y1,
22182234
}
22192235
CHARS.append(char_dict)
2236+
return TEXTPAGE
22202237

22212238

22222239
# ------------------------------------------------------------------------
@@ -2316,7 +2333,7 @@ def clean_graphics(npaths=None):
23162333
repeat = True # keep checking the rest
23172334

23182335
# move rect 0 over to result list if there is some text in it
2319-
if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
2336+
if chars_in_rect(CHARS, prect0):
23202337
# contains text, so accept it as a table bbox candidate
23212338
new_rects.append(prect0)
23222339
del prects[0] # remove from rect list
@@ -2599,9 +2616,9 @@ def find_tables(
25992616
paths=None, # accept vector graphics as parameter
26002617
):
26012618
pymupdf._warn_layout_once()
2602-
global CHARS, EDGES
2603-
CHARS = []
2604-
EDGES = []
2619+
CHARS.clear()
2620+
EDGES.clear()
2621+
TEXTPAGE = None
26052622
old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value
26062623
pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
26072624
if page.rotation != 0:
@@ -2669,7 +2686,7 @@ def find_tables(
26692686
tset = TableSettings.resolve(settings=settings)
26702687
page.table_settings = tset
26712688

2672-
make_chars(page, clip=clip) # create character list of page
2689+
TEXTPAGE = make_chars(page, clip=clip) # create character list of page
26732690
make_edges(
26742691
page,
26752692
clip=clip,
@@ -2680,7 +2697,7 @@ def find_tables(
26802697
) # create lines and curves
26812698

26822699
tbf = TableFinder(page, settings=tset)
2683-
2700+
tbf.textpage = TEXTPAGE # store textpage for later use
26842701
if boxes:
26852702
# only keep Finder tables that match a layout box
26862703
tbf.tables = [
@@ -2706,5 +2723,6 @@ def find_tables(
27062723
if old_xref is not None:
27072724
page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
27082725
pymupdf.TOOLS.unset_quad_corrections(old_quad_corrections)
2709-
2726+
for table in tbf.tables:
2727+
table.textpage = TEXTPAGE
27102728
return tbf

tests/test_tables.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,7 @@ def test_2979():
184184
), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"
185185

186186
wt = pymupdf.TOOLS.mupdf_warnings()
187-
if pymupdf.mupdf_version_tuple >= (1, 26, 8):
188-
assert (
189-
wt
190-
== "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...\nActualtext with no position. Text may be lost or mispositioned.\n... repeated 96 times..."
191-
)
192-
elif pymupdf.mupdf_version_tuple >= (1, 26, 0):
187+
if pymupdf.mupdf_version_tuple >= (1, 26, 0):
193188
assert (
194189
wt
195190
== "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."

0 commit comments

Comments
 (0)