Skip to content

Commit 7266db9

Browse files
committed
Adjust test script
Use slimmed-down rectangle containment checks and avoid expensive text extractions. Because those text extractions are now skipped, test_2979() no longer sees the tail of the MuPDF warning messages.
1 parent 8264a4b commit 7266db9

File tree

2 files changed

+41
-19
lines changed

2 files changed

+41
-19
lines changed

src/table.py

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,29 @@
110110
white_spaces = set(string.whitespace) # for checking white space only cells
111111

112112

113+
def rect_in_rect(inner, outer):
    """Check whether rectangle 'inner' is fully inside rectangle 'outer'.

    Both arguments are only accessed by index and are read as
    (x0, y0, x1, y1). All four comparisons are inclusive, so a rectangle
    that touches the border of 'outer' (or equals it) counts as inside.

    Returns True if 'inner' is contained in 'outer', else False.
    """
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )
122+
123+
124+
def chars_in_rect(CHARS, rect):
    """Check whether at least one char in CHARS lies fully inside 'rect'.

    Each item of CHARS is a dict read via its keys "x0", "y0", "x1" and
    "y1"; 'rect' is only accessed by index and read as (x0, y0, x1, y1).
    The comparisons are inclusive, so a char whose box touches the border
    of 'rect' still counts as inside.

    Returns True as soon as one contained char is found, and False if
    there is none (including when CHARS is empty).
    """
    return any(
        rect[0] <= c["x0"]
        and c["x1"] <= rect[2]
        and rect[1] <= c["y0"]
        and rect[3] >= c["y1"]
        for c in CHARS
    )
134+
135+
113136
def _iou(r1, r2):
114137
"""Compute intersection over union of two rectangles."""
115138
ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
@@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126149
"""Check whether any of the words in bbox are cut through by
127150
horizontal line y.
128151
"""
129-
return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
152+
return any(r.y0 < y < r.y1 for r in word_rects if rect_in_rect(r, bbox))
130153

131154

132155
def get_table_dict_from_rect(textpage, rect):
@@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182205
for i in range(len(nypos) - 1):
183206
row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
184207
# Sub-select words in this row and sort them by left coordinate
185-
row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
208+
row_words = sorted(
209+
[r for r in word_rects if rect_in_rect(r, row_box)], key=lambda r: r.x0
210+
)
186211
# Sub-select x values that do not cut through words
187212
this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
188213
for j in range(len(this_xpos) - 1):
@@ -1721,12 +1746,11 @@ def row_has_bold(bbox):
17211746
17221747
Returns True if any spans are bold else False.
17231748
"""
1724-
blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
1725-
"blocks"
1726-
]
1727-
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
1728-
1729-
return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans)
1749+
return any(
1750+
c["bold"]
1751+
for c in CHARS
1752+
if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
1753+
)
17301754

17311755
try:
17321756
row = self.rows[0]
@@ -2152,7 +2176,6 @@ def __getitem__(self, i):
21522176
# -----------------------------------------------------------------------------
21532177
def make_chars(page, clip=None):
21542178
"""Extract text as "rawdict" to fill CHARS."""
2155-
global TEXTPAGE
21562179
page_number = page.number + 1
21572180
page_height = page.rect.height
21582181
ctm = page.transformation_matrix
@@ -2171,6 +2194,9 @@ def make_chars(page, clip=None):
21712194
for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
21722195
fontname = span["font"]
21732196
fontsize = span["size"]
2197+
span_bold = bool(
2198+
span["flags"] & pymupdf.TEXT_FONT_BOLD or span["char_flags"] & 8
2199+
)
21742200
color = pymupdf.sRGB_to_pdf(span["color"])
21752201
for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
21762202
bbox = pymupdf.Rect(char["bbox"])
@@ -2194,6 +2220,7 @@ def make_chars(page, clip=None):
21942220
"size": fontsize if upright else bbox.y1 - bbox.y0,
21952221
"stroking_color": color,
21962222
"stroking_pattern": None,
2223+
"bold": span_bold,
21972224
"text": text,
21982225
"top": bbox.y0,
21992226
"upright": upright,
@@ -2303,7 +2330,7 @@ def clean_graphics(npaths=None):
23032330
repeat = True # keep checking the rest
23042331

23052332
# move rect 0 over to result list if there is some text in it
2306-
if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
2333+
if chars_in_rect(CHARS, prect0):
23072334
# contains text, so accept it as a table bbox candidate
23082335
new_rects.append(prect0)
23092336
del prects[0] # remove from rect list
@@ -2586,9 +2613,9 @@ def find_tables(
25862613
paths=None, # accept vector graphics as parameter
25872614
):
25882615
pymupdf._warn_layout_once()
2589-
global CHARS, EDGES
2590-
CHARS = []
2591-
EDGES = []
2616+
CHARS.clear()
2617+
EDGES.clear()
2618+
TEXTPAGE = None
25922619
old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value
25932620
pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
25942621
if page.rotation != 0:

tests/test_tables.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,7 @@ def test_2979():
184184
), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"
185185

186186
wt = pymupdf.TOOLS.mupdf_warnings()
187-
if pymupdf.mupdf_version_tuple >= (1, 26, 8):
188-
assert (
189-
wt
190-
== "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...\nActualtext with no position. Text may be lost or mispositioned.\n... repeated 96 times..."
191-
)
192-
elif pymupdf.mupdf_version_tuple >= (1, 26, 0):
187+
if pymupdf.mupdf_version_tuple >= (1, 26, 0):
193188
assert (
194189
wt
195190
== "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."

0 commit comments

Comments
 (0)