Skip to content

Commit 53187bc

Browse files
committed
Update table.py
1 parent 8264a4b commit 53187bc

File tree

1 file changed

+39
-9
lines changed

1 file changed

+39
-9
lines changed

src/table.py

Lines changed: 39 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,29 @@
110110
white_spaces = set(string.whitespace) # for checking white space only cells
111111

112112

113+
def rect_in_rect(inner, outer):
    """Check whether rectangle 'inner' is fully inside rectangle 'outer'.

    Both arguments are read by index as (x0, y0, x1, y1), so any
    indexable 4-sequence works. Containment is inclusive: an 'inner'
    that shares an edge with 'outer', or equals it, counts as inside.

    Returns:
        True if no side of 'inner' extends beyond 'outer', else False.
    """
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )
122+
123+
124+
def chars_in_rect(CHARS, rect):
    """Check whether any of the chars in CHARS are inside rectangle 'rect'.

    Args:
        CHARS: iterable of dicts, each providing the character box under
            the keys "x0", "y0", "x1" and "y1".
        rect: rectangle read by index as (x0, y0, x1, y1).

    Returns:
        True if at least one character box lies fully within 'rect'
        (edges inclusive), else False. An empty CHARS yields False.

    Note:
        Only the coordinates are examined, never the character's text,
        so whitespace entries (if CHARS contains any) count as well.
    """
    return any(
        rect[0] <= c["x0"]
        and c["x1"] <= rect[2]
        and rect[1] <= c["y0"]
        and c["y1"] <= rect[3]
        for c in CHARS
    )
134+
135+
113136
def _iou(r1, r2):
114137
"""Compute intersection over union of two rectangles."""
115138
ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
@@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126149
"""Check whether any of the words in bbox are cut through by
127150
horizontal line y.
128151
"""
129-
return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
152+
return any(r.y0 < y < r.y1 for r in word_rects if rect_in_rect(r, bbox))
130153

131154

132155
def get_table_dict_from_rect(textpage, rect):
@@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182205
for i in range(len(nypos) - 1):
183206
row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
184207
# Sub-select words in this row and sort them by left coordinate
185-
row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
208+
row_words = sorted(
209+
[r for r in word_rects if rect_in_rect(r, row_box)], key=lambda r: r.x0
210+
)
186211
# Sub-select x values that do not cut through words
187212
this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
188213
for j in range(len(this_xpos) - 1):
@@ -1721,12 +1746,12 @@ def row_has_bold(bbox):
17211746
17221747
Returns True if any spans are bold else False.
17231748
"""
1724-
blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
1725-
"blocks"
1726-
]
1727-
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
1728-
1729-
return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans)
1749+
global CHARS
1750+
return any(
1751+
c["bold"]
1752+
for c in CHARS
1753+
if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
1754+
)
17301755

17311756
try:
17321757
row = self.rows[0]
@@ -2171,6 +2196,9 @@ def make_chars(page, clip=None):
21712196
for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
21722197
fontname = span["font"]
21732198
fontsize = span["size"]
2199+
span_bold = bool(
2200+
span["flags"] & pymupdf.TEXT_FONT_BOLD or span["char_flags"] & 8
2201+
)
21742202
color = pymupdf.sRGB_to_pdf(span["color"])
21752203
for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
21762204
bbox = pymupdf.Rect(char["bbox"])
@@ -2194,6 +2222,7 @@ def make_chars(page, clip=None):
21942222
"size": fontsize if upright else bbox.y1 - bbox.y0,
21952223
"stroking_color": color,
21962224
"stroking_pattern": None,
2225+
"bold": span_bold,
21972226
"text": text,
21982227
"top": bbox.y0,
21992228
"upright": upright,
@@ -2212,6 +2241,7 @@ def make_chars(page, clip=None):
22122241
# else to lines.
22132242
# ------------------------------------------------------------------------
22142243
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
2244+
global CHARS
22152245
snap_x = tset.snap_x_tolerance
22162246
snap_y = tset.snap_y_tolerance
22172247
min_length = tset.edge_min_length
@@ -2303,7 +2333,7 @@ def clean_graphics(npaths=None):
23032333
repeat = True # keep checking the rest
23042334

23052335
# move rect 0 over to result list if there is some text in it
2306-
if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
2336+
if chars_in_rect(CHARS, prect0):
23072337
# contains text, so accept it as a table bbox candidate
23082338
new_rects.append(prect0)
23092339
del prects[0] # remove from rect list

0 commit comments

Comments
 (0)