Skip to content

Commit 493ef07

Browse files
committed
Update table.py
Update table.py DEBUG: disable using "chars_in_rect()". Update table.py DEBUG 2
1 parent 8264a4b commit 493ef07

File tree

1 file changed

+39
-2
lines changed

1 file changed

+39
-2
lines changed

src/table.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,29 @@
110110
white_spaces = set(string.whitespace) # for checking white space only cells
111111

112112

113+
def rect_in_rect(inner, outer):
    """Check whether rectangle 'inner' is fully inside rectangle 'outer'.

    Both arguments are read by index as (x0, y0, x1, y1), so any indexable
    rectangle-like value works. Containment is inclusive: an edge of 'inner'
    may coincide with the corresponding edge of 'outer'.

    Returns True if 'inner' lies within 'outer', else False.
    """
    # Short-circuits on the first violated edge.
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )
122+
123+
124+
def chars_in_rect(CHARS, rect):
    """Check whether any of the chars in CHARS are inside rectangle 'rect'.

    CHARS is an iterable of character dicts providing the keys "x0", "y0",
    "x1" and "y1"; 'rect' is read by index as (x0, y0, x1, y1). A char counts
    as inside when its box lies within 'rect', edges included.

    Returns True if at least one char is inside, else False (this includes
    the case of an empty CHARS).
    """
    # any() stops at the first char found inside the rectangle.
    return any(
        rect[0] <= c["x0"]
        and c["x1"] <= rect[2]
        and rect[1] <= c["y0"]
        and rect[3] >= c["y1"]
        for c in CHARS
    )
134+
135+
113136
def _iou(r1, r2):
114137
"""Compute intersection over union of two rectangles."""
115138
ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
@@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
126149
"""Check whether any of the words in bbox are cut through by
127150
horizontal line y.
128151
"""
129-
return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
152+
return any(r.y0 < y < r.y1 for r in word_rects if rect_in_rect(r, bbox))
130153

131154

132155
def get_table_dict_from_rect(textpage, rect):
@@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
182205
for i in range(len(nypos) - 1):
183206
row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
184207
# Sub-select words in this row and sort them by left coordinate
185-
row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
208+
row_words = sorted(
209+
[r for r in word_rects if rect_in_rect(r, row_box)], key=lambda r: r.x0
210+
)
186211
# Sub-select x values that do not cut through words
187212
this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
188213
for j in range(len(this_xpos) - 1):
@@ -1721,6 +1746,12 @@ def row_has_bold(bbox):
17211746
17221747
Returns True if any spans are bold else False.
17231748
"""
1749+
#global CHARS
1750+
#return any(
1751+
# c["bold"]
1752+
# for c in CHARS
1753+
# if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
1754+
#)
17241755
blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
17251756
"blocks"
17261757
]
@@ -2171,6 +2202,9 @@ def make_chars(page, clip=None):
21712202
for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
21722203
fontname = span["font"]
21732204
fontsize = span["size"]
2205+
span_bold = bool(
2206+
span["flags"] & pymupdf.TEXT_FONT_BOLD or span["char_flags"] & 8
2207+
)
21742208
color = pymupdf.sRGB_to_pdf(span["color"])
21752209
for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
21762210
bbox = pymupdf.Rect(char["bbox"])
@@ -2194,6 +2228,7 @@ def make_chars(page, clip=None):
21942228
"size": fontsize if upright else bbox.y1 - bbox.y0,
21952229
"stroking_color": color,
21962230
"stroking_pattern": None,
2231+
"bold": span_bold,
21972232
"text": text,
21982233
"top": bbox.y0,
21992234
"upright": upright,
@@ -2212,6 +2247,7 @@ def make_chars(page, clip=None):
22122247
# else to lines.
22132248
# ------------------------------------------------------------------------
22142249
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
2250+
global CHARS
22152251
snap_x = tset.snap_x_tolerance
22162252
snap_y = tset.snap_y_tolerance
22172253
min_length = tset.edge_min_length
@@ -2303,6 +2339,7 @@ def clean_graphics(npaths=None):
23032339
repeat = True # keep checking the rest
23042340

23052341
# move rect 0 over to result list if there is some text in it
2342+
#if chars_in_rect(CHARS, prect0):
23062343
if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
23072344
# contains text, so accept it as a table bbox candidate
23082345
new_rects.append(prect0)

0 commit comments

Comments
 (0)