Skip to content

Commit ecb0835

Browse files
committed
Table Detection Improvements
* Support new detection parameter "add_boxes" which allows specifying "virtual" rectangles to help detection. * Support new parameter "paths" to allow specifying previously extracted vector graphics. * Several minor improvements; in particular, line breaks inside table cells are now exported as HTML "<br>" tags instead of replacing "\n" by spaces.
1 parent 71715cf commit ecb0835

File tree

2 files changed

+241
-89
lines changed

2 files changed

+241
-89
lines changed

src/table.py

Lines changed: 144 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
from collections.abc import Sequence
8080
from dataclasses import dataclass
8181
from operator import itemgetter
82+
import weakref
8283

8384
# -------------------------------------------------------------------
8485
# Start of PyMuPDF interface code
@@ -87,6 +88,8 @@
8788
Rect,
8889
Matrix,
8990
TEXTFLAGS_TEXT,
91+
TEXT_FONT_BOLD,
92+
TEXT_FONT_SUPERSCRIPT,
9093
TOOLS,
9194
EMPTY_RECT,
9295
sRGB_to_pdf,
@@ -1061,7 +1064,7 @@ def get_center(word):
10611064
if not overlap:
10621065
condensed_bboxes.append(bbox)
10631066

1064-
if len(condensed_bboxes) == 0:
1067+
if not condensed_bboxes:
10651068
return []
10661069

10671070
condensed_rects = map(bbox_to_rect, condensed_bboxes)
@@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool:
13671370

13681371
return table_arr
13691372

1370-
def to_markdown(self, clean=True):
1373+
def to_markdown(self, clean=False, fill_empty=True):
13711374
"""Output table content as a string in Github-markdown format.
13721375
1373-
If clean is true, markdown syntax is removed from cell content."""
1376+
If "clean" then markdown syntax is removed from cell content.
1377+
If "fill_empty" then cell content None is replaced by the values
1378+
above (columns) or left (rows) in an effort to approximate row and
1379+
column spans.
1380+
1381+
"""
13741382
output = "|"
1383+
rows = self.row_count
1384+
cols = self.col_count
1385+
cells = self.extract()[:] # make local copy of table text content
1386+
1387+
if fill_empty: # fill "None" cells where possible
1388+
1389+
# for rows, copy content from left to right
1390+
for j in range(rows):
1391+
for i in range(cols - 1):
1392+
if cells[j][i + 1] is None:
1393+
cells[j][i + 1] = cells[j][i]
13751394

1376-
# generate header string and MD underline
1395+
# for columns, copy top to bottom
1396+
for i in range(cols):
1397+
for j in range(rows - 1):
1398+
if cells[j + 1][i] is None:
1399+
cells[j + 1][i] = cells[j][i]
1400+
1401+
# generate header string and MD separator
13771402
for i, name in enumerate(self.header.names):
1378-
if name is None or name == "": # generate a name if empty
1403+
if not name: # generate a name if empty
13791404
name = f"Col{i+1}"
1380-
name = name.replace("\n", " ") # remove any line breaks
1405+
name = name.replace("\n", "<br>") # use HTML line breaks
13811406
if clean: # remove sensitive syntax
13821407
name = html.escape(name.replace("-", "&#45;"))
13831408
output += name + "|"
13841409

13851410
output += "\n"
1411+
# insert GitHub header line separator
13861412
output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
13871413

13881414
# skip first row in details if header is part of the table
13891415
j = 0 if self.header.external else 1
13901416

13911417
# iterate over detail rows
1392-
for row in self.extract()[j:]:
1418+
for row in cells[j:]:
13931419
line = "|"
13941420
for i, cell in enumerate(row):
1395-
# output None cells with empty string
1396-
cell = "" if cell is None else cell.replace("\n", " ")
1421+
# replace None cells with empty string
1422+
# use HTML line break tag
1423+
cell = "" if not cell else cell.replace("\n", "<br>")
13971424
if clean: # remove sensitive syntax
13981425
cell = html.escape(cell.replace("-", "&#45;"))
13991426
line += cell + "|"
@@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3):
14621489
page = self.page
14631490
y_delta = y_tolerance
14641491

1465-
def top_row_is_bold(bbox):
1466-
"""Check if row 0 has bold text anywhere.
1492+
def top_row_bg_color(self):
1493+
"""
1494+
Compare top row background color with color of same-sized bbox
1495+
above. If different, return True indicating that the original
1496+
table top row is already the header.
1497+
"""
1498+
bbox0 = Rect(self.rows[0].bbox)
1499+
bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above
1500+
top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
1501+
top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
1502+
if top_color0 != top_colort:
1503+
return True # top row is header
1504+
return False
14671505

1468-
If this is true, then any non-bold text in lines above disqualify
1469-
these lines as header.
1506+
def row_has_bold(bbox):
1507+
"""Check if a row contains some bold text.
14701508
1471-
bbox is the (potentially repaired) row 0 bbox.
1509+
If e.g. true for the top row, then it will be used as (internal)
1510+
column header row if any of the following is true:
1511+
* the previous (above) text line has no bold span
1512+
* the second table row text has no bold span
14721513
1473-
Returns True or False
1514+
Returns True if any spans are bold else False.
14741515
"""
1475-
for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
1476-
for l in b["lines"]:
1477-
for s in l["spans"]:
1478-
if s["flags"] & 16:
1479-
return True
1480-
return False
1516+
blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
1517+
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
1518+
1519+
return any(s["flags"] & TEXT_FONT_BOLD for s in spans)
14811520

14821521
try:
14831522
row = self.rows[0]
@@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox):
14891528
# return this if we determine that the top row is the header
14901529
header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
14911530

1492-
# one-line tables have no extra header
1531+
# 1-line tables have no extra header
14931532
if len(self.rows) < 2:
14941533
return header_top_row
14951534

1496-
# x-ccordinates of columns between x0 and x1 of the table
1535+
# 1-column tables have no extra header
14971536
if len(cells) < 2:
14981537
return header_top_row
14991538

1500-
col_x = [
1501-
c[2] if c is not None else None for c in cells[:-1]
1502-
] # column (x) coordinates
1539+
# assume top row is the header if second row is empty
1540+
row2 = self.rows[1] # second row
1541+
if all(c is None for c in row2.cells): # no valid cell bboxes in row2
1542+
return header_top_row
15031543

15041544
# Special check: is top row bold?
1505-
# If first line above table is not bold, but top-left table cell is bold,
1506-
# we take first table row as header
1507-
top_row_bold = top_row_is_bold(bbox)
1545+
top_row_bold = row_has_bold(bbox)
1546+
1547+
# assume top row is header if it is bold and the 2nd row
1548+
# contains no bold text
1549+
if top_row_bold and not row_has_bold(row2.bbox):
1550+
return header_top_row
1551+
1552+
if top_row_bg_color(self):
1553+
# if area above top row has a different background color,
1554+
# then top row is already the header
1555+
return header_top_row
15081556

1509-
# clip = area above table
1557+
# column coordinates (x1 values) in top row
1558+
col_x = [c[2] if c is not None else None for c in cells[:-1]]
1559+
1560+
# clip = page area above the table
15101561
# We will inspect this area for text qualifying as column header.
15111562
clip = +bbox # take row 0 bbox
15121563
clip.y0 = 0 # start at top of page
15131564
clip.y1 = bbox.y0 # end at top of table
15141565

1515-
spans = [] # the text spans inside clip
1516-
for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
1517-
for l in b["lines"]:
1518-
for s in l["spans"]:
1519-
if (
1520-
not s["flags"] & 1 and s["text"].strip()
1521-
): # ignore superscripts and empty text
1522-
spans.append(s)
1566+
blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
1567+
# non-empty, non-superscript spans above table, sorted descending by y1
1568+
spans = sorted(
1569+
[
1570+
s
1571+
for b in blocks
1572+
for l in b["lines"]
1573+
for s in l["spans"]
1574+
if not (
1575+
white_spaces.issuperset(s["text"])
1576+
or s["flags"] & TEXT_FONT_SUPERSCRIPT
1577+
)
1578+
],
1579+
key=lambda s: s["bbox"][3],
1580+
reverse=True,
1581+
)
15231582

15241583
select = [] # y1 coordinates above, sorted descending
15251584
line_heights = [] # line heights above, sorted descending
15261585
line_bolds = [] # bold indicator per line above, same sorting
15271586

1528-
# spans sorted descending
1529-
spans.sort(key=lambda s: s["bbox"][3], reverse=True)
15301587
# walk through the spans and fill above 3 lists
15311588
for i in range(len(spans)):
15321589
s = spans[i]
15331590
y1 = s["bbox"][3] # span bottom
15341591
h = y1 - s["bbox"][1] # span bbox height
1535-
bold = s["flags"] & 16
1592+
bold = s["flags"] & TEXT_FONT_BOLD
15361593

15371594
# use first item to start the lists
15381595
if i == 0:
@@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox):
15411598
line_bolds.append(bold)
15421599
continue
15431600

1544-
# get last items from the 3 lists
1601+
# get previous items from the 3 lists
15451602
y0 = select[-1]
15461603
h0 = line_heights[-1]
15471604
bold0 = line_bolds[-1]
@@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox):
15651622
if select == []: # nothing above the table?
15661623
return header_top_row
15671624

1568-
select = select[:5] # only accept up to 5 lines in any header
1625+
select = select[:5] # accept up to 5 lines for an external header
15691626

1570-
# take top row as header if text above table is too far apart
1627+
# assume top row as header if text above is too far away
15711628
if bbox.y0 - select[0] >= line_heights[0]:
15721629
return header_top_row
15731630

1574-
# if top table row is bold, but line above is not:
1631+
# accept top row as header if bold, but line above is not
15751632
if top_row_bold and not line_bolds[0]:
15761633
return header_top_row
15771634

@@ -1738,7 +1795,7 @@ class TableFinder:
17381795
"""
17391796

17401797
def __init__(self, page, settings=None):
1741-
self.page = page
1798+
self.page = weakref.proxy(page)
17421799
self.settings = TableSettings.resolve(settings)
17431800
self.edges = self.get_edges()
17441801
self.intersections = edges_to_intersections(
@@ -1942,7 +1999,7 @@ def make_chars(page, clip=None):
19421999
# We are ignoring Bézier curves completely and are converting everything
19432000
# else to lines.
19442001
# ------------------------------------------------------------------------
1945-
def make_edges(page, clip=None, tset=None, add_lines=None):
2002+
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
19462003
snap_x = tset.snap_x_tolerance
19472004
snap_y = tset.snap_y_tolerance
19482005
min_length = tset.edge_min_length
@@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2):
19942051
return True
19952052
return False
19962053

1997-
def clean_graphics():
2054+
def clean_graphics(npaths=None):
19982055
"""Detect and join rectangles of "connected" vector graphics."""
1999-
2000-
paths = [] # paths relevant for table detection
2001-
for p in page.get_drawings():
2002-
# ignore fill-only graphics if they do not simulate lines,
2003-
# which means one of width or height are small.
2056+
if npaths is None:
2057+
allpaths = page.get_drawings()
2058+
else: # accept passed-in vector graphics
2059+
allpaths = npaths[:] # paths relevant for table detection
2060+
paths = []
2061+
for p in allpaths:
2062+
# If only looking at lines, we ignore fill-only paths,
2063+
# except simulated lines (i.e. small width or height).
20042064
if (
2005-
p["type"] == "f"
2006-
and lines_strict
2065+
lines_strict
2066+
and p["type"] == "f"
20072067
and p["rect"].width > snap_x
20082068
and p["rect"].height > snap_y
20092069
):
@@ -2038,7 +2098,7 @@ def clean_graphics():
20382098

20392099
return new_rects, paths
20402100

2041-
bboxes, paths = clean_graphics()
2101+
bboxes, paths = clean_graphics(npaths=paths)
20422102

20432103
def is_parallel(p1, p2):
20442104
"""Check if line is roughly axis-parallel."""
@@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip):
22092269
if line_dict:
22102270
EDGES.append(line_to_edge(line_dict))
22112271

2272+
if add_boxes is not None: # add user-specified rectangles
2273+
assert isinstance(add_boxes, (tuple, list))
2274+
else:
2275+
add_boxes = []
2276+
for box in add_boxes:
2277+
r = Rect(box)
2278+
line_dict = make_line(path, r.tl, r.bl, clip)
2279+
if line_dict:
2280+
EDGES.append(line_to_edge(line_dict))
2281+
line_dict = make_line(path, r.bl, r.br, clip)
2282+
if line_dict:
2283+
EDGES.append(line_to_edge(line_dict))
2284+
line_dict = make_line(path, r.br, r.tr, clip)
2285+
if line_dict:
2286+
EDGES.append(line_to_edge(line_dict))
2287+
line_dict = make_line(path, r.tr, r.tl, clip)
2288+
if line_dict:
2289+
EDGES.append(line_to_edge(line_dict))
2290+
22122291

22132292
def page_rotation_set0(page):
22142293
"""Nullify page rotation.
@@ -2290,7 +2369,9 @@ def find_tables(
22902369
text_x_tolerance=3,
22912370
text_y_tolerance=3,
22922371
strategy=None, # offer abbreviation
2293-
add_lines=None, # optional user-specified lines
2372+
add_lines=None, # user-specified lines
2373+
add_boxes=None, # user-specified rectangles
2374+
paths=None, # accept vector graphics as parameter
22942375
):
22952376
global CHARS, EDGES
22962377
CHARS = []
@@ -2344,7 +2425,12 @@ def find_tables(
23442425

23452426
make_chars(page, clip=clip) # create character list of page
23462427
make_edges(
2347-
page, clip=clip, tset=tset, add_lines=add_lines
2428+
page,
2429+
clip=clip,
2430+
tset=tset,
2431+
paths=paths,
2432+
add_lines=add_lines,
2433+
add_boxes=add_boxes,
23482434
) # create lines and curves
23492435
tables = TableFinder(page, settings=tset)
23502436

0 commit comments

Comments
 (0)