Skip to content

Commit 528d06f

Browse files
committed
Table Detection Improvements
* Support new detection parameter "add_boxes", which allows specifying "virtual" rectangles to help detection.
* Support new parameter "paths", which allows passing in previously extracted vector graphics.
* Several minor improvements; most notably, line breaks inside table cells are now exported as HTML "<br>" tags instead of replacing "\n" with spaces.
1 parent 71715cf commit 528d06f

File tree

2 files changed

+231
-87
lines changed

2 files changed

+231
-87
lines changed

src/table.py

Lines changed: 134 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
from collections.abc import Sequence
8080
from dataclasses import dataclass
8181
from operator import itemgetter
82+
import weakref
8283

8384
# -------------------------------------------------------------------
8485
# Start of PyMuPDF interface code
@@ -1367,33 +1368,57 @@ def char_in_bbox(char, bbox) -> bool:
13671368

13681369
return table_arr
13691370

1370-
def to_markdown(self, clean=True):
1371+
def to_markdown(self, clean=False, fill_empty=True):
13711372
"""Output table content as a string in Github-markdown format.
13721373
1373-
If clean is true, markdown syntax is removed from cell content."""
1374+
If "clean" then markdown syntax is removed from cell content.
1375+
If "fill_empty" then cell content None is replaced by the values
1376+
above (columns) or left (rows) in an effort to approximate row and
1377+
column spans.
1378+
1379+
"""
13741380
output = "|"
1381+
rows = self.row_count
1382+
cols = self.col_count
1383+
cells = self.extract()[:] # make local copy of table text content
1384+
1385+
if fill_empty: # fill "None" cells where possible
1386+
1387+
# for rows, copy content from left to right
1388+
for j in range(rows):
1389+
for i in range(cols - 1):
1390+
if cells[j][i + 1] is None:
1391+
cells[j][i + 1] = cells[j][i]
13751392

1376-
# generate header string and MD underline
1393+
# for columns, copy top to bottom
1394+
for i in range(cols):
1395+
for j in range(rows - 1):
1396+
if cells[j + 1][i] is None:
1397+
cells[j + 1][i] = cells[j][i]
1398+
1399+
# generate header string and MD separator
13771400
for i, name in enumerate(self.header.names):
1378-
if name is None or name == "": # generate a name if empty
1401+
if not name: # generate a name if empty
13791402
name = f"Col{i+1}"
1380-
name = name.replace("\n", " ") # remove any line breaks
1403+
name = name.replace("\n", "<br>") # use HTML line breaks
13811404
if clean: # remove sensitive syntax
13821405
name = html.escape(name.replace("-", "&#45;"))
13831406
output += name + "|"
13841407

13851408
output += "\n"
1409+
# insert GitHub header line separator
13861410
output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
13871411

13881412
# skip first row in details if header is part of the table
13891413
j = 0 if self.header.external else 1
13901414

13911415
# iterate over detail rows
1392-
for row in self.extract()[j:]:
1416+
for row in cells[j:]:
13931417
line = "|"
13941418
for i, cell in enumerate(row):
1395-
# output None cells with empty string
1396-
cell = "" if cell is None else cell.replace("\n", " ")
1419+
# replace None cells with empty string
1420+
# use HTML line break tag
1421+
cell = "" if not cell else cell.replace("\n", "<br>")
13971422
if clean: # remove sensitive syntax
13981423
cell = html.escape(cell.replace("-", "&#45;"))
13991424
line += cell + "|"
@@ -1462,22 +1487,33 @@ def _get_header(self, y_tolerance=3):
14621487
page = self.page
14631488
y_delta = y_tolerance
14641489

1465-
def top_row_is_bold(bbox):
1466-
"""Check if row 0 has bold text anywhere.
1490+
def top_row_bg_color(self):
1491+
"""
1492+
Compare top row background color with color of same-sized bbox
1493+
above. If different, return True indicating that the original
1494+
table top row is already the header.
1495+
"""
1496+
bbox0 = Rect(self.rows[0].bbox)
1497+
bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above
1498+
top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
1499+
top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
1500+
if top_color0 != top_colort:
1501+
return True # top row is header
1502+
return False
14671503

1468-
If this is true, then any non-bold text in lines above disqualify
1469-
these lines as header.
1504+
def row_has_bold(bbox):
1505+
"""Check if a row contains some bold text.
14701506
1471-
bbox is the (potentially repaired) row 0 bbox.
1507+
If e.g. true for the top row, then it will be used as (internal)
1508+
column header row if any of the following is true:
1509+
* the previous (above) text line has no bold span
1510+
* the second table row text has no bold span
14721511
1473-
Returns True or False
1512+
Returns True if any spans are bold else False.
14741513
"""
1475-
for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
1476-
for l in b["lines"]:
1477-
for s in l["spans"]:
1478-
if s["flags"] & 16:
1479-
return True
1480-
return False
1514+
blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
1515+
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
1516+
return any([bool(s["flags"] & 16) for s in spans]) # pylint: disable=R1729
14811517

14821518
try:
14831519
row = self.rows[0]
@@ -1489,44 +1525,57 @@ def top_row_is_bold(bbox):
14891525
# return this if we determine that the top row is the header
14901526
header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
14911527

1492-
# one-line tables have no extra header
1528+
# 1-line tables have no extra header
14931529
if len(self.rows) < 2:
14941530
return header_top_row
14951531

1496-
# x-ccordinates of columns between x0 and x1 of the table
1532+
# 1-column tables have no extra header
14971533
if len(cells) < 2:
14981534
return header_top_row
14991535

1500-
col_x = [
1501-
c[2] if c is not None else None for c in cells[:-1]
1502-
] # column (x) coordinates
1536+
# assume top row is the header if second row is empty
1537+
row2 = self.rows[1] # second row
1538+
if all([c is None for c in row2.cells]): # pylint: disable=R1729
1539+
return header_top_row
15031540

15041541
# Special check: is top row bold?
1505-
# If first line above table is not bold, but top-left table cell is bold,
1506-
# we take first table row as header
1507-
top_row_bold = top_row_is_bold(bbox)
1542+
top_row_bold = row_has_bold(bbox)
1543+
1544+
# assume top row is header if it is bold and any cell
1545+
# of 2nd row is non-bold
1546+
if top_row_bold and not row_has_bold(row2.bbox):
1547+
return header_top_row
1548+
1549+
if top_row_bg_color(self):
1550+
return header_top_row
15081551

1509-
# clip = area above table
1552+
# column coordinates (x1 values) in top row
1553+
col_x = [c[2] if c is not None else None for c in cells[:-1]]
1554+
1555+
# clip = page area above the table
15101556
# We will inspect this area for text qualifying as column header.
15111557
clip = +bbox # take row 0 bbox
15121558
clip.y0 = 0 # start at top of page
15131559
clip.y1 = bbox.y0 # end at top of table
15141560

1515-
spans = [] # the text spans inside clip
1516-
for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
1517-
for l in b["lines"]:
1518-
for s in l["spans"]:
1519-
if (
1520-
not s["flags"] & 1 and s["text"].strip()
1521-
): # ignore superscripts and empty text
1522-
spans.append(s)
1561+
blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
1562+
# non-empty, non-superscript spans above table, sorted descending by y1
1563+
spans = sorted(
1564+
[
1565+
s
1566+
for b in blocks
1567+
for l in b["lines"]
1568+
for s in l["spans"]
1569+
if not (white_spaces.issuperset(s["text"]) or s["flags"] & 1)
1570+
],
1571+
key=lambda s: s["bbox"][3],
1572+
reverse=True,
1573+
)
15231574

15241575
select = [] # y1 coordinates above, sorted descending
15251576
line_heights = [] # line heights above, sorted descending
15261577
line_bolds = [] # bold indicator per line above, same sorting
15271578

1528-
# spans sorted descending
1529-
spans.sort(key=lambda s: s["bbox"][3], reverse=True)
15301579
# walk through the spans and fill above 3 lists
15311580
for i in range(len(spans)):
15321581
s = spans[i]
@@ -1541,7 +1590,7 @@ def top_row_is_bold(bbox):
15411590
line_bolds.append(bold)
15421591
continue
15431592

1544-
# get last items from the 3 lists
1593+
# get previous items from the 3 lists
15451594
y0 = select[-1]
15461595
h0 = line_heights[-1]
15471596
bold0 = line_bolds[-1]
@@ -1565,13 +1614,13 @@ def top_row_is_bold(bbox):
15651614
if select == []: # nothing above the table?
15661615
return header_top_row
15671616

1568-
select = select[:5] # only accept up to 5 lines in any header
1617+
select = select[:5] # accept up to 5 lines for an external header
15691618

1570-
# take top row as header if text above table is too far apart
1619+
# assume top row as header if text above is too far away
15711620
if bbox.y0 - select[0] >= line_heights[0]:
15721621
return header_top_row
15731622

1574-
# if top table row is bold, but line above is not:
1623+
# accept top row as header if bold, but line above is not
15751624
if top_row_bold and not line_bolds[0]:
15761625
return header_top_row
15771626

@@ -1738,7 +1787,7 @@ class TableFinder:
17381787
"""
17391788

17401789
def __init__(self, page, settings=None):
1741-
self.page = page
1790+
self.page = weakref.proxy(page)
17421791
self.settings = TableSettings.resolve(settings)
17431792
self.edges = self.get_edges()
17441793
self.intersections = edges_to_intersections(
@@ -1942,7 +1991,7 @@ def make_chars(page, clip=None):
19421991
# We are ignoring Bézier curves completely and are converting everything
19431992
# else to lines.
19441993
# ------------------------------------------------------------------------
1945-
def make_edges(page, clip=None, tset=None, add_lines=None):
1994+
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
19461995
snap_x = tset.snap_x_tolerance
19471996
snap_y = tset.snap_y_tolerance
19481997
min_length = tset.edge_min_length
@@ -1994,16 +2043,19 @@ def are_neighbors(r1, r2):
19942043
return True
19952044
return False
19962045

1997-
def clean_graphics():
2046+
def clean_graphics(npaths=None):
19982047
"""Detect and join rectangles of "connected" vector graphics."""
1999-
2000-
paths = [] # paths relevant for table detection
2001-
for p in page.get_drawings():
2002-
# ignore fill-only graphics if they do not simulate lines,
2003-
# which means one of width or height are small.
2048+
if npaths is None:
2049+
allpaths = page.get_drawings()
2050+
else: # accept passed-in vector graphics
2051+
allpaths = npaths[:] # paths relevant for table detection
2052+
paths = []
2053+
for p in allpaths:
2054+
# If only looking at lines, we ignore fill-only paths,
2055+
# except simulated lines (i.e. small width or height).
20042056
if (
2005-
p["type"] == "f"
2006-
and lines_strict
2057+
lines_strict
2058+
and p["type"] == "f"
20072059
and p["rect"].width > snap_x
20082060
and p["rect"].height > snap_y
20092061
):
@@ -2038,7 +2090,7 @@ def clean_graphics():
20382090

20392091
return new_rects, paths
20402092

2041-
bboxes, paths = clean_graphics()
2093+
bboxes, paths = clean_graphics(npaths=paths)
20422094

20432095
def is_parallel(p1, p2):
20442096
"""Check if line is roughly axis-parallel."""
@@ -2209,6 +2261,25 @@ def make_line(p, p1, p2, clip):
22092261
if line_dict:
22102262
EDGES.append(line_to_edge(line_dict))
22112263

2264+
if add_boxes is not None: # add user-specified rectangles
2265+
assert isinstance(add_boxes, (tuple, list))
2266+
else:
2267+
add_boxes = []
2268+
for box in add_boxes:
2269+
r = Rect(box)
2270+
line_dict = make_line(path, r.tl, r.bl, clip)
2271+
if line_dict:
2272+
EDGES.append(line_to_edge(line_dict))
2273+
line_dict = make_line(path, r.bl, r.br, clip)
2274+
if line_dict:
2275+
EDGES.append(line_to_edge(line_dict))
2276+
line_dict = make_line(path, r.br, r.tr, clip)
2277+
if line_dict:
2278+
EDGES.append(line_to_edge(line_dict))
2279+
line_dict = make_line(path, r.tr, r.tl, clip)
2280+
if line_dict:
2281+
EDGES.append(line_to_edge(line_dict))
2282+
22122283

22132284
def page_rotation_set0(page):
22142285
"""Nullify page rotation.
@@ -2290,7 +2361,9 @@ def find_tables(
22902361
text_x_tolerance=3,
22912362
text_y_tolerance=3,
22922363
strategy=None, # offer abbreviation
2293-
add_lines=None, # optional user-specified lines
2364+
add_lines=None, # user-specified lines
2365+
add_boxes=None, # user-specified rectangles
2366+
paths=None, # accept vector graphics as parameter
22942367
):
22952368
global CHARS, EDGES
22962369
CHARS = []
@@ -2344,7 +2417,12 @@ def find_tables(
23442417

23452418
make_chars(page, clip=clip) # create character list of page
23462419
make_edges(
2347-
page, clip=clip, tset=tset, add_lines=add_lines
2420+
page,
2421+
clip=clip,
2422+
tset=tset,
2423+
paths=paths,
2424+
add_lines=add_lines,
2425+
add_boxes=add_boxes,
23482426
) # create lines and curves
23492427
tables = TableFinder(page, settings=tset)
23502428

0 commit comments

Comments
 (0)