Skip to content

Commit d293c70

Browse files
committed
Update table.py
1 parent 8657e4e commit d293c70

File tree

1 file changed

+119
-57
lines changed

1 file changed

+119
-57
lines changed

src/table.py

Lines changed: 119 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
from collections.abc import Sequence
8080
from dataclasses import dataclass
8181
from operator import itemgetter
82+
import weakref
8283

8384
# -------------------------------------------------------------------
8485
# Start of PyMuPDF interface code
@@ -1367,33 +1368,57 @@ def char_in_bbox(char, bbox) -> bool:
13671368

13681369
return table_arr
13691370

1370-
def to_markdown(self, clean=True):
1371+
def to_markdown(self, clean=False, fill_empty=True):
13711372
"""Output table content as a string in Github-markdown format.
13721373
1373-
If clean is true, markdown syntax is removed from cell content."""
1374+
If "clean" then markdown syntax is removed from cell content.
1375+
If "fill_empty" then cell content None is replaced by the values
1376+
above (columns) or left (rows) in an effort to approximate row and
1377+
column spans.
1378+
1379+
"""
13741380
output = "|"
1381+
rows = self.row_count
1382+
cols = self.col_count
1383+
cells = self.extract()[:] # make local copy of table text content
1384+
1385+
if fill_empty: # fill "None" cells where possible
13751386

1376-
# generate header string and MD underline
1387+
# for rows, copy content from left to right
1388+
for j in range(rows):
1389+
for i in range(cols - 1):
1390+
if cells[j][i + 1] is None:
1391+
cells[j][i + 1] = cells[j][i]
1392+
1393+
# for columns, copy top to bottom
1394+
for i in range(cols):
1395+
for j in range(rows - 1):
1396+
if cells[j + 1][i] is None:
1397+
cells[j + 1][i] = cells[j][i]
1398+
1399+
# generate header string and MD separator
13771400
for i, name in enumerate(self.header.names):
1378-
if name is None or name == "": # generate a name if empty
1401+
if not name: # generate a name if empty
13791402
name = f"Col{i+1}"
1380-
name = name.replace("\n", " ") # remove any line breaks
1403+
name = name.replace("\n", "<br>") # use HTML line breaks
13811404
if clean: # remove sensitive syntax
13821405
name = html.escape(name.replace("-", "&#45;"))
13831406
output += name + "|"
13841407

13851408
output += "\n"
1409+
# insert GitHub header line separator
13861410
output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
13871411

13881412
# skip first row in details if header is part of the table
13891413
j = 0 if self.header.external else 1
13901414

13911415
# iterate over detail rows
1392-
for row in self.extract()[j:]:
1416+
for row in cells[j:]:
13931417
line = "|"
13941418
for i, cell in enumerate(row):
1395-
# output None cells with empty string
1396-
cell = "" if cell is None else cell.replace("\n", " ")
1419+
# replace None cells with empty string
1420+
# use HTML line break tag
1421+
cell = "" if not cell else cell.replace("\n", "<br>")
13971422
if clean: # remove sensitive syntax
13981423
cell = html.escape(cell.replace("-", "&#45;"))
13991424
line += cell + "|"
@@ -1462,22 +1487,19 @@ def _get_header(self, y_tolerance=3):
14621487
page = self.page
14631488
y_delta = y_tolerance
14641489

1465-
def top_row_is_bold(bbox):
1466-
"""Check if row 0 has bold text anywhere.
1467-
1468-
If this is true, then any non-bold text in lines above disqualify
1469-
these lines as header.
1490+
def row_has_bold(bbox):
1491+
"""Check if a row contains some bold text.
14701492
1471-
bbox is the (potentially repaired) row 0 bbox.
1493+
If e.g. true for the top row, then it will be used as (internal)
1494+
column header row if any of the following is true:
1495+
* the previous (above) text line has no bold span
1496+
* the second table row text has no bold span
14721497
1473-
Returns True or False
1498+
Returns True if any spans are bold else False.
14741499
"""
1475-
for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
1476-
for l in b["lines"]:
1477-
for s in l["spans"]:
1478-
if s["flags"] & 16:
1479-
return True
1480-
return False
1500+
blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
1501+
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
1502+
return any([bool(s["flags"] & 16) for s in spans])
14811503

14821504
try:
14831505
row = self.rows[0]
@@ -1489,44 +1511,54 @@ def top_row_is_bold(bbox):
14891511
# return this if we determine that the top row is the header
14901512
header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
14911513

1492-
# one-line tables have no extra header
1514+
# 1-line tables have no extra header
14931515
if len(self.rows) < 2:
14941516
return header_top_row
14951517

1496-
# x-ccordinates of columns between x0 and x1 of the table
1518+
# 1-column tables have no extra header
14971519
if len(cells) < 2:
14981520
return header_top_row
14991521

1500-
col_x = [
1501-
c[2] if c is not None else None for c in cells[:-1]
1502-
] # column (x) coordinates
1522+
# assume top row is the header if second row is empty
1523+
row2 = self.rows[1] # second row
1524+
if all([c is None for c in row2.cells]):
1525+
return header_top_row
15031526

15041527
# Special check: is top row bold?
1505-
# If first line above table is not bold, but top-left table cell is bold,
1506-
# we take first table row as header
1507-
top_row_bold = top_row_is_bold(bbox)
1528+
top_row_bold = row_has_bold(bbox)
1529+
1530+
# assume top row is header if it has bold text and the
1531+
# 2nd row contains no bold text at all
1532+
if top_row_bold and not row_has_bold(row2.bbox):
1533+
return header_top_row
15081534

1509-
# clip = area above table
1535+
# column coordinates (x1 values) in top row
1536+
col_x = [c[2] if c is not None else None for c in cells[:-1]]
1537+
1538+
# clip = page area above the table
15101539
# We will inspect this area for text qualifying as column header.
15111540
clip = +bbox # take row 0 bbox
15121541
clip.y0 = 0 # start at top of page
15131542
clip.y1 = bbox.y0 # end at top of table
15141543

1515-
spans = [] # the text spans inside clip
1516-
for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
1517-
for l in b["lines"]:
1518-
for s in l["spans"]:
1519-
if (
1520-
not s["flags"] & 1 and s["text"].strip()
1521-
): # ignore superscripts and empty text
1522-
spans.append(s)
1544+
blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
1545+
# non-empty, non-superscript spans above table, sorted descending by y1
1546+
spans = sorted(
1547+
[
1548+
s
1549+
for b in blocks
1550+
for l in b["lines"]
1551+
for s in l["spans"]
1552+
if not (white_spaces.issuperset(s["text"]) or s["flags"] & 1)
1553+
],
1554+
key=lambda s: s["bbox"][3],
1555+
reverse=True,
1556+
)
15231557

15241558
select = [] # y1 coordinates above, sorted descending
15251559
line_heights = [] # line heights above, sorted descending
15261560
line_bolds = [] # bold indicator per line above, same sorting
15271561

1528-
# spans sorted descending
1529-
spans.sort(key=lambda s: s["bbox"][3], reverse=True)
15301562
# walk through the spans and fill above 3 lists
15311563
for i in range(len(spans)):
15321564
s = spans[i]
@@ -1541,7 +1573,7 @@ def top_row_is_bold(bbox):
15411573
line_bolds.append(bold)
15421574
continue
15431575

1544-
# get last items from the 3 lists
1576+
# get previous items from the 3 lists
15451577
y0 = select[-1]
15461578
h0 = line_heights[-1]
15471579
bold0 = line_bolds[-1]
@@ -1565,13 +1597,13 @@ def top_row_is_bold(bbox):
15651597
if select == []: # nothing above the table?
15661598
return header_top_row
15671599

1568-
select = select[:5] # only accept up to 5 lines in any header
1600+
select = select[:5] # accept up to 5 lines for an external header
15691601

1570-
# take top row as header if text above table is too far apart
1602+
# assume top row as header if text above is too far away
15711603
if bbox.y0 - select[0] >= line_heights[0]:
15721604
return header_top_row
15731605

1574-
# if top table row is bold, but line above is not:
1606+
# accept top row as header if bold, but line above is not
15751607
if top_row_bold and not line_bolds[0]:
15761608
return header_top_row
15771609

@@ -1738,7 +1770,7 @@ class TableFinder:
17381770
"""
17391771

17401772
def __init__(self, page, settings=None):
1741-
self.page = page
1773+
self.page = weakref.proxy(page)
17421774
self.settings = TableSettings.resolve(settings)
17431775
self.edges = self.get_edges()
17441776
self.intersections = edges_to_intersections(
@@ -1942,7 +1974,7 @@ def make_chars(page, clip=None):
19421974
# We are ignoring Bézier curves completely and are converting everything
19431975
# else to lines.
19441976
# ------------------------------------------------------------------------
1945-
def make_edges(page, clip=None, tset=None, add_lines=None):
1977+
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
19461978
global EDGES
19471979
snap_x = tset.snap_x_tolerance
19481980
snap_y = tset.snap_y_tolerance
@@ -1995,16 +2027,20 @@ def are_neighbors(r1, r2):
19952027
return True
19962028
return False
19972029

1998-
def clean_graphics():
2030+
def clean_graphics(npaths=None):
19992031
"""Detect and join rectangles of "connected" vector graphics."""
2000-
2001-
paths = [] # paths relevant for table detection
2002-
for p in page.get_drawings():
2003-
# ignore fill-only graphics if they do not simulate lines,
2004-
# which means one of width or height are small.
2032+
if npaths is None:
2033+
allpaths = page.get_drawings()
2034+
else:
2035+
allpaths = npaths[:] # paths relevant for table detection
2036+
paths = []
2037+
for p in allpaths:
2038+
# If only looking at lines, we ignore fill-only paths
2039+
# except when simulating lines, i.e. width or height
2040+
# are small.
20052041
if (
2006-
p["type"] == "f"
2007-
and lines_strict
2042+
lines_strict
2043+
and p["type"] == "f"
20082044
and p["rect"].width > snap_x
20092045
and p["rect"].height > snap_y
20102046
):
@@ -2039,7 +2075,7 @@ def clean_graphics():
20392075

20402076
return new_rects, paths
20412077

2042-
bboxes, paths = clean_graphics()
2078+
bboxes, paths = clean_graphics(npaths=paths)
20432079

20442080
def is_parallel(p1, p2):
20452081
"""Check if line is roughly axis-parallel."""
@@ -2210,6 +2246,25 @@ def make_line(p, p1, p2, clip):
22102246
if line_dict:
22112247
EDGES.append(line_to_edge(line_dict))
22122248

2249+
if add_boxes is not None: # add user-specified rectangles
2250+
assert isinstance(add_boxes, (tuple, list))
2251+
else:
2252+
add_boxes = []
2253+
for box in add_boxes:
2254+
r = Rect(box)
2255+
line_dict = make_line(path, r.tl, r.bl, clip)
2256+
if line_dict:
2257+
EDGES.append(line_to_edge(line_dict))
2258+
line_dict = make_line(path, r.bl, r.br, clip)
2259+
if line_dict:
2260+
EDGES.append(line_to_edge(line_dict))
2261+
line_dict = make_line(path, r.br, r.tr, clip)
2262+
if line_dict:
2263+
EDGES.append(line_to_edge(line_dict))
2264+
line_dict = make_line(path, r.tr, r.tl, clip)
2265+
if line_dict:
2266+
EDGES.append(line_to_edge(line_dict))
2267+
22132268

22142269
def page_rotation_set0(page):
22152270
"""Nullify page rotation.
@@ -2291,7 +2346,9 @@ def find_tables(
22912346
text_x_tolerance=3,
22922347
text_y_tolerance=3,
22932348
strategy=None, # offer abbreviation
2294-
add_lines=None, # optional user-specified lines
2349+
add_lines=None, # user-specified lines
2350+
add_boxes=None, # user-specified rectangles
2351+
paths=None, # accept vector graphics as parameter
22952352
):
22962353
global CHARS, EDGES
22972354
CHARS = []
@@ -2345,7 +2402,12 @@ def find_tables(
23452402

23462403
make_chars(page, clip=clip) # create character list of page
23472404
make_edges(
2348-
page, clip=clip, tset=tset, add_lines=add_lines
2405+
page,
2406+
clip=clip,
2407+
tset=tset,
2408+
paths=paths,
2409+
add_lines=add_lines,
2410+
add_boxes=add_boxes,
23492411
) # create lines and curves
23502412
tables = TableFinder(page, settings=tset)
23512413

0 commit comments

Comments
 (0)