7979from collections .abc import Sequence
8080from dataclasses import dataclass
8181from operator import itemgetter
82+ import weakref
8283
8384# -------------------------------------------------------------------
8485# Start of PyMuPDF interface code
@@ -1367,33 +1368,57 @@ def char_in_bbox(char, bbox) -> bool:
13671368
13681369 return table_arr
13691370
1370- def to_markdown (self , clean = True ):
1371+ def to_markdown (self , clean = False , fill_empty = True ):
13711372 """Output table content as a string in Github-markdown format.
13721373
1373- If clean is true, markdown syntax is removed from cell content."""
1374+ If "clean" then markdown syntax is removed from cell content.
1375+ If "fill_empty" then cell content None is replaced by the values
1376+ above (columns) or left (rows) in an effort to approximate row and
1377+ column spans.
1378+
1379+ """
13741380 output = "|"
1381+ rows = self .row_count
1382+ cols = self .col_count
1383+ cells = self .extract ()[:] # make local copy of table text content
1384+
1385+ if fill_empty : # fill "None" cells where possible
13751386
1376- # generate header string and MD underline
1387+ # for rows, copy content from left to right
1388+ for j in range (rows ):
1389+ for i in range (cols - 1 ):
1390+ if cells [j ][i + 1 ] is None :
1391+ cells [j ][i + 1 ] = cells [j ][i ]
1392+
1393+ # for columns, copy top to bottom
1394+ for i in range (cols ):
1395+ for j in range (rows - 1 ):
1396+ if cells [j + 1 ][i ] is None :
1397+ cells [j + 1 ][i ] = cells [j ][i ]
1398+
1399+ # generate header string and MD separator
13771400 for i , name in enumerate (self .header .names ):
1378- if name is None or name == "" : # generate a name if empty
1401+ if not name : # generate a name if empty
13791402 name = f"Col{ i + 1 } "
1380- name = name .replace ("\n " , " " ) # remove any line breaks
1403+ name = name .replace ("\n " , "<br> " ) # use HTML line breaks
13811404 if clean : # remove sensitive syntax
13821405 name = html .escape (name .replace ("-" , "-" ))
13831406 output += name + "|"
13841407
13851408 output += "\n "
1409+ # insert GitHub header line separator
13861410 output += "|" + "|" .join ("---" for i in range (self .col_count )) + "|\n "
13871411
13881412 # skip first row in details if header is part of the table
13891413 j = 0 if self .header .external else 1
13901414
13911415 # iterate over detail rows
1392- for row in self . extract () [j :]:
1416+ for row in cells [j :]:
13931417 line = "|"
13941418 for i , cell in enumerate (row ):
1395- # output None cells with empty string
1396- cell = "" if cell is None else cell .replace ("\n " , " " )
1419+ # replace None cells with empty string
1420+ # use HTML line break tag
1421+ cell = "" if not cell else cell .replace ("\n " , "<br>" )
13971422 if clean : # remove sensitive syntax
13981423 cell = html .escape (cell .replace ("-" , "-" ))
13991424 line += cell + "|"
@@ -1462,22 +1487,19 @@ def _get_header(self, y_tolerance=3):
14621487 page = self .page
14631488 y_delta = y_tolerance
14641489
1465- def top_row_is_bold (bbox ):
1466- """Check if row 0 has bold text anywhere.
1467-
1468- If this is true, then any non-bold text in lines above disqualify
1469- these lines as header.
1490+ def row_has_bold (bbox ):
1491+ """Check if a row contains some bold text.
14701492
1471- bbox is the (potentially repaired) row 0 bbox.
1493+ If e.g. true for the top row, then it will be used as (internal)
1494+ column header row if any of the following is true:
1495+ * the previous (above) text line has no bold span
1496+ * the second table row text has no bold span
14721497
1473- Returns True or False
1498+ Returns True if any spans are bold else False.
14741499 """
1475- for b in page .get_text ("dict" , flags = TEXTFLAGS_TEXT , clip = bbox )["blocks" ]:
1476- for l in b ["lines" ]:
1477- for s in l ["spans" ]:
1478- if s ["flags" ] & 16 :
1479- return True
1480- return False
1500+ blocks = page .get_text ("dict" , flags = TEXTFLAGS_TEXT , clip = bbox )["blocks" ]
1501+ spans = [s for b in blocks for l in b ["lines" ] for s in l ["spans" ]]
1502+ return any ([bool (s ["flags" ] & 16 ) for s in spans ])
14811503
14821504 try :
14831505 row = self .rows [0 ]
@@ -1489,44 +1511,54 @@ def top_row_is_bold(bbox):
14891511 # return this if we determine that the top row is the header
14901512 header_top_row = TableHeader (bbox , cells , self .extract ()[0 ], False )
14911513
1492- # one -line tables have no extra header
1514+ # 1 -line tables have no extra header
14931515 if len (self .rows ) < 2 :
14941516 return header_top_row
14951517
1496- # x-ccordinates of columns between x0 and x1 of the table
1518+ # 1-column tables have no extra header
14971519 if len (cells ) < 2 :
14981520 return header_top_row
14991521
1500- col_x = [
1501- c [2 ] if c is not None else None for c in cells [:- 1 ]
1502- ] # column (x) coordinates
1522+ # assume top row is the header if second row is empty
1523+ row2 = self .rows [1 ] # second row
1524+ if all ([c is None for c in row2 .cells ]):
1525+ return header_top_row
15031526
15041527 # Special check: is top row bold?
1505- # If first line above table is not bold, but top-left table cell is bold,
1506- # we take first table row as header
1507- top_row_bold = top_row_is_bold (bbox )
1528+ top_row_bold = row_has_bold (bbox )
1529+
1530+ # assume top row is header if it contains bold text and
1531+ # the 2nd row contains no bold text at all
1532+ if top_row_bold and not row_has_bold (row2 .bbox ):
1533+ return header_top_row
15081534
1509- # clip = area above table
1535+ # column coordinates (x1 values) in top row
1536+ col_x = [c [2 ] if c is not None else None for c in cells [:- 1 ]]
1537+
1538+ # clip = page area above the table
15101539 # We will inspect this area for text qualifying as column header.
15111540 clip = + bbox # take row 0 bbox
15121541 clip .y0 = 0 # start at top of page
15131542 clip .y1 = bbox .y0 # end at top of table
15141543
1515- spans = [] # the text spans inside clip
1516- for b in page .get_text ("dict" , clip = clip , flags = TEXTFLAGS_TEXT )["blocks" ]:
1517- for l in b ["lines" ]:
1518- for s in l ["spans" ]:
1519- if (
1520- not s ["flags" ] & 1 and s ["text" ].strip ()
1521- ): # ignore superscripts and empty text
1522- spans .append (s )
1544+ blocks = page .get_text ("dict" , clip = clip , flags = TEXTFLAGS_TEXT )["blocks" ]
1545+ # non-empty, non-superscript spans above table, sorted descending by y1
1546+ spans = sorted (
1547+ [
1548+ s
1549+ for b in blocks
1550+ for l in b ["lines" ]
1551+ for s in l ["spans" ]
1552+ if not (white_spaces .issuperset (s ["text" ]) or s ["flags" ] & 1 )
1553+ ],
1554+ key = lambda s : s ["bbox" ][3 ],
1555+ reverse = True ,
1556+ )
15231557
15241558 select = [] # y1 coordinates above, sorted descending
15251559 line_heights = [] # line heights above, sorted descending
15261560 line_bolds = [] # bold indicator per line above, same sorting
15271561
1528- # spans sorted descending
1529- spans .sort (key = lambda s : s ["bbox" ][3 ], reverse = True )
15301562 # walk through the spans and fill above 3 lists
15311563 for i in range (len (spans )):
15321564 s = spans [i ]
@@ -1541,7 +1573,7 @@ def top_row_is_bold(bbox):
15411573 line_bolds .append (bold )
15421574 continue
15431575
1544- # get last items from the 3 lists
1576+ # get previous items from the 3 lists
15451577 y0 = select [- 1 ]
15461578 h0 = line_heights [- 1 ]
15471579 bold0 = line_bolds [- 1 ]
@@ -1565,13 +1597,13 @@ def top_row_is_bold(bbox):
15651597 if select == []: # nothing above the table?
15661598 return header_top_row
15671599
1568- select = select [:5 ] # only accept up to 5 lines in any header
1600+ select = select [:5 ] # accept up to 5 lines for an external header
15691601
1570- # take top row as header if text above table is too far apart
1602+ # assume top row as header if text above is too far away
15711603 if bbox .y0 - select [0 ] >= line_heights [0 ]:
15721604 return header_top_row
15731605
1574- # if top table row is bold, but line above is not:
1606+ # accept top row as header if bold, but line above is not
15751607 if top_row_bold and not line_bolds [0 ]:
15761608 return header_top_row
15771609
@@ -1738,7 +1770,7 @@ class TableFinder:
17381770 """
17391771
17401772 def __init__ (self , page , settings = None ):
1741- self .page = page
1773+ self .page = weakref . proxy ( page )
17421774 self .settings = TableSettings .resolve (settings )
17431775 self .edges = self .get_edges ()
17441776 self .intersections = edges_to_intersections (
@@ -1942,7 +1974,7 @@ def make_chars(page, clip=None):
19421974# We are ignoring Bézier curves completely and are converting everything
19431975# else to lines.
19441976# ------------------------------------------------------------------------
1945- def make_edges (page , clip = None , tset = None , add_lines = None ):
1977+ def make_edges (page , clip = None , tset = None , paths = None , add_lines = None , add_boxes = None ):
19461978 global EDGES
19471979 snap_x = tset .snap_x_tolerance
19481980 snap_y = tset .snap_y_tolerance
@@ -1995,16 +2027,20 @@ def are_neighbors(r1, r2):
19952027 return True
19962028 return False
19972029
1998- def clean_graphics ():
2030+ def clean_graphics (npaths = None ):
19992031 """Detect and join rectangles of "connected" vector graphics."""
2000-
2001- paths = [] # paths relevant for table detection
2002- for p in page .get_drawings ():
2003- # ignore fill-only graphics if they do not simulate lines,
2004- # which means one of width or height are small.
2032+ if npaths is None :
2033+ allpaths = page .get_drawings ()
2034+ else :
2035+ allpaths = npaths [:] # paths relevant for table detection
2036+ paths = []
2037+ for p in allpaths :
2038+ # If only looking at lines, we ignore fill-only paths
2039+ # except when they simulate lines, i.e. their width or
2040+ # height does not exceed the snap tolerance.
20052041 if (
2006- p [ "type" ] == "f"
2007- and lines_strict
2042+ lines_strict
2043+ and p [ "type" ] == "f"
20082044 and p ["rect" ].width > snap_x
20092045 and p ["rect" ].height > snap_y
20102046 ):
@@ -2039,7 +2075,7 @@ def clean_graphics():
20392075
20402076 return new_rects , paths
20412077
2042- bboxes , paths = clean_graphics ()
2078+ bboxes , paths = clean_graphics (npaths = paths )
20432079
20442080 def is_parallel (p1 , p2 ):
20452081 """Check if line is roughly axis-parallel."""
@@ -2210,6 +2246,25 @@ def make_line(p, p1, p2, clip):
22102246 if line_dict :
22112247 EDGES .append (line_to_edge (line_dict ))
22122248
2249+ if add_boxes is not None : # add user-specified rectangles
2250+ assert isinstance (add_boxes , (tuple , list ))
2251+ else :
2252+ add_boxes = []
2253+ for box in add_boxes :
2254+ r = Rect (box )
2255+ line_dict = make_line (path , r .tl , r .bl , clip )
2256+ if line_dict :
2257+ EDGES .append (line_to_edge (line_dict ))
2258+ line_dict = make_line (path , r .bl , r .br , clip )
2259+ if line_dict :
2260+ EDGES .append (line_to_edge (line_dict ))
2261+ line_dict = make_line (path , r .br , r .tr , clip )
2262+ if line_dict :
2263+ EDGES .append (line_to_edge (line_dict ))
2264+ line_dict = make_line (path , r .tr , r .tl , clip )
2265+ if line_dict :
2266+ EDGES .append (line_to_edge (line_dict ))
2267+
22132268
22142269def page_rotation_set0 (page ):
22152270 """Nullify page rotation.
@@ -2291,7 +2346,9 @@ def find_tables(
22912346 text_x_tolerance = 3 ,
22922347 text_y_tolerance = 3 ,
22932348 strategy = None , # offer abbreviation
2294- add_lines = None , # optional user-specified lines
2349+ add_lines = None , # user-specified lines
2350+ add_boxes = None , # user-specified rectangles
2351+ paths = None , # accept vector graphics as parameter
22952352):
22962353 global CHARS , EDGES
22972354 CHARS = []
@@ -2345,7 +2402,12 @@ def find_tables(
23452402
23462403 make_chars (page , clip = clip ) # create character list of page
23472404 make_edges (
2348- page , clip = clip , tset = tset , add_lines = add_lines
2405+ page ,
2406+ clip = clip ,
2407+ tset = tset ,
2408+ paths = paths ,
2409+ add_lines = add_lines ,
2410+ add_boxes = add_boxes ,
23492411 ) # create lines and curves
23502412 tables = TableFinder (page , settings = tset )
23512413
0 commit comments