7979from collections .abc import Sequence
8080from dataclasses import dataclass
8181from operator import itemgetter
82+ import weakref
8283
8384# -------------------------------------------------------------------
8485# Start of PyMuPDF interface code
@@ -1367,33 +1368,57 @@ def char_in_bbox(char, bbox) -> bool:
13671368
13681369 return table_arr
13691370
1370- def to_markdown (self , clean = True ):
1371+ def to_markdown (self , clean = False , fill_empty = True ):
13711372 """Output table content as a string in Github-markdown format.
13721373
1373- If clean is true, markdown syntax is removed from cell content."""
1374+ If "clean" then markdown syntax is removed from cell content.
1375+ If "fill_empty" then cell content None is replaced by the values
1376+ above (columns) or left (rows) in an effort to approximate row and
1377+ columns spans.
1378+
1379+ """
13741380 output = "|"
1381+ rows = self .row_count
1382+ cols = self .col_count
1383+ cells = self .extract ()[:] # make local copy of table text content
1384+
1385+ if fill_empty : # fill "None" cells where possible
1386+
1387+ # for rows, copy content from left to right
1388+ for j in range (rows ):
1389+ for i in range (cols - 1 ):
1390+ if cells [j ][i + 1 ] is None :
1391+ cells [j ][i + 1 ] = cells [j ][i ]
13751392
1376- # generate header string and MD underline
1393+ # for columns, copy top to bottom
1394+ for i in range (cols ):
1395+ for j in range (rows - 1 ):
1396+ if cells [j + 1 ][i ] is None :
1397+ cells [j + 1 ][i ] = cells [j ][i ]
1398+
1399+ # generate header string and MD separator
13771400 for i , name in enumerate (self .header .names ):
1378- if name is None or name == "" : # generate a name if empty
1401+ if not name : # generate a name if empty
13791402 name = f"Col{ i + 1 } "
1380- name = name .replace ("\n " , " " ) # remove any line breaks
1403+ name = name .replace ("\n " , "<br> " ) # use HTML line breaks
13811404 if clean : # remove sensitive syntax
13821405 name = html .escape (name .replace ("-" , "-" ))
13831406 output += name + "|"
13841407
13851408 output += "\n "
1409+ # insert GitHub header line separator
13861410 output += "|" + "|" .join ("---" for i in range (self .col_count )) + "|\n "
13871411
13881412 # skip first row in details if header is part of the table
13891413 j = 0 if self .header .external else 1
13901414
13911415 # iterate over detail rows
1392- for row in self . extract () [j :]:
1416+ for row in cells [j :]:
13931417 line = "|"
13941418 for i , cell in enumerate (row ):
1395- # output None cells with empty string
1396- cell = "" if cell is None else cell .replace ("\n " , " " )
1419+ # replace None cells with empty string
1420+ # use HTML line break tag
1421+ cell = "" if not cell else cell .replace ("\n " , "<br>" )
13971422 if clean : # remove sensitive syntax
13981423 cell = html .escape (cell .replace ("-" , "-" ))
13991424 line += cell + "|"
@@ -1462,22 +1487,33 @@ def _get_header(self, y_tolerance=3):
14621487 page = self .page
14631488 y_delta = y_tolerance
14641489
1465- def top_row_is_bold (bbox ):
1466- """Check if row 0 has bold text anywhere.
def top_row_bg_color(self):
    """Tell whether the top table row stands out by its background color.

    The top-usage color of the first row's rectangle is compared with the
    top-usage color of an equally sized rectangle placed directly above
    it. Both are taken from pixmaps rendered from ``page``, a variable of
    the enclosing function.

    Returns True if the two colors differ, in which case the caller
    treats the table's own top row as the header; otherwise False.
    """
    first_row = Rect(self.rows[0].bbox)
    # Same-sized rectangle moved up by exactly one row height.
    # NOTE(review): for a table starting at the very top of the page this
    # area may reach beyond the page - confirm get_pixmap copes with that.
    area_above = first_row + (0, -first_row.height, 0, -first_row.height)
    # item [1] of color_topusage() is used as the color value
    row_color = page.get_pixmap(clip=first_row).color_topusage()[1]
    above_color = page.get_pixmap(clip=area_above).color_topusage()[1]
    return row_color != above_color
14671503
1468- If this is true, then any non-bold text in lines above disqualify
1469- these lines as header .
def row_has_bold(bbox):
    """Report whether any text span inside ``bbox`` is bold.

    The text is read from ``page`` (a variable of the enclosing function)
    restricted to ``bbox``. The enclosing function uses the result for
    the top row, the second row and the area above the table: a bold top
    row is accepted as the (internal) column header if the second table
    row has no bold span, or if the text line above the table is not bold.

    Returns True if at least one span has the bold flag set, else False.
    """
    text = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)
    # flatten block -> line -> span into one list of spans
    spans = []
    for block in text["blocks"]:
        for line in block["lines"]:
            spans.extend(line["spans"])
    # bit value 16 of the span flags is tested as the bold indicator
    return any(span["flags"] & 16 for span in spans)
14811517
14821518 try :
14831519 row = self .rows [0 ]
@@ -1489,44 +1525,57 @@ def top_row_is_bold(bbox):
14891525 # return this if we determine that the top row is the header
14901526 header_top_row = TableHeader (bbox , cells , self .extract ()[0 ], False )
14911527
1492- # one -line tables have no extra header
1528+ # 1 -line tables have no extra header
14931529 if len (self .rows ) < 2 :
14941530 return header_top_row
14951531
1496- # x-ccordinates of columns between x0 and x1 of the table
1532+ # 1-column tables have no extra header
14971533 if len (cells ) < 2 :
14981534 return header_top_row
14991535
1500- col_x = [
1501- c [2 ] if c is not None else None for c in cells [:- 1 ]
1502- ] # column (x) coordinates
1536+ # assume top row is the header if second row is empty
1537+ row2 = self .rows [1 ] # second row
1538+ if all ([c is None for c in row2 .cells ]):
1539+ return header_top_row
15031540
15041541 # Special check: is top row bold?
1505- # If first line above table is not bold, but top-left table cell is bold,
1506- # we take first table row as header
1507- top_row_bold = top_row_is_bold (bbox )
1542+ top_row_bold = row_has_bold (bbox )
1543+
1544+ # assume top row is header if it is bold and any cell
1545+ # of 2nd row is non-bold
1546+ if top_row_bold and not row_has_bold (row2 .bbox ):
1547+ return header_top_row
1548+
1549+ if top_row_bg_color (self ):
1550+ return header_top_row
15081551
1509- # clip = area above table
1552+ # column coordinates (x1 values) in top row
1553+ col_x = [c [2 ] if c is not None else None for c in cells [:- 1 ]]
1554+
1555+ # clip = page area above the table
15101556 # We will inspect this area for text qualifying as column header.
15111557 clip = + bbox # take row 0 bbox
15121558 clip .y0 = 0 # start at top of page
15131559 clip .y1 = bbox .y0 # end at top of table
15141560
1515- spans = [] # the text spans inside clip
1516- for b in page .get_text ("dict" , clip = clip , flags = TEXTFLAGS_TEXT )["blocks" ]:
1517- for l in b ["lines" ]:
1518- for s in l ["spans" ]:
1519- if (
1520- not s ["flags" ] & 1 and s ["text" ].strip ()
1521- ): # ignore superscripts and empty text
1522- spans .append (s )
1561+ blocks = page .get_text ("dict" , clip = clip , flags = TEXTFLAGS_TEXT )["blocks" ]
1562+ # non-empty, non-superscript spans above table, sorted descending by y1
1563+ spans = sorted (
1564+ [
1565+ s
1566+ for b in blocks
1567+ for l in b ["lines" ]
1568+ for s in l ["spans" ]
1569+ if not (white_spaces .issuperset (s ["text" ]) or s ["flags" ] & 1 )
1570+ ],
1571+ key = lambda s : s ["bbox" ][3 ],
1572+ reverse = True ,
1573+ )
15231574
15241575 select = [] # y1 coordinates above, sorted descending
15251576 line_heights = [] # line heights above, sorted descending
15261577 line_bolds = [] # bold indicator per line above, same sorting
15271578
1528- # spans sorted descending
1529- spans .sort (key = lambda s : s ["bbox" ][3 ], reverse = True )
15301579 # walk through the spans and fill above 3 lists
15311580 for i in range (len (spans )):
15321581 s = spans [i ]
@@ -1541,7 +1590,7 @@ def top_row_is_bold(bbox):
15411590 line_bolds .append (bold )
15421591 continue
15431592
1544- # get last items from the 3 lists
1593+ # get previous items from the 3 lists
15451594 y0 = select [- 1 ]
15461595 h0 = line_heights [- 1 ]
15471596 bold0 = line_bolds [- 1 ]
@@ -1565,13 +1614,13 @@ def top_row_is_bold(bbox):
15651614 if select == []: # nothing above the table?
15661615 return header_top_row
15671616
1568- select = select [:5 ] # only accept up to 5 lines in any header
1617+ select = select [:5 ] # accept up to 5 lines for an external header
15691618
1570- # take top row as header if text above table is too far apart
1619+ # assume top row as header if text above is too far away
15711620 if bbox .y0 - select [0 ] >= line_heights [0 ]:
15721621 return header_top_row
15731622
1574- # if top table row is bold, but line above is not:
1623+ # accept top row as header if bold, but line above is not
15751624 if top_row_bold and not line_bolds [0 ]:
15761625 return header_top_row
15771626
@@ -1738,7 +1787,7 @@ class TableFinder:
17381787 """
17391788
17401789 def __init__ (self , page , settings = None ):
1741- self .page = page
1790+ self .page = weakref . proxy ( page )
17421791 self .settings = TableSettings .resolve (settings )
17431792 self .edges = self .get_edges ()
17441793 self .intersections = edges_to_intersections (
@@ -1883,7 +1932,7 @@ def __getitem__(self, i):
18831932# -----------------------------------------------------------------------------
18841933def make_chars (page , clip = None ):
18851934 """Extract text as "rawdict" to fill CHARS."""
1886- global TEXTPAGE
1935+ global CHARS , TEXTPAGE
18871936 page_number = page .number + 1
18881937 page_height = page .rect .height
18891938 ctm = page .transformation_matrix
@@ -1942,7 +1991,8 @@ def make_chars(page, clip=None):
19421991# We are ignoring Bézier curves completely and are converting everything
19431992# else to lines.
19441993# ------------------------------------------------------------------------
1945- def make_edges (page , clip = None , tset = None , add_lines = None ):
1994+ def make_edges (page , clip = None , tset = None , paths = None , add_lines = None , add_boxes = None ):
1995+ global EDGES
19461996 snap_x = tset .snap_x_tolerance
19471997 snap_y = tset .snap_y_tolerance
19481998 min_length = tset .edge_min_length
@@ -1994,16 +2044,19 @@ def are_neighbors(r1, r2):
19942044 return True
19952045 return False
19962046
1997- def clean_graphics ():
2047+ def clean_graphics (npaths = None ):
19982048 """Detect and join rectangles of "connected" vector graphics."""
1999-
2000- paths = [] # paths relevant for table detection
2001- for p in page .get_drawings ():
2002- # ignore fill-only graphics if they do not simulate lines,
2003- # which means one of width or height are small.
2049+ if npaths is None :
2050+ allpaths = page .get_drawings ()
2051+ else : # accept passed-in vector graphics
2052+ allpaths = npaths [:] # paths relevant for table detection
2053+ paths = []
2054+ for p in allpaths :
2055+ # If only looking at lines, we ignore fill-only paths,
2056+ # except simulated lines (i.e. small width or height).
20042057 if (
2005- p [ "type" ] == "f"
2006- and lines_strict
2058+ lines_strict
2059+ and p [ "type" ] == "f"
20072060 and p ["rect" ].width > snap_x
20082061 and p ["rect" ].height > snap_y
20092062 ):
@@ -2038,7 +2091,7 @@ def clean_graphics():
20382091
20392092 return new_rects , paths
20402093
2041- bboxes , paths = clean_graphics ()
2094+ bboxes , paths = clean_graphics (npaths = paths )
20422095
20432096 def is_parallel (p1 , p2 ):
20442097 """Check if line is roughly axis-parallel."""
@@ -2209,6 +2262,25 @@ def make_line(p, p1, p2, clip):
22092262 if line_dict :
22102263 EDGES .append (line_to_edge (line_dict ))
22112264
2265+ if add_boxes is not None : # add user-specified rectangles
2266+ assert isinstance (add_boxes , (tuple , list ))
2267+ else :
2268+ add_boxes = []
2269+ for box in add_boxes :
2270+ r = Rect (box )
2271+ line_dict = make_line (path , r .tl , r .bl , clip )
2272+ if line_dict :
2273+ EDGES .append (line_to_edge (line_dict ))
2274+ line_dict = make_line (path , r .bl , r .br , clip )
2275+ if line_dict :
2276+ EDGES .append (line_to_edge (line_dict ))
2277+ line_dict = make_line (path , r .br , r .tr , clip )
2278+ if line_dict :
2279+ EDGES .append (line_to_edge (line_dict ))
2280+ line_dict = make_line (path , r .tr , r .tl , clip )
2281+ if line_dict :
2282+ EDGES .append (line_to_edge (line_dict ))
2283+
22122284
22132285def page_rotation_set0 (page ):
22142286 """Nullify page rotation.
@@ -2290,7 +2362,9 @@ def find_tables(
22902362 text_x_tolerance = 3 ,
22912363 text_y_tolerance = 3 ,
22922364 strategy = None , # offer abbreviation
2293- add_lines = None , # optional user-specified lines
2365+ add_lines = None , # user-specified lines
2366+ add_boxes = None , # user-specified rectangles
2367+ paths = None , # accept vector graphics as parameter
22942368):
22952369 global CHARS , EDGES
22962370 CHARS = []
@@ -2344,7 +2418,12 @@ def find_tables(
23442418
23452419 make_chars (page , clip = clip ) # create character list of page
23462420 make_edges (
2347- page , clip = clip , tset = tset , add_lines = add_lines
2421+ page ,
2422+ clip = clip ,
2423+ tset = tset ,
2424+ paths = paths ,
2425+ add_lines = add_lines ,
2426+ add_boxes = add_boxes ,
23482427 ) # create lines and curves
23492428 tables = TableFinder (page , settings = tset )
23502429
0 commit comments