7979from collections .abc import Sequence
8080from dataclasses import dataclass
8181from operator import itemgetter
82+ import weakref
8283
8384# -------------------------------------------------------------------
8485# Start of PyMuPDF interface code
8788 Rect ,
8889 Matrix ,
8990 TEXTFLAGS_TEXT ,
91+ TEXT_FONT_BOLD ,
92+ TEXT_FONT_SUPERSCRIPT ,
9093 TOOLS ,
9194 EMPTY_RECT ,
9295 sRGB_to_pdf ,
@@ -1061,7 +1064,7 @@ def get_center(word):
10611064 if not overlap :
10621065 condensed_bboxes .append (bbox )
10631066
1064- if len ( condensed_bboxes ) == 0 :
1067+ if not condensed_bboxes :
10651068 return []
10661069
10671070 condensed_rects = map (bbox_to_rect , condensed_bboxes )
@@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool:
13671370
13681371 return table_arr
13691372
1370- def to_markdown (self , clean = True ):
1373+ def to_markdown (self , clean = False , fill_empty = True ):
13711374 """Output table content as a string in Github-markdown format.
13721375
1373- If clean is true, markdown syntax is removed from cell content."""
1376+ If "clean" then markdown syntax is removed from cell content.
1377+ If "fill_empty" then cell content None is replaced by the values
1378+ above (columns) or left (rows) in an effort to approximate row and
1379+ columns spans.
1380+
1381+ """
13741382 output = "|"
1383+ rows = self .row_count
1384+ cols = self .col_count
1385+ cells = self .extract ()[:] # make local copy of table text content
1386+
1387+ if fill_empty : # fill "None" cells where possible
1388+
1389+ # for rows, copy content from left to right
1390+ for j in range (rows ):
1391+ for i in range (cols - 1 ):
1392+ if cells [j ][i + 1 ] is None :
1393+ cells [j ][i + 1 ] = cells [j ][i ]
13751394
1376- # generate header string and MD underline
1395+ # for columns, copy top to bottom
1396+ for i in range (cols ):
1397+ for j in range (rows - 1 ):
1398+ if cells [j + 1 ][i ] is None :
1399+ cells [j + 1 ][i ] = cells [j ][i ]
1400+
1401+ # generate header string and MD separator
13771402 for i , name in enumerate (self .header .names ):
1378- if name is None or name == "" : # generate a name if empty
1403+ if not name : # generate a name if empty
13791404 name = f"Col{ i + 1 } "
1380- name = name .replace ("\n " , " " ) # remove any line breaks
1405+ name = name .replace ("\n " , "<br> " ) # use HTML line breaks
13811406 if clean : # remove sensitive syntax
13821407 name = html .escape (name .replace ("-" , "-" ))
13831408 output += name + "|"
13841409
13851410 output += "\n "
1411+ # insert GitHub header line separator
13861412 output += "|" + "|" .join ("---" for i in range (self .col_count )) + "|\n "
13871413
13881414 # skip first row in details if header is part of the table
13891415 j = 0 if self .header .external else 1
13901416
13911417 # iterate over detail rows
1392- for row in self . extract () [j :]:
1418+ for row in cells [j :]:
13931419 line = "|"
13941420 for i , cell in enumerate (row ):
1395- # output None cells with empty string
1396- cell = "" if cell is None else cell .replace ("\n " , " " )
1421+ # replace None cells with empty string
1422+ # use HTML line break tag
1423+ cell = "" if not cell else cell .replace ("\n " , "<br>" )
13971424 if clean : # remove sensitive syntax
13981425 cell = html .escape (cell .replace ("-" , "-" ))
13991426 line += cell + "|"
@@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3):
14621489 page = self .page
14631490 y_delta = y_tolerance
14641491
1465- def top_row_is_bold (bbox ):
1466- """Check if row 0 has bold text anywhere.
def top_row_bg_color(self):
    """Tell whether the top table row is colored differently from the area above.

    The bbox of table row 0 and a same-sized bbox moved up by one row
    height are each rendered to a pixmap. For both, the second item
    returned by ``color_topusage()`` is taken and the two are compared.

    Returns True if they differ, which is taken as a sign that the original
    top row of the table already is the header. Returns False otherwise.
    """
    row_rect = Rect(self.rows[0].bbox)
    shift = -row_rect.height
    above_rect = row_rect + (0, shift, 0, shift)  # same size, directly above

    def top_color(area):
        # second item of the color_topusage() result for the rendered area
        return page.get_pixmap(clip=area).color_topusage()[1]

    color_row = top_color(row_rect)
    color_above = top_color(above_rect)
    return bool(color_row != color_above)
14671505
1468- If this is true, then any non-bold text in lines above disqualify
1469- these lines as header .
def row_has_bold(bbox):
    """Report whether any text span inside ``bbox`` is flagged as bold.

    The page text is extracted in "dict" format, clipped to ``bbox``, and
    the flags of each span are tested against TEXT_FONT_BOLD.

    If e.g. this is true for the top row, that row will be used as the
    (internal) column header row if any of the following is true:
    * the previous (above) text line has no bold span
    * the second table row text has no bold span

    Returns True if at least one span is bold, else False.
    """
    text = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)

    # collect every span of every line of every block first
    all_spans = []
    for block in text["blocks"]:
        for line in block["lines"]:
            all_spans.extend(line["spans"])

    for span in all_spans:
        if span["flags"] & TEXT_FONT_BOLD:
            return True
    return False
14811520
14821521 try :
14831522 row = self .rows [0 ]
@@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox):
14891528 # return this if we determine that the top row is the header
14901529 header_top_row = TableHeader (bbox , cells , self .extract ()[0 ], False )
14911530
1492- # one -line tables have no extra header
1531+ # 1 -line tables have no extra header
14931532 if len (self .rows ) < 2 :
14941533 return header_top_row
14951534
1496- # x-ccordinates of columns between x0 and x1 of the table
1535+ # 1-column tables have no extra header
14971536 if len (cells ) < 2 :
14981537 return header_top_row
14991538
1500- col_x = [
1501- c [2 ] if c is not None else None for c in cells [:- 1 ]
1502- ] # column (x) coordinates
1539+ # assume top row is the header if second row is empty
1540+ row2 = self .rows [1 ] # second row
1541+ if all (c is None for c in row2 .cells ): # no valid cell bboxes in row2
1542+ return header_top_row
15031543
15041544 # Special check: is top row bold?
1505- # If first line above table is not bold, but top-left table cell is bold,
1506- # we take first table row as header
1507- top_row_bold = top_row_is_bold (bbox )
1545+ top_row_bold = row_has_bold (bbox )
1546+
1547+ # assume top row is header if it is bold and any cell
1548+ # of 2nd row is non-bold
1549+ if top_row_bold and not row_has_bold (row2 .bbox ):
1550+ return header_top_row
1551+
1552+ if top_row_bg_color (self ):
1553+ # if area above top row has a different background color,
1554+ # then top row is already the header
1555+ return header_top_row
15081556
1509- # clip = area above table
1557+ # column coordinates (x1 values) in top row
1558+ col_x = [c [2 ] if c is not None else None for c in cells [:- 1 ]]
1559+
1560+ # clip = page area above the table
15101561 # We will inspect this area for text qualifying as column header.
15111562 clip = + bbox # take row 0 bbox
15121563 clip .y0 = 0 # start at top of page
15131564 clip .y1 = bbox .y0 # end at top of table
15141565
1515- spans = [] # the text spans inside clip
1516- for b in page .get_text ("dict" , clip = clip , flags = TEXTFLAGS_TEXT )["blocks" ]:
1517- for l in b ["lines" ]:
1518- for s in l ["spans" ]:
1519- if (
1520- not s ["flags" ] & 1 and s ["text" ].strip ()
1521- ): # ignore superscripts and empty text
1522- spans .append (s )
1566+ blocks = page .get_text ("dict" , clip = clip , flags = TEXTFLAGS_TEXT )["blocks" ]
1567+ # non-empty, non-superscript spans above table, sorted descending by y1
1568+ spans = sorted (
1569+ [
1570+ s
1571+ for b in blocks
1572+ for l in b ["lines" ]
1573+ for s in l ["spans" ]
1574+ if not (
1575+ white_spaces .issuperset (s ["text" ])
1576+ or s ["flags" ] & TEXT_FONT_SUPERSCRIPT
1577+ )
1578+ ],
1579+ key = lambda s : s ["bbox" ][3 ],
1580+ reverse = True ,
1581+ )
15231582
15241583 select = [] # y1 coordinates above, sorted descending
15251584 line_heights = [] # line heights above, sorted descending
15261585 line_bolds = [] # bold indicator per line above, same sorting
15271586
1528- # spans sorted descending
1529- spans .sort (key = lambda s : s ["bbox" ][3 ], reverse = True )
15301587 # walk through the spans and fill above 3 lists
15311588 for i in range (len (spans )):
15321589 s = spans [i ]
15331590 y1 = s ["bbox" ][3 ] # span bottom
15341591 h = y1 - s ["bbox" ][1 ] # span bbox height
1535- bold = s ["flags" ] & 16
1592+ bold = s ["flags" ] & TEXT_FONT_BOLD
15361593
15371594 # use first item to start the lists
15381595 if i == 0 :
@@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox):
15411598 line_bolds .append (bold )
15421599 continue
15431600
1544- # get last items from the 3 lists
1601+ # get previous items from the 3 lists
15451602 y0 = select [- 1 ]
15461603 h0 = line_heights [- 1 ]
15471604 bold0 = line_bolds [- 1 ]
@@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox):
15651622 if select == []: # nothing above the table?
15661623 return header_top_row
15671624
1568- select = select [:5 ] # only accept up to 5 lines in any header
1625+ select = select [:5 ] # accept up to 5 lines for an external header
15691626
1570- # take top row as header if text above table is too far apart
1627+ # assume top row as header if text above is too far away
15711628 if bbox .y0 - select [0 ] >= line_heights [0 ]:
15721629 return header_top_row
15731630
1574- # if top table row is bold, but line above is not:
1631+ # accept top row as header if bold, but line above is not
15751632 if top_row_bold and not line_bolds [0 ]:
15761633 return header_top_row
15771634
@@ -1738,7 +1795,7 @@ class TableFinder:
17381795 """
17391796
17401797 def __init__ (self , page , settings = None ):
1741- self .page = page
1798+ self .page = weakref . proxy ( page )
17421799 self .settings = TableSettings .resolve (settings )
17431800 self .edges = self .get_edges ()
17441801 self .intersections = edges_to_intersections (
@@ -1942,7 +1999,7 @@ def make_chars(page, clip=None):
19421999# We are ignoring Bézier curves completely and are converting everything
19432000# else to lines.
19442001# ------------------------------------------------------------------------
1945- def make_edges (page , clip = None , tset = None , add_lines = None ):
2002+ def make_edges (page , clip = None , tset = None , paths = None , add_lines = None , add_boxes = None ):
19462003 snap_x = tset .snap_x_tolerance
19472004 snap_y = tset .snap_y_tolerance
19482005 min_length = tset .edge_min_length
@@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2):
19942051 return True
19952052 return False
19962053
1997- def clean_graphics ():
2054+ def clean_graphics (npaths = None ):
19982055 """Detect and join rectangles of "connected" vector graphics."""
1999-
2000- paths = [] # paths relevant for table detection
2001- for p in page .get_drawings ():
2002- # ignore fill-only graphics if they do not simulate lines,
2003- # which means one of width or height are small.
2056+ if npaths is None :
2057+ allpaths = page .get_drawings ()
2058+ else : # accept passed-in vector graphics
2059+ allpaths = npaths [:] # paths relevant for table detection
2060+ paths = []
2061+ for p in allpaths :
2062+ # If only looking at lines, we ignore fill-only paths,
2063+ # except simulated lines (i.e. small width or height).
20042064 if (
2005- p [ "type" ] == "f"
2006- and lines_strict
2065+ lines_strict
2066+ and p [ "type" ] == "f"
20072067 and p ["rect" ].width > snap_x
20082068 and p ["rect" ].height > snap_y
20092069 ):
@@ -2038,7 +2098,7 @@ def clean_graphics():
20382098
20392099 return new_rects , paths
20402100
2041- bboxes , paths = clean_graphics ()
2101+ bboxes , paths = clean_graphics (npaths = paths )
20422102
20432103 def is_parallel (p1 , p2 ):
20442104 """Check if line is roughly axis-parallel."""
@@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip):
22092269 if line_dict :
22102270 EDGES .append (line_to_edge (line_dict ))
22112271
2272+ if add_boxes is not None : # add user-specified rectangles
2273+ assert isinstance (add_boxes , (tuple , list ))
2274+ else :
2275+ add_boxes = []
2276+ for box in add_boxes :
2277+ r = Rect (box )
2278+ line_dict = make_line (path , r .tl , r .bl , clip )
2279+ if line_dict :
2280+ EDGES .append (line_to_edge (line_dict ))
2281+ line_dict = make_line (path , r .bl , r .br , clip )
2282+ if line_dict :
2283+ EDGES .append (line_to_edge (line_dict ))
2284+ line_dict = make_line (path , r .br , r .tr , clip )
2285+ if line_dict :
2286+ EDGES .append (line_to_edge (line_dict ))
2287+ line_dict = make_line (path , r .tr , r .tl , clip )
2288+ if line_dict :
2289+ EDGES .append (line_to_edge (line_dict ))
2290+
22122291
22132292def page_rotation_set0 (page ):
22142293 """Nullify page rotation.
@@ -2290,7 +2369,9 @@ def find_tables(
22902369 text_x_tolerance = 3 ,
22912370 text_y_tolerance = 3 ,
22922371 strategy = None , # offer abbreviation
2293- add_lines = None , # optional user-specified lines
2372+ add_lines = None , # user-specified lines
2373+ add_boxes = None , # user-specified rectangles
2374+ paths = None , # accept vector graphics as parameter
22942375):
22952376 global CHARS , EDGES
22962377 CHARS = []
@@ -2344,7 +2425,12 @@ def find_tables(
23442425
23452426 make_chars (page , clip = clip ) # create character list of page
23462427 make_edges (
2347- page , clip = clip , tset = tset , add_lines = add_lines
2428+ page ,
2429+ clip = clip ,
2430+ tset = tset ,
2431+ paths = paths ,
2432+ add_lines = add_lines ,
2433+ add_boxes = add_boxes ,
23482434 ) # create lines and curves
23492435 tables = TableFinder (page , settings = tset )
23502436
0 commit comments