1
1
#!/usr/bin/env python
2
2
3
3
import sys
4
- from bs4 import BeautifulSoup , Comment
4
+ from bs4 import BeautifulSoup , Comment , NavigableString
5
5
import fire
6
6
from pathlib import Path
7
7
import pandas as pd
@@ -69,6 +69,11 @@ def fix_table(df):
69
69
return unescape_table_content (df )
70
70
71
71
72
def move_out_references(table):
    """Tag every in-page link inside *table* with a textual reference marker.

    For each ``<a>`` element whose ``href`` starts with ``#``, a text node of
    the form ``[xxref-<target>]`` is appended as the anchor's last child,
    where ``<target>`` is the ``href`` value without its leading ``#``.

    The element tree is modified in place; nothing is returned.
    """
    for link in table.select('a[href^="#"]'):
        # The selector guarantees the href begins with "#", so dropping the
        # first character leaves just the fragment name.
        target = link["href"][1:]
        marker = NavigableString("[xxref-" + target + "]")
        link.append(marker)
75
+
76
+
72
77
def html2data (table ):
73
78
data = pd .read_html (str (table ), match = '' )
74
79
if len (data ) > 1 :
@@ -86,7 +91,7 @@ def save_tables(data, outdir):
86
91
for num , table in enumerate (data , 1 ):
87
92
filename = f"table_{ num :02} .csv"
88
93
save_table (table .data , outdir / filename )
89
- metadata .append (dict (filename = filename , caption = table .caption ))
94
+ metadata .append (dict (filename = filename , caption = table .caption , figure_id = table . figure_id ))
90
95
with open (outdir / "metadata.json" , "w" ) as f :
91
96
json .dump (metadata , f )
92
97
@@ -95,20 +100,34 @@ def deepclone(elem):
95
100
return BeautifulSoup (str (elem ), "lxml" )
96
101
97
102
103
def set_ids_by_labels(soup):
    """Copy label comments that follow captions onto nearby tables.

    For every element matching ``.caption``, the node immediately following
    it is inspected. When that node is an HTML comment whose text starts with
    ``tex4ht:label?:``, the remainder of the comment (whitespace-stripped) is
    stored in the ``data-figure-id`` attribute of every ``<table>`` found
    under the caption's parent element.

    The tree is modified in place; nothing is returned.
    """
    marker = "tex4ht:label?:"
    for caption_el in soup.select(".caption"):
        sibling = caption_el.next_sibling
        # Only a comment node directly after the caption can carry a label.
        if not isinstance(sibling, Comment):
            continue
        text = sibling.string
        if not text.startswith(marker):
            continue
        figure_id = text[len(marker):].strip()
        for table_el in caption_el.parent.select("table"):
            table_el["data-figure-id"] = figure_id
112
+
113
+
98
114
def extract_tables (filename , outdir ):
99
115
with open (filename , "rb" ) as f :
100
116
html = f .read ()
101
117
outdir = Path (outdir ) / Path (filename ).stem
102
118
outdir .mkdir (parents = True , exist_ok = True )
103
119
soup = BeautifulSoup (html , "lxml" )
104
120
flatten_tables (soup )
121
+ set_ids_by_labels (soup )
105
122
tables = soup .select ("div.tabular" )
106
123
107
124
data = []
108
125
for table in tables :
109
- if table .find ("table" ) is not None :
126
+ table_el = table .find ("table" )
127
+ if table_el is not None :
110
128
float_div = table .find_parent ("div" , class_ = "float" )
111
129
#print(table)
130
+ move_out_references (table )
112
131
escape_table_content (table )
113
132
#print(table)
114
133
tab = html2data (table )
@@ -123,8 +142,8 @@ def extract_tables(filename, outdir):
123
142
for t in float_div .find_all ("table" ):
124
143
t .extract ()
125
144
caption = float_div .get_text ()
126
-
127
- data .append (Tabular (tab , caption ))
145
+ figure_id = table_el . get ( "data-figure-id" )
146
+ data .append (Tabular (tab , caption , figure_id ))
128
147
129
148
save_tables (data , outdir )
130
149
0 commit comments