9
9
import json
10
10
import re
11
11
from ast import literal_eval
12
+ from collections import OrderedDict
13
+ from dataclasses import dataclass
14
+ from typing import Set
12
15
13
16
from tabular import Tabular
14
17
15
18
19
# begin of dirty hack
# pandas parsing of html tables is really nice
# but it has a lot of defaults that can't be
# modified
#
# one of the defaults is forcing thead rows
# into column names, ignoring value of `header`
# param
#
# the second issue is parsing numerical-looking
# values into floats
#
# NOTE: this replaces the private `pd.io.html._data_to_frame` hook at import
# time, so it affects every later `pd.read_html` call in the process.
_old_data_to_frame = pd.io.html._data_to_frame


def _new_data_to_frame(**kwargs):
    """Forward to the original `_data_to_frame` with head/foot rows merged into the body.

    The `data` keyword is expected to be a `(head, body, foot)` triple of row
    lists.  Head rows are prepended and foot rows appended to the body, and the
    original function receives `(None, rows, None)`, so it never sees a
    separate head section.  All remaining keyword arguments pass through.
    """
    head, body, foot = kwargs.pop("data")
    if head:
        body = head + body
    if foot:
        # Build a new list instead of `body += foot`, which would extend the
        # caller's list in place whenever `head` is empty.
        body = body + foot
    return _old_data_to_frame(data=(None, body, None), **kwargs)


pd.io.html._data_to_frame = _new_data_to_frame
# end of dirty hack
41
+
42
+
43
+
16
44
def flatten_tables(soup):
    """Turn every `.ltx_tabular` nested inside another `.ltx_tabular` into plain <div>s.

    The nested element itself and all of its table-structure descendants
    (rows, cells, sections, column definitions) are renamed to `div` in place.
    """
    structural = "tr, td, th, colgroup, tbody, thead, tfoot, col"
    for nested in soup.select(".ltx_tabular .ltx_tabular"):
        nested.name = 'div'
        for part in nested.select(structural):
            part.name = 'div'
22
50
23
51
@@ -28,50 +56,146 @@ def escape(s):
28
56
def unescape(r):
    """Evaluate the Python literal written in `r` and return its value.

    Uses `ast.literal_eval`, so only literals are accepted; anything else
    raises (`ValueError` / `SyntaxError`).
    """
    value = literal_eval(r)
    return value
30
58
31
-
32
- multirow_re = re .compile (r"^\s*rows=(P<rows>\d+)\s*$" )
33
59
# Matches any whitespace run that contains a line break, or any run of two or
# more whitespace characters.  A lone space or tab is left untouched.
# The line-break alternative also swallows the surrounding whitespace, so
# "a\n b" collapses to "a b" rather than "a  b".
whitespace_re = re.compile(r'\s*[\r\n]\s*|\s{2,}')


def clear_ws(s):
    """Return `s` stripped, with each matched whitespace run replaced by one space."""
    return whitespace_re.sub(" ", s.strip())
63
+
35
64
def escape_table_content(soup):
    """Replace the content of every <td>/<th> with the escaped form of its text.

    The cell text is whitespace-normalised with `clear_ws` before being passed
    to `escape`; assigning `.string` drops any child markup the cell had.
    """
    for cell in soup.find_all(["td", "th"]):
        text = clear_ws(cell.get_text())
        cell.string = escape(text)
46
68
47
-
48
- def fix_htlatex_multirow (df ):
49
- rows , cols = df .shape
50
-
51
- for col in range (cols ):
52
- for row in range (rows ):
53
- cell = df .iloc [row , col ]
54
- if cell .startswith ("multirow=" ):
55
- pos = cell .find (';' )
56
- multirows = int (cell [9 :pos ])
57
- assert df .iloc [row + 1 : row + multirows , col ].isna ().all ()
58
- df .iloc [row : row + multirows , col ] = cell [pos + 1 :]
59
-
60
-
61
69
def unescape_table_content(df):
    """Return a new DataFrame with `unescape` applied to every cell of `df`."""
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use the new name when the installed pandas has it.
    # The lookup goes through the class so a column called "map" cannot
    # shadow the method.
    elementwise = getattr(type(df), "map", None) or type(df).applymap
    return elementwise(df, unescape)
63
71
64
72
73
@dataclass
class LayoutCell:
    """Layout description of a single table cell.

    `str(cell)` renders the description as space-separated tokens:
    `border-*`, then `align-*`, then `span-*`, then `header` when set.
    """
    borders: Set[str]  # border codes, e.g. "l", "rr", "t" (suffixes of ltx_border_* classes)
    align: Set[str]  # alignment codes (suffixes of ltx_align_* classes)
    header: bool  # True when the cell carried the ltx_th class
    colspan: int  # value of the cell's colspan attribute (1 when absent)
    rowspan: int  # value of the cell's rowspan attribute (1 when absent)
    span: Set[str]  # span position markers such as "cb", "ci", "ce", "rb", "ri", "re"

    def __str__(self):
        # Sets have no stable iteration order for strings across interpreter
        # runs, so sort each group to keep the rendered layout reproducible.
        borders = ['border-' + x for x in sorted(self.borders)]
        align = ['align-' + x for x in sorted(self.align)]
        span = ['span-' + x for x in sorted(self.span)]
        header = ["header"] if self.header else []
        return ' '.join(borders + align + span + header)
88
+
89
def to_layout(s):
    """Parse a "borders,align,header,colspan,rowspan" prefix into a LayoutCell.

    `borders` and `align` are whitespace-separated code lists, `header` is the
    text "True" or anything else, and both spans are integers.  An empty
    string yields a default cell (no borders, no alignment, not a header,
    spans of 1).  The `span` set always starts empty.
    """
    if s == "":
        return LayoutCell(set(), set(), False, 1, 1, set())

    # Exactly five comma-separated fields are required; anything else raises
    # ValueError on unpacking.
    borders, align, header, colspan, rowspan = s.split(",")
    return LayoutCell(
        set(borders.split()),
        set(align.split()),
        header == "True",
        int(colspan),
        int(rowspan),
        set(),
    )
99
+
100
+
101
def _mark_spans(cells, span_attr, tags, leading, trailing):
    """Tag each cell of one row or column with its position inside a span.

    `cells` is walked in order.  A countdown is loaded from `span_attr`
    ("colspan" or "rowspan") whenever it reaches zero.  Cells whose span is
    greater than 1 are then marked:

    * countdown equals the full span -> `tags[0]` (begin), `trailing` borders removed
    * countdown equals 1             -> `tags[2]` (end), `leading` borders removed
    * anything in between            -> `tags[1]` (inner), both border sets removed

    Cells are modified in place.
    """
    begin_tag, inner_tag, end_tag = tags
    remaining = 1
    for cell in cells:
        total = getattr(cell, span_attr)
        remaining -= 1
        if remaining == 0:
            remaining = total
        if total <= 1:
            continue
        if remaining == 1:
            cell.span.add(end_tag)
            cell.borders -= leading
        elif remaining == total:
            cell.span.add(begin_tag)
            cell.borders -= trailing
        else:
            cell.span.add(inner_tag)
            cell.borders -= leading | trailing


def fix_layout(layout):
    """Mark span positions on every cell of `layout` and drop span-internal borders.

    `layout` is a DataFrame of cell objects exposing `colspan`, `rowspan`,
    `span` and `borders`.  Rows are scanned for column spans ("cb"/"ci"/"ce")
    and columns for row spans ("rb"/"ri"/"re").  This presumably relies on a
    spanning cell being repeated in every grid position it covers; confirm
    against how the table was parsed.  Cells are modified in place; nothing is
    returned.
    """
    for _, row in layout.iterrows():
        _mark_spans(row, "colspan", ("cb", "ci", "ce"), {"l", "ll"}, {"r", "rr"})
    for col in layout:
        _mark_spans(layout[col], "rowspan", ("rb", "ri", "re"), {"t", "tt"}, {"b", "bb"})
135
+
136
+
137
def decouple_layout(df):
    """Split every cell of `df` into its content and its layout description.

    Each non-empty cell is expected to look like "<layout>;<content>" and is
    split at the first ";".  Empty cells give an empty content string and a
    default layout.

    Returns:
        `(tab, layout)`: a DataFrame of content strings and a DataFrame of
        `LayoutCell` objects (already passed through `fix_layout`).
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use the new name when the installed pandas has it.
    elementwise = getattr(type(df), "map", None) or type(df).applymap

    split = elementwise(df, lambda x: ("", "") if x == "" else x.split(";", 1))
    tab = elementwise(split, lambda x: x[1])
    layout = elementwise(split, lambda x: to_layout(x[0]))
    fix_layout(layout)
    return tab, layout
143
+
144
+
65
145
def fix_table(df):
    """Clean a parsed table and split it into content and layout frames.

    Missing cells and cells holding the escaped empty string ("''") are
    treated alike: rows and columns consisting only of them are dropped, and
    the remaining ones become "''".  The cells are then unescaped and handed to
    `decouple_layout`, whose `(tab, layout)` result is returned.
    """
    df = df.fillna(repr(''))
    # `np.nan` rather than the `np.NaN` alias, which NumPy 2.0 removed.
    df = df.replace("''", np.nan).dropna(how='all').dropna(axis='columns', how='all').fillna("''")
    df = unescape_table_content(df)
    return decouple_layout(df)
150
+
151
+
152
def fix_id(s):
    """Return `s` with every "." replaced by "-"."""
    pieces = s.split(".")
    return "-".join(pieces)
154
+
155
+
156
def wrap_elem_content(elem, begin, end):
    """Surround the children of `elem` with the text nodes `begin` and `end`.

    `begin` becomes the first child and `end` the last; `elem` is modified in
    place.
    """
    opening = NavigableString(begin)
    closing = NavigableString(end)
    elem.insert(0, opening)
    elem.append(closing)
70
159
71
160
72
161
def move_out_references(table):
    """Wrap the content of every in-page link in textual <ref id='...'> markers.

    For each anchor whose href starts with "#", the fragment (href without the
    leading "#", dots replaced via `fix_id`) becomes the id of a literal
    "<ref id='...'>" prefix, and "</ref>" is appended after the content.
    """
    for anchor in table.select('a[href^="#"]'):
        target = fix_id(anchor['href'][1:])
        wrap_elem_content(anchor, f"<ref id='{target}'>", "</ref>")
164
+
165
+
166
+ #def move_out_text_styles(table):
167
+ # ltx_font = 'ltx_font_'
168
+ # font_selector = f'[class*="{ltx_font}"]'
169
+ #
170
+ # for elem in table.select(f"span{font_selector}, a{font_selector}, em{font_selector}"):
171
+ # for c in set(elem.attrs["class"]):
172
+ # if c == ltx_font + 'bold':
173
+ # wrap_elem_content(elem, "<b>", "</b>")
174
+ # elif c == ltx_font + 'italic':
175
+ # wrap_elem_content(elem, "<i>", "</i>")
176
+
177
+
178
def move_out_styles(table):
    """Prefix each cell's content with a textual description of its layout.

    For every <td>/<th> the prefix "borders,align,header,colspan,rowspan;" is
    inserted before the existing content, where

    * borders / align: space-separated suffixes of the cell's
      `ltx_border_*` / `ltx_align_*` classes,
    * header: "True" when the cell has the `ltx_th` class, else "False",
    * colspan / rowspan: the corresponding attributes, "1" when absent.
    """
    ltx_border = 'ltx_border_'
    ltx_align = 'ltx_align_'
    ltx_th = 'ltx_th'

    for elem in table.select('td, th'):
        borders = []
        align = []
        header = False
        # A cell without any class attribute simply has no styling; indexing
        # attrs["class"] directly would raise KeyError for it.
        for c in elem.attrs.get("class", []):
            if c.startswith(ltx_border):
                borders.append(c[len(ltx_border):])
            elif c.startswith(ltx_align):
                align.append(c[len(ltx_align):])
            elif c == ltx_th:
                header = True
        b = ' '.join(borders)
        a = ' '.join(align)
        colspan = elem.attrs.get("colspan", "1")
        rowspan = elem.attrs.get("rowspan", "1")
        wrap_elem_content(elem, f"{b},{a},{header},{colspan},{rowspan};", "")
75
199
76
200
77
201
def html2data (table ):
@@ -90,60 +214,88 @@ def save_tables(data, outdir):
90
214
91
215
for num , table in enumerate (data , 1 ):
92
216
filename = f"table_{ num :02} .csv"
217
+ layout = f"layout_{ num :02} .csv"
93
218
save_table (table .data , outdir / filename )
94
- metadata .append (dict (filename = filename , caption = table .caption , figure_id = table .figure_id ))
219
+ save_table (table .layout , outdir / layout )
220
+ metadata .append (dict (filename = filename , layout = layout , caption = table .caption , figure_id = table .figure_id ))
95
221
with open (outdir / "metadata.json" , "w" ) as f :
96
222
json .dump (metadata , f )
97
223
98
224
99
- def deepclone (elem ):
100
- return BeautifulSoup (str (elem ), "lxml" )
101
-
102
-
103
225
def set_ids_by_labels(soup):
    """Copy each captioned figure's id onto the tables it contains.

    For every `.ltx_caption` whose parent is a <figure> with an `id`, each
    `.ltx_tabular` inside that figure gets a `data-figure-id` attribute holding
    the id.
    """
    for caption in soup.select(".ltx_caption"):
        figure = caption.parent
        if figure.name != "figure" or not figure.has_attr("id"):
            continue
        figure_id = figure.attrs["id"]
        for tabular in figure.select(".ltx_tabular"):
            tabular["data-figure-id"] = figure_id
112
233
234
def is_figure(tag):
    """Return True when `tag` is a <figure> element (judged by tag name only)."""
    # Class-based alternative kept for reference:
    #    classes = tag.attrs.get("class", [])
    #    return "ltx_figure" in classes or "ltx_float" in classes
    tag_name = tag.name
    return tag_name == "figure"
238
+
239
def fix_span_tables(soup):
    """Rename <span> elements that carry table classes to real table tags.

    A span with one of the `ltx_*` classes below is renamed to the matching
    HTML tag.  When several classes are present, the first one in the mapping
    order wins.
    """
    tag_by_class = OrderedDict([
        ("ltx_tabular", "table"),
        ("ltx_tr", "tr"),
        ("ltx_th", "th"),
        ("ltx_tbody", "tbody"),
        ("ltx_thead", "thead"),
        ("ltx_td", "td"),
        ("ltx_tfoot", "tfoot"),
    ])

    selector = ','.join("span." + cls for cls in tag_by_class)
    for span in soup.select(selector):
        present = span.attrs["class"]
        replacement = next(
            (tag for cls, tag in tag_by_class.items() if cls in present),
            None,
        )
        if replacement is not None:
            span.name = replacement
250
+
251
# pandas.read_html treats th differently
# by trying in a few places to get column names
# for now <th>s are changed to <td>s, but we still
# have classes (ltx_th) to distinguish them
def fix_th(soup):
    """Rename every <th> in `soup` to <td>, in place."""
    header_cells = soup.find_all("th")
    for cell in header_cells:
        cell.name = "td"
258
+
259
def remove_footnotes(soup):
    """Detach every `.ltx_role_footnote` element from `soup`, in place."""
    footnotes = soup.select(".ltx_role_footnote")
    for note in footnotes:
        note.extract()
262
+
113
263
114
264
def extract_tables(filename, outdir):
    """Extract all tables from the HTML file `filename` and save them under `outdir`.

    The document is parsed as UTF-8 with the lxml parser, then normalised
    (figure ids copied to tables, span-based tables renamed, <th> -> <td>,
    nested tables flattened).  Every `table.ltx_tabular` outside an
    `.ltx_authors` block is converted into a content frame plus a layout frame
    and collected as a `Tabular`, together with its caption and figure id when
    available.  `outdir` is created if needed and the results are written via
    `save_tables`.
    """
    html = Path(filename).read_bytes()
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
    set_ids_by_labels(soup)
    fix_span_tables(soup)
    fix_th(soup)
    flatten_tables(soup)

    extracted = []
    for table in soup.find_all("table", class_="ltx_tabular"):
        # Tables inside the authors block are skipped.
        if table.find_parent(class_="ltx_authors") is not None:
            continue

        # Look up the enclosing figure before the table is rewritten below.
        figure = table.find_parent(is_figure)
        remove_footnotes(table)
        move_out_references(table)
        move_out_styles(table)
        escape_table_content(table)
        frame = html2data(table)
        if frame is None:
            continue

        frame, layout = fix_table(frame)

        cap_el = figure.find("figcaption") if figure is not None else None
        caption = clear_ws(cap_el.get_text()) if cap_el is not None else None
        figure_id = table.get("data-figure-id")
        extracted.append(Tabular(frame, layout, caption, figure_id))

    save_tables(extracted, outdir)
149
301
0 commit comments