import re
- from bs4 import BeautifulSoup, Comment, Tag
+ from bs4 import BeautifulSoup, Comment, Tag, NavigableString
import codecs


def _handle_reference(el):
    if el.get('href', "").startswith("#"):
        r = str(el.get('href'))
        el.clear()  # to remove its content from the descendants iterator
-         return "xxref-" + r[1:]
+         return "xxref-" + _simplify_anchor(r[1:])
+
+
+ _anchor_like_classes = {
+     'ltx_appendix', 'ltx_bibliography', 'ltx_figure', 'ltx_float', 'ltx_graphics', 'ltx_note',
+     'ltx_paragraph', 'ltx_picture', 'ltx_section', 'ltx_subsection', 'ltx_subsubsection', 'ltx_theorem',
+     'ltx_title_section', 'ltx_title_subsection'
+ }
+
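+ # prepend a plain-text marker such as " xxanchor-S2SS5" as the first child of el,
+ # so the anchor survives plain-text extraction later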
+ def _insert_anchor(el, anchor_id, prefix="xxanchor"):
+     el.insert(0, NavigableString(f' {prefix}-{anchor_id} '))
+
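+ # walk the LaTeXML soup and insert textual anchor markers so that section, figure,
+ # table and bibliography ids survive in the extracted plain text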
+ def put_dummy_anchors(soup):
+     for elem in soup.select(
+             '.ltx_appendix, .ltx_bibliography, .ltx_bibitem, ' + \
+             '.ltx_figure, .ltx_float, ' + \
+             '.ltx_picture, .ltx_theorem'):
+         id_str = elem.get('id', '')
+         if id_str:
+             _insert_anchor(elem, _simplify_anchor(id_str))
+     for elem in soup.select('h2, h3, h4, h5, h6'):
+         sec = elem.find_parent("section")
+         if sec:
+             id_str = sec.get('id')
+             if id_str:
+                 _insert_anchor(elem, _simplify_anchor(id_str))
+     for elem in soup.select(".ltx_table"):
+         id_str = elem.get('id', "xxunk")
+         _insert_anchor(elem, _simplify_anchor(id_str), "xxtable-xxanchor")
+     for elem in soup.select(".ltx_tabular"):
+         elem.extract()
+
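+     # replace internal links with textual xxref markers pointing at the simplified target ids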
+     for elem in soup.select('a[href^="#"]'):
+         r = str(elem.get('href'))
+         elem.string = "xxref-" + _simplify_anchor(r[1:])
+
+     put_footnote_anchors(soup)
+
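+ # turn footnotes into plain-text markers: the in-text footnote mark becomes an xxref
+ # and the footnote body gets a matching xxanchor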
+ def put_footnote_anchors(soup):
+     for elem in soup.select('.ltx_note_content > .ltx_note_mark'):
+         elem.extract()
+
+     for elem in soup.select('.ltx_role_footnote > .ltx_note_mark'):
+         ft = elem.parent
+         id_str = ft.get('id')
+         elem.string = f" xxref-{_simplify_anchor(id_str)} "
+
+     for elem in soup.select('.ltx_note_content > .ltx_tag_note'):
+         ft = elem.find_parent(class_="ltx_role_footnote")
+         if ft:
+             id_str = ft.get('id')
+             elem.string = f" xxanchor-{_simplify_anchor(id_str)} "
+
+ # remove . from latexml ids (e.g., S2.SS5) so they can be searched for in elastic
+ # without ambiguity
+ def _simplify_anchor(s):
+     return s.replace('.', '')


def _handle_anchor(el):
-     if el.get('id', ""):
+     if el.name.lower() == 'a' and el.get('id', ""):
        id_str = el.get('id', "")
        el.clear()  # to remove its content from the descendants iterator
        return "xxanchor-" + id_str
+     # classes = get_classes(el)
+     # id_str = el.get('id')
+     # if 'ltx_title_section' in classes or 'ltx_title_subsection' in classes:
+     # print(el.get_text())
+     # print(el.name)
+     # if 'ltx_title_section' in classes or 'ltx_title_subsection' in classes:
+     # print(el.get_text())
+     # # this is workaround to deal with differences between
+     # # htlatex and latexml html structure
+     # # it would be better to make use of latexml structure
+     # sec = el.find_parent("section")
+     # if sec:
+     # id_str = sec.get('id')
+     # print(id_str, el.get_text())
+     #
+     # if id_str and classes:
+     # classes = set(classes)
+     # if classes.intersection(_anchor_like_classes):
+     # print('xxanchor-'+id_str)
+     # el.clear()  # to remove its content from the descendants iterator
+     # return "xxanchor-" + id_str


def _handle_table(el):
-     if el.name.lower() == 'table':
+     if 'ltx_table' in get_classes(el):
        id_str = el.get('id', "xxunk")
        el.clear()  # to remove its content from the descendants iterator
        return f"xxtable-xxanchor-" + id_str
@@ -32,26 +109,31 @@ def _handle_table(el):

def transform(el):
    if isinstance(el, Tag):
-         for f in _transforms_el:
-             r = f(el)
-             if r is not None:
-                 return transform(r)
+         # for f in _transforms_el:
+         # r = f(el)
+         # if r is not None:
+         # return transform(r)
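+         # the per-element transform pipeline above is disabled; fall back to BeautifulSoup's plain get_text()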
+         return el.get_text()
    elif not isinstance(el, Comment):
        return str(el)
    return ''


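+ # strip a leading "Abstract"/"abstract" marker from extracted text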
+ def clean_abstract(t):
+     return re.sub(r"^\s*[aA]bstract ?", "", t)
+
+
def get_text(*els):
-     t = " ".join([transform(t)
-                   for el in els for t in getattr(el, 'descendants', [el])])
-     t = re.sub("^[aA]bstract ?", "", t)
+     # t = " ".join([transform(t)
+     #               for el in els for t in getattr(el, 'descendants', [el])])
+     t = " ".join([transform(e) for e in els])
    t = re.sub("[ \n\xa0]+", " ", t)
    t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
    t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
    return t.strip()


- def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
+ def content_in_section(header, names=['h2', 'h3'], skip_comments=True):
    for el in header.next_siblings:
        if getattr(el, 'name', '') in names:
            break
@@ -60,26 +142,25 @@ def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
        yield el


- def get_class(el):
+ def get_classes(el):
    if hasattr(el, 'get'):
-         # fixme: less convoluted way to return '' if calss is not found
-         return (el.get('class', ['']) + [''])[0]
+         return el.get('class', [])
    else:
-         return ''
+         return []


def get_name(el):
    return hasattr(el, 'name') and el.name or ''


def _group_bibliography(el):
-     if get_class(el) == 'thebibliography':
-         return [get_text(i) for i in el.select('p.bibitem')]
+     if 'ltx_bibliography' in get_classes(el):
+         return [get_text(i) for i in el.select('li.ltx_bibitem')]
    return []


def _group_table(el):
-     if get_class(el) == 'table':
+     if 'ltx_table' in get_classes(el):
        return [get_text(el)]
    return []

@@ -92,7 +173,7 @@ def __init__(self):
    def collect(self, el):
        if get_name(el) == 'table':
            self.join_next_p = True
-         elif get_name(el) == "p":
+         elif 'ltx_para' in get_classes(el):
            if self.join_next_p:
                self.join_next_p = False
                self.els.append(el)
@@ -123,7 +204,7 @@ def reset(self):
]


- def group_content(elements):
+ def group_content2(elements):
    par_gruop = ParagraphGrouper()
    for el in elements:
        fragments = [frag for grouper in _group_el for frag in grouper(el)]
@@ -138,15 +219,100 @@ def group_content(elements):
            yield frag


- def set_ids_by_labels(soup):
-     captions = soup.select(".caption")
-     prefix = "tex4ht:label?:"
-     for caption in captions:
-         el = caption.next_sibling
-         if isinstance(el, Comment) and el.string.startswith(prefix):
-             label = el.string[len(prefix):].strip()
-             for table in caption.parent.select("table"):
-                 table["id"] = label
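+ # yield the content-bearing children of the document, descending into <section>
+ # elements and bibliography lists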
+ def walk(elem):
+     for el in elem.children:
+         classes = get_classes(el)
+         if el.name == 'section' or 'ltx_biblist' in classes:
+             yield from walk(el)
+         else:
+             yield el
+
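+ # groups document elements into (section_idx, subsection_idx, header, text) fragments,
+ # flushing the buffered text whenever a paragraph, figure or bibliography item is reached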
+ class Grouper:
+     def __init__(self):
+         self.out = []
+         self.section_idx = -1
+         self.subsection_idx = 0
+         self.header = ""
+         self.in_section = False  # move elements before the first section into that section
+         self.section_output = False  # if a section is empty and a new section begins, output it anyway to keep its header
+
+     def get_output_text(self):
+         return " ".join(self.out)
+
+     def flush(self):
+         if self.in_section:
+             r = max(self.section_idx, 0), self.subsection_idx, self.header, self.get_output_text()
+             self.out = []
+             self.section_output = True
+             self.subsection_idx += 1
+             yield r
+
+     def new_section(self, header_el):
+         if not self.section_output:  # output the (possibly) empty section so its header won't be lost
+             yield from self.flush()
+         self.section_output = False
+         self.in_section = True
+         self.section_idx += 1
+         self.subsection_idx = 0
+         self.header = get_text(header_el)
+
+     def append(self, el):
+         t = get_text(el).strip()
+         if t != "":
+             self.out.append(t)
+
+     def group_content(self, doc):
+         for el in walk(doc):
+             classes = get_classes(el)
+             if el.name in ["h2", "h3"]:
+                 yield from self.new_section(el)
+             elif el.name == "h1":
+                 continue
+             elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes:
+                 self.append(el)
+                 yield from self.flush()
+             else:
+                 self.append(el)
+
+
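+ # module-level entry point that delegates the grouping to Grouper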
+ def group_content(doc):
+     yield from Grouper().group_content(doc)
+
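+ # variant of the grouping logic without the Grouper class: buffers elements and only
+ # yields a fragment once at least one paragraph, figure or bibliography item was collected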
+ def group_content3(doc):
+     out = []
+     section_idx = -1
+     subsection_idx = 0
+     header = ""
+     has_paragraph = False
+     for el in walk(doc):
+         classes = get_classes(el)
+         if el.name in ["h2", "h3"]:
+             if len(out) and has_paragraph:
+                 yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
+                 out = []
+             section_idx += 1
+             subsection_idx = 0
+             header = get_text(el)
+             continue
+         elif 'ltx_title' in classes and el.name != "h1":
+             if len(out) and has_paragraph:
+                 yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
+                 out = []
+             out += [el]
+
+         elif 'ltx_title_document' in classes:
+             continue
+         elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes:
+             if len(out) and has_paragraph:
+                 yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
+                 subsection_idx += 1
+                 out = []
+             has_paragraph = True
+             out += [el]
+         else:
+             out.append(el)
+     if len(out):
+         yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))

def read_html(file):
    with codecs.open(file, 'r', encoding='UTF-8') as f: