@@ -1,19 +1,15 @@
import pandas as pd
import re
- import numpy as np
- import elasticsearch
- from bs4 import BeautifulSoup, Comment, Tag
- import codecs
- import textwrap
+ from bs4 import BeautifulSoup

- from datetime import datetime
- from elasticsearch_dsl import Document, Date, Nested, Boolean, Object, \
-     analyzer, InnerDoc, Completion, Keyword, Text, Integer, tokenizer, token_filter
+ from elasticsearch_dsl import Document, Boolean, Object, \
+     analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter

- from IPython.display import display, Markdown, Latex
+ from IPython.display import display, Markdown

from elasticsearch_dsl import connections

+ from sota_extractor2.data.doc_utils import get_text, content_in_section, group_content, set_ids_by_labels, read_html
from .. import config


@@ -26,153 +22,6 @@ def printmd(*args): # fixme: make it work without jupyter notebook
    display(Markdown(" ".join(map(str, args))))


- def _handle_reference(el):
-     if el.get('href', "").startswith("#"):
-         r = str(el.get('href'))
-         el.clear()  # to remove its content from the descendants iterator
-         return "xxref-" + r[1:]
-
-
- def _handle_anchor(el):
-     if el.get('id', ""):
-         id_str = el.get('id', "")
-         el.clear()  # to remove its content from the descendants iterator
-         return "xxanchor-" + id_str
-
-
- def _handle_table(el):
-     if el.name.lower() == 'table':
-         id_str = el.get('id', "xxunk")
-         el.clear()  # to remove its content from the descendants iterator
-         return "xxtable-xxanchor-" + id_str
-
-
- _transforms_el = [
-     _handle_reference,
-     _handle_table,
-     _handle_anchor,
- ]
-
-
- def transform(el):
-     if isinstance(el, Tag):
-         for f in _transforms_el:
-             r = f(el)
-             if r is not None:
-                 return transform(r)
-     elif not isinstance(el, Comment):
-         return str(el)
-     return ''
-
-
- def get_text(*els):
-     t = " ".join([transform(t)
-                   for el in els for t in getattr(el, 'descendants', [el])])
-     t = re.sub("^[aA]bstract ?", "", t)
-     t = re.sub("[ \n\xa0]+", " ", t)
-     t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
-     t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
-     return t.strip()
-
-
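- # e.g. get_text(BeautifulSoup('<a href="#tbl1">Table 1</a>', "html.parser"))
- # returns "xxref-tbl1"; references, anchors and tables become placeholder
- # tokens, every other node is rendered as its whitespace-normalised text.
-
-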
- def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
-     for el in header.next_siblings:
-         if getattr(el, 'name', '') in names:
-             break
-         if skip_comments and isinstance(el, Comment):
-             continue
-         yield el
-
-
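- # e.g. get_text(*content_in_section(soup.find('h3'))) collects one
- # section's text, stopping at the next h3/h4 heading.
-
-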
- def get_class(el):
-     if hasattr(el, 'get'):
-         # fixme: less convoluted way to return '' if class is not found
-         return (el.get('class', ['']) + [''])[0]
-     else:
-         return ''
-
-
- def get_name(el):
-     return hasattr(el, 'name') and el.name or ''
-
-
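- # get_class/get_name return '' for plain strings and comments, so callers
- # can dispatch on tags without explicit type checks.
-
-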
- def _group_bibliography(el):
-     if get_class(el) == 'thebibliography':
-         return [get_text(i) for i in el.select('p.bibitem')]
-     return []
-
-
- def _group_table(el):
-     if get_class(el) == 'table':
-         return [get_text(el)]
-     return []
-
-
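- # Each grouper either claims an element and returns its text fragments
- # (one per bibliography entry, one per whole table) or returns [].
-
-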
- class ParagraphGrouper:
-     def __init__(self):
-         self.els = []
-         self.join_next_p = False
-
-     def collect(self, el):
-         if get_name(el) == 'table':
-             self.join_next_p = True
-         elif get_name(el) == "p":
-             if self.join_next_p:
-                 self.join_next_p = False
-                 self.els.append(el)
-             else:
-                 return self.flush(new_els=[el])
-         else:
-             self.els.append(el)
-         return []
-
-     def flush(self, new_els=None):
-         text = get_text(*self.els)
-         if new_els is None:
-             new_els = []
-         if isinstance(new_els, Tag):  # allow for one tag to be passed
-             new_els = [new_els]
-         self.els = new_els
-         if text:
-             return [text]
-         return []
-
-     def reset(self):
-         self.els = []
-
-
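- # Gathers consecutive elements into one text fragment per paragraph; a <p>
- # directly following a table is joined to the current fragment, presumably
- # because a table can split a paragraph in two.
-
-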
- _group_el = [
-     _group_bibliography,
-     _group_table,
- ]
-
-
- def group_content(elements):
-     par_group = ParagraphGrouper()
-     for el in elements:
-         fragments = [frag for grouper in _group_el for frag in grouper(el)]
-         if fragments:
-             fragments = par_group.flush() + fragments
-         else:
-             fragments = par_group.collect(el)
-         for frag in fragments:
-             yield frag
-
-     for frag in par_group.flush():
-         yield frag
-
-
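- # Streams the document as plain-text fragments: bibliography entries and
- # tables stand alone, everything else is grouped by ParagraphGrouper.
-
-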
- def set_ids_by_labels(soup):
-     captions = soup.select(".caption")
-     prefix = "tex4ht:label?:"
-     for caption in captions:
-         el = caption.next_sibling
-         if isinstance(el, Comment) and el.string.startswith(prefix):
-             label = el.string[len(prefix):].strip()
-             for table in caption.parent.select("table"):
-                 table["id"] = label
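-
- # tex4ht emits the original LaTeX label in an HTML comment right after
- # each caption (e.g. <!--tex4ht:label?: tab:results-->); copying it onto
- # the <table> id lets tables be looked up by label.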
-
-
class Fragments(list):

    def get_toc(self):
@@ -335,9 +184,7 @@ def print_section(self, name, clean_up=lambda x: x):

    @classmethod
    def read_html(cls, file):
-         with codecs.open(file, 'r', encoding='UTF-8') as f:
-             text = f.read()
-         return BeautifulSoup(text, "html.parser")
+         return read_html(file)

    @classmethod
    def parse_paper(cls, file):