Skip to content

Commit 1df3227

Browse files
committed
Adapt text extractor to latexml
* parse styles
* parse footnotes
* add row and column contexts to structure dataset generation
* predict param cells
1 parent 50981c9 commit 1df3227

File tree

6 files changed

+308
-75
lines changed

6 files changed

+308
-75
lines changed

extract_tables.py

Lines changed: 56 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -166,19 +166,60 @@ def move_out_references(table):
166166
wrap_elem_content(anchor, f"<ref id='{fix_id(anchor['href'][1:])}'>", "</ref>")
167167

168168

169-
#def move_out_text_styles(table):
170-
# ltx_font = 'ltx_font_'
171-
# font_selector = f'[class*="{ltx_font}"]'
172-
#
173-
# for elem in table.select(f"span{font_selector}, a{font_selector}, em{font_selector}"):
174-
# for c in set(elem.attrs["class"]):
175-
# if c == ltx_font + 'bold':
176-
# wrap_elem_content(elem, "<b>", "</b>")
177-
# elif c == ltx_font + 'italic':
178-
# wrap_elem_content(elem, "<i>", "</i>")
179-
180-
181-
def move_out_styles(table):
169+
# Patterns used to recognise bold / italic text, either from an inline CSS
# ``style`` attribute or from a MathJax font class name.

# `font-weight` declaration whose value is bold, 700, 800 or 900.
bold_font_weight_re = re.compile(r"(^|;)\s*font-weight:\s*(bold|700|800|900)\s*(;|$)")
# MathJax class name ending in "-B" or "-BI".
bold_mathjax_font_re = re.compile(r"^MJXc-TeX-\w*-BI?$")
# `font-style: italic` declaration.
italic_font_style_re = re.compile(r"(^|;)\s*font-style:\s*italic\s*(;|$)")
# MathJax class name ending in "-I" or "-BI".
italic_mathjax_font_re = re.compile(r"^MJXc-TeX-\w*-B?I$")
173+
174+
def _has_font_class(classes, font_re):
175+
return any(font_re.match(cls) for cls in classes)
176+
177+
178+
font_color_re = re.compile(r"(^|;)\s*color:\s*(?P<color>#[0-9A-Fa-f]{3,6}|red|green|blue)\s*(;|$)")
179+
def _extract_color_from_style(style):
180+
m = font_color_re.search(style)
181+
if m:
182+
color = m["color"]
183+
if color[0] == "#":
184+
color = color[1:]
185+
if len(color) != 6:
186+
color = (color + color)[:6]
187+
r, g, b = int(color[0:2], 16), int(color[2:4], 16), int(color[4:6], 16)
188+
if r > 2 * g and r > 2 * b:
189+
color = "red"
190+
elif g > 2 * r and g > 2 * b:
191+
color = "green"
192+
elif b > 2 * r and b > 2 * g:
193+
color = "blue"
194+
else:
195+
return
196+
return color
197+
return
198+
199+
200+
def move_out_text_styles(table):
    """Wrap styled text inside ``table`` in explicit style tags.

    Three passes are made, in this order:

    1. bold   -> content wrapped in ``<bold>…</bold>``
    2. italic -> content wrapped in ``<italic>…</italic>``
    3. color  -> content wrapped in ``<red>``, ``<green>`` or ``<blue>``

    Bold and italic are detected from the latexml font class, an inline
    ``style`` declaration, or a MathJax font class name.
    """
    bold_selector = '.ltx_font_bold, [style*="font-weight"], [class*="MJXc-TeX-"]'
    for node in table.select(bold_selector):
        css_classes = node.get("class", [])
        inline_style = node.get("style", "")
        is_bold = (
            "ltx_font_bold" in css_classes
            or bold_font_weight_re.search(inline_style)
            or _has_font_class(css_classes, bold_mathjax_font_re)
        )
        if is_bold:
            wrap_elem_content(node, "<bold>", "</bold>")

    italic_selector = '.ltx_font_italic, [style*="font-style"], [class*="MJXc-TeX-"]'
    for node in table.select(italic_selector):
        css_classes = node.get("class", [])
        inline_style = node.get("style", "")
        is_italic = (
            "ltx_font_italic" in css_classes
            or italic_font_style_re.search(inline_style)
            or _has_font_class(css_classes, italic_mathjax_font_re)
        )
        if is_italic:
            wrap_elem_content(node, "<italic>", "</italic>")

    # The substring selector also hits e.g. background-color; the helper
    # filters those out by returning None.
    for node in table.select('[style*="color"]'):
        color_name = _extract_color_from_style(node.get("style"))
        if color_name:
            wrap_elem_content(node, f"<{color_name}>", f"</{color_name}>")
220+
221+
222+
def move_out_cell_styles(table):
182223
ltx_border = 'ltx_border_'
183224
ltx_align = 'ltx_align_'
184225
ltx_th = 'ltx_th'
@@ -312,7 +353,8 @@ def extract_tables(filename, outdir):
312353
continue
313354
remove_footnotes(table)
314355
move_out_references(table)
315-
move_out_styles(table)
356+
move_out_text_styles(table)
357+
move_out_cell_styles(table)
316358
escape_table_content(table)
317359
tab = html2data(table)
318360
if tab is None:

sota_extractor2/data/doc_utils.py

Lines changed: 196 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,100 @@
11
import re
2-
from bs4 import BeautifulSoup, Comment, Tag
2+
from bs4 import BeautifulSoup, Comment, Tag, NavigableString
33
import codecs
44

55
def _handle_reference(el):
    """Turn an in-document link into its ``xxref-<id>`` token.

    When the element's ``href`` starts with ``#``, the element is emptied and
    the token built from the simplified target id is returned. Otherwise
    ``None`` is returned and the element is left untouched.
    """
    if not el.get('href', "").startswith("#"):
        return None
    target = str(el.get('href'))
    el.clear()  # to remove its content from the descendants iterator
    return "xxref-" + _simplify_anchor(target[1:])
10+
11+
12+
# Set of latexml CSS class names.
# NOTE(review): no live code visible in this diff reads this set - confirm it is still needed.
_anchor_like_classes = {
    'ltx_appendix', 'ltx_bibliography', 'ltx_figure', 'ltx_float', 'ltx_graphics', 'ltx_note',
    'ltx_paragraph', 'ltx_picture', 'ltx_section', 'ltx_subsection', 'ltx_subsubsection', 'ltx_theorem',
    'ltx_title_section', 'ltx_title_subsection'
}
17+
18+
def _insert_anchor(el, anchor_id, prefix="xxanchor"):
    """Insert the text token ``<prefix>-<anchor_id>`` as the first child of ``el``.

    The token is padded with one space on each side.
    """
    token = NavigableString(f' {prefix}-{anchor_id} ')
    el.insert(0, token)
20+
21+
def put_dummy_anchors(soup):
    """Rewrite ``soup`` in place so that anchors and references become text tokens.

    Steps, in order:

    * elements of the listed latexml classes that carry an ``id`` get an
      ``xxanchor-<id>`` token prepended;
    * h2-h6 headings get the anchor of their enclosing ``<section>``;
    * ``.ltx_table`` elements get an ``xxtable-xxanchor-<id>`` token
      (``xxunk`` when the table has no id);
    * ``.ltx_tabular`` elements are removed from the tree;
    * the text of every ``<a href="#...">`` is replaced by ``xxref-<id>``;
    * footnotes are handled by ``put_footnote_anchors``.

    All ids go through ``_simplify_anchor``.
    """
    anchored = soup.select(
        '.ltx_appendix, .ltx_bibliography, .ltx_bibitem, '
        '.ltx_figure, .ltx_float, '
        '.ltx_picture, .ltx_theorem')
    for node in anchored:
        node_id = node.get('id', '')
        if node_id:
            _insert_anchor(node, _simplify_anchor(node_id))

    for heading in soup.select('h2, h3, h4, h5, h6'):
        section = heading.find_parent("section")
        if not section:
            continue
        section_id = section.get('id')
        if section_id:
            _insert_anchor(heading, _simplify_anchor(section_id))

    for table in soup.select(".ltx_table"):
        table_id = table.get('id', "xxunk")
        _insert_anchor(table, _simplify_anchor(table_id), "xxtable-xxanchor")

    for tabular in soup.select(".ltx_tabular"):
        tabular.extract()

    for link in soup.select('a[href^="#"]'):
        target = str(link.get('href'))
        link.string = "xxref-" + _simplify_anchor(target[1:])

    put_footnote_anchors(soup)
46+
47+
def put_footnote_anchors(soup):
    """Replace footnote marks in ``soup`` with reference / anchor tokens, in place.

    * Marks repeated inside the footnote content are removed.
    * The mark that is a direct child of a ``.ltx_role_footnote`` element
      becomes `` xxref-<id> ``.
    * The ``.ltx_tag_note`` inside the footnote content becomes
      `` xxanchor-<id> ``.

    ``<id>`` is the simplified id of the footnote element. Footnotes without
    an ``id`` are left unchanged instead of raising on the missing value.
    """
    for mark in soup.select('.ltx_note_content > .ltx_note_mark'):
        mark.extract()

    for mark in soup.select('.ltx_role_footnote > .ltx_note_mark'):
        id_str = mark.parent.get('id')
        if id_str:  # no id -> nothing to point at
            mark.string = f" xxref-{_simplify_anchor(id_str)} "

    for tag in soup.select('.ltx_note_content > .ltx_tag_note'):
        footnote = tag.find_parent(class_="ltx_role_footnote")
        if not footnote:
            continue
        id_str = footnote.get('id')
        if id_str:
            tag.string = f" xxanchor-{_simplify_anchor(id_str)} "
61+
62+
# remove . from latexml ids (f.e., S2.SS5) so they can be searched for in elastic
63+
# without disambiguations
64+
def _simplify_anchor(s):
65+
return s.replace('.', '')
1066

1167

1268
def _handle_anchor(el):
    """Turn an ``<a id="...">`` element into its ``xxanchor-<id>`` token.

    Only ``a`` elements with a non-empty ``id`` qualify: the element is
    emptied and the token is returned. Any other element yields ``None``.

    The id is passed through ``_simplify_anchor`` so the token matches the
    ``xxref-`` tokens, which are built from simplified ids as well.
    """
    if el.name.lower() != 'a':
        return None
    id_str = el.get('id', "")
    if not id_str:
        return None
    el.clear()  # to remove its content from the descendants iterator
    return "xxanchor-" + _simplify_anchor(id_str)
1794

1895

1996
def _handle_table(el):
20-
if el.name.lower() == 'table':
97+
if 'ltx_table' in get_classes(el):
2198
id_str = el.get('id', "xxunk")
2299
el.clear() # to remove it's content from the descendants iterator
23100
return f"xxtable-xxanchor-" + id_str
@@ -32,26 +109,31 @@ def _handle_table(el):
32109

33110
def transform(el):
    """Convert one parsed node to plain text.

    * ``Tag``     -> the text returned by its ``get_text()``
    * ``Comment`` -> empty string
    * anything else -> ``str(el)``
    """
    if isinstance(el, Tag):
        return el.get_text()
    if isinstance(el, Comment):
        return ''
    return str(el)
42120

43121

122+
def clean_abstract(t):
    """Strip a leading "Abstract" / "abstract" label from ``t``.

    Leading whitespace before the label and at most one space after it are
    removed as well; text that does not start with the label is returned
    unchanged.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"^\s*[aA]bstract ?", "", t)
124+
125+
44126
def get_text(*els):
    """Return the normalised text of ``els`` joined by single spaces.

    Each element is converted with ``transform``. Afterwards runs of spaces,
    newlines and non-breaking spaces collapse to one space, punctuation
    (``; , ( )``) adjacent to a ``#token`` is dropped, an immediately repeated
    ``#token`` is reduced to one, and the result is stripped.
    """
    text = " ".join(transform(e) for e in els)
    substitutions = (
        ("[ \n\xa0]+", " "),
        ("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 "),
        (r" (#[A-Za-z0-9]+) *\1 ", r" \1 "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
52134

53135

54-
def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
136+
def content_in_section(header, names=['h2', 'h3'], skip_comments=True):
55137
for el in header.next_siblings:
56138
if getattr(el, 'name', '') in names:
57139
break
@@ -60,26 +142,25 @@ def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
60142
yield el
61143

62144

63-
def get_classes(el):
    """Return the ``class`` entry of ``el``, or an empty list.

    Objects without a ``get`` method, and elements without a ``class``
    entry, both give ``[]``.
    """
    if not hasattr(el, 'get'):
        return []
    return el.get('class', [])
69150

70151

71152
def get_name(el):
    """Return ``el.name``, or ``''`` when the attribute is missing or falsy."""
    name = getattr(el, 'name', None)
    return name if name else ''
73154

74155

75156
def _group_bibliography(el):
    """Return one text fragment per ``li.ltx_bibitem`` of a bibliography element.

    Elements without the ``ltx_bibliography`` class give an empty list.
    """
    if 'ltx_bibliography' not in get_classes(el):
        return []
    return [get_text(item) for item in el.select('li.ltx_bibitem')]
79160

80161

81162
def _group_table(el):
    """Return the text of an ``ltx_table`` element as a one-item list.

    Elements without the ``ltx_table`` class give an empty list.
    """
    if 'ltx_table' not in get_classes(el):
        return []
    return [get_text(el)]
85166

@@ -92,7 +173,7 @@ def __init__(self):
92173
def collect(self, el):
93174
if get_name(el) == 'table':
94175
self.join_next_p = True
95-
elif get_name(el) == "p":
176+
elif 'ltx_para' in get_classes(el):
96177
if self.join_next_p:
97178
self.join_next_p = False
98179
self.els.append(el)
@@ -123,7 +204,7 @@ def reset(self):
123204
]
124205

125206

126-
def group_content(elements):
207+
def group_content2(elements):
127208
par_gruop = ParagraphGrouper()
128209
for el in elements:
129210
fragments = [frag for grouper in _group_el for frag in grouper(el)]
@@ -138,15 +219,100 @@ def group_content(elements):
138219
yield frag
139220

140221

141-
def set_ids_by_labels(soup):
142-
captions = soup.select(".caption")
143-
prefix = "tex4ht:label?:"
144-
for caption in captions:
145-
el = caption.next_sibling
146-
if isinstance(el, Comment) and el.string.startswith(prefix):
147-
label = el.string[len(prefix):].strip()
148-
for table in caption.parent.select("table"):
149-
table["id"] = label
222+
def walk(elem):
    """Yield the children of ``elem``, looking through container elements.

    ``<section>`` elements and elements with the ``ltx_biblist`` class are not
    yielded themselves; they are walked recursively and replaced by their own
    children, in document order.
    """
    for child in elem.children:
        child_classes = get_classes(child)
        is_container = child.name == 'section' or 'ltx_biblist' in child_classes
        if not is_container:
            yield child
            continue
        yield from walk(child)
229+
230+
class Grouper:
    """Group the elements of a document into text fragments.

    ``group_content`` yields tuples
    ``(section_idx, subsection_idx, header, text)``: ``section_idx`` counts
    h2/h3 headings (never below 0), ``subsection_idx`` counts fragments within
    the current section, ``header`` is the text of the current heading and
    ``text`` is the space-joined text buffered for the fragment.
    """

    def __init__(self):
        self.out = []  # texts buffered for the next fragment
        self.section_idx = -1  # -1 until the first h2/h3 is seen
        self.subsection_idx = 0  # index of the next fragment in this section
        self.header = ""  # text of the current section heading
        self.in_section = False  # move elements before first section into that section
        self.section_output = False  # if a section is empty and new section begins, output it to keep header

    def get_output_text(self):
        """Return the buffered texts joined by single spaces."""
        return " ".join(self.out)

    def flush(self):
        """Yield the buffered fragment (if inside a section) and reset the buffer.

        Before the first heading nothing is yielded and the buffer is kept, so
        that leading content ends up in the first section.
        """
        if self.in_section:
            r = max(self.section_idx, 0), self.subsection_idx, self.header, self.get_output_text()
            self.out = []
            self.section_output = True
            self.subsection_idx += 1
            yield r

    def new_section(self, header_el):
        """Start a new section whose heading is ``header_el``."""
        if not self.section_output:  # output (possibly) empty section so header won't be lost
            yield from self.flush()
        self.section_output = False
        self.in_section = True
        self.section_idx += 1
        self.subsection_idx = 0
        self.header = get_text(header_el)

    def append(self, el):
        """Buffer the text of ``el`` unless it is empty."""
        t = get_text(el).strip()
        if t != "":
            self.out.append(t)

    def group_content(self, doc):
        """Yield the fragments of ``doc`` (see the class docstring for their shape).

        h2/h3 start a new section, h1 is skipped, and every paragraph, figure
        or bibliography item closes a fragment. Any other element is buffered
        and emitted together with the next closing element.
        """
        for el in walk(doc):
            classes = get_classes(el)
            if el.name in ["h2", "h3"]:
                yield from self.new_section(el)
            elif el.name == "h1":
                continue
            elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes:
                self.append(el)
                yield from self.flush()
            else:
                self.append(el)

        # End of document: without this, elements buffered after the last
        # paragraph, a trailing header-only section, and every document
        # without any h2/h3 would be silently dropped.
        if self.out:
            self.in_section = True  # force output even if no heading was seen
            yield from self.flush()
        elif self.in_section and not self.section_output:
            yield from self.flush()
277+
278+
def group_content(doc):
    """Yield the fragments that a fresh ``Grouper`` produces for ``doc``."""
    grouper = Grouper()
    yield from grouper.group_content(doc)
280+
281+
def group_content3(doc):
    """Alternative grouping of ``doc`` into fragments.

    Yields tuples ``(section_idx, subsection_idx, header, text)`` where
    ``text`` is the space-joined ``get_text`` of the buffered elements.
    A buffered fragment is emitted only once at least one paragraph-like
    element (``ltx_para``, ``<figure>``, ``ltx_bibitem``) has been seen;
    whatever remains buffered at the end is emitted unconditionally.
    """
    pending = []
    section_no = -1
    fragment_no = 0
    heading = ""
    seen_paragraph = False

    def fragment():
        # Snapshot of the current state as an output tuple.
        text = " ".join([get_text(item) for item in pending])
        return (max(section_no, 0), fragment_no, heading, text)

    for node in walk(doc):
        classes = get_classes(node)
        tag = node.name
        ready = bool(pending) and seen_paragraph

        if tag in ("h2", "h3"):
            # New section: emit what is buffered, then reset the counters.
            if ready:
                yield fragment()
                pending = []
            section_no += 1
            fragment_no = 0
            heading = get_text(node)
        elif 'ltx_title' in classes and tag != "h1":
            # Other titles start a new buffer but keep the fragment index.
            if ready:
                yield fragment()
                pending = []
            pending.append(node)
        elif 'ltx_title_document' in classes:
            continue
        elif 'ltx_para' in classes or tag == "figure" or 'ltx_bibitem' in classes:
            if ready:
                yield fragment()
                fragment_no += 1
                pending = []
            seen_paragraph = True
            pending.append(node)
        else:
            pending.append(node)

    if pending:
        yield fragment()
150316

151317
def read_html(file):
152318
with codecs.open(file, 'r', encoding='UTF-8') as f:

0 commit comments

Comments
 (0)