Skip to content

Commit c68d12f

Browse files
committed
Separate html parsing from elastic
1 parent 3be7094 commit c68d12f

File tree

3 files changed

+162
-161
lines changed

3 files changed

+162
-161
lines changed

normalize_references.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import ahocorasick
66
import pickle
77
from multiprocessing import Pool
8-
from sota_extractor2.data.elastic import get_text, Paper
8+
from sota_extractor2.data.doc_utils import get_text, read_html
99

1010
punctuation_table = str.maketrans('', '', string.punctuation)
1111

@@ -62,7 +62,7 @@ def save_html(path, html):
6262
def resolve_references_in_html(args):
6363
file, output = args
6464
output.parent.mkdir(exist_ok=True, parents=True)
65-
html = Paper.read_html(f)
65+
html = read_html(file)
6666
bibitems = get_bibitems(html)
6767
mapping = resolve_references(reference_trie, bibitems)
6868
update_references(html, mapping)

sota_extractor2/data/doc_utils.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import re
2+
from bs4 import BeautifulSoup, Comment, Tag
3+
import codecs
4+
5+
def _handle_reference(el):
6+
if el.get('href', "").startswith("#"):
7+
r = str(el.get('href'))
8+
el.clear() # to remove it's content from the descendants iterator
9+
return "xxref-" + r[1:]
10+
11+
12+
def _handle_anchor(el):
13+
if el.get('id', ""):
14+
id_str = el.get('id', "")
15+
el.clear() # to remove it's content from the descendants iterator
16+
return "xxanchor-" + id_str
17+
18+
19+
def _handle_table(el):
20+
if el.name.lower() == 'table':
21+
id_str = el.get('id', "xxunk")
22+
el.clear() # to remove it's content from the descendants iterator
23+
return f"xxtable-xxanchor-" + id_str
24+
25+
26+
# Element transforms tried in order by transform(); the first handler that
# returns a non-None token wins, so references are recognised before tables,
# and tables before plain anchors.
_transforms_el = [
    _handle_reference,
    _handle_table,
    _handle_anchor,
]
31+
32+
33+
def transform(el):
    """Convert one soup node into its text representation.

    Tags are run through the ``_transforms_el`` handlers; the first non-None
    token is itself re-transformed (tokens are plain strings, so they come
    back via ``str``).  Comments produce '', any other node is stringified.
    """
    if not isinstance(el, Tag):
        return '' if isinstance(el, Comment) else str(el)
    for handler in _transforms_el:
        token = handler(el)
        if token is not None:
            return transform(token)
    return ''
42+
43+
44+
def get_text(*els):
    """Flatten one or more soup elements into normalised plain text.

    Every descendant node is passed through transform(); plain values
    without a ``descendants`` attribute are used as-is (getattr fallback).
    The joined text is then cleaned up in an order-sensitive regex pipeline.
    """
    t = " ".join([transform(t)
                  for el in els for t in getattr(el, 'descendants', [el])])
    # drop a leading "Abstract"/"abstract" heading left over from parsing
    t = re.sub("^[aA]bstract ?", "", t)
    # collapse runs of spaces, newlines and non-breaking spaces
    t = re.sub("[ \n\xa0]+", " ", t)
    # strip punctuation hugging "#..."-style tokens (presumably citation
    # markers produced upstream — confirm against callers)
    t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
    # de-duplicate an immediately repeated "#..." token
    t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
    return t.strip()
52+
53+
54+
def content_in_section(header, names=('h3', 'h4'), skip_comments=True):
    """Yield the siblings following *header* up to the next section header.

    Iteration stops (exclusively) at the first sibling whose tag name is in
    *names*.  Comment nodes are skipped unless *skip_comments* is False.

    The default for *names* is a tuple rather than the original list: a
    mutable default argument is shared between calls (classic pitfall);
    membership testing works identically on a tuple.
    """
    for el in header.next_siblings:
        if getattr(el, 'name', '') in names:
            break
        if skip_comments and isinstance(el, Comment):
            continue
        yield el
61+
62+
63+
def get_class(el):
    """Return the first CSS class of *el*, or '' when it has none.

    Nodes without a ``get`` method (plain strings, comments) also yield ''.
    Resolves the old fixme: bs4 stores ``class`` as a list of names which may
    be missing (None) or empty, so fall back to [''] before indexing instead
    of the previous ``(get(...) + [''])[0]`` trick.
    """
    if not hasattr(el, 'get'):
        return ''
    classes = el.get('class') or ['']
    return classes[0]
69+
70+
71+
def get_name(el):
    """Return ``el.name``, or '' when the attribute is absent or falsy.

    Replaces the old ``hasattr(el, 'name') and el.name or ''`` — the
    ``and/or`` conditional idiom is a known pitfall; ``getattr`` with a
    default plus ``or ''`` is equivalent here (a None name, e.g. on text
    nodes, still maps to '') and far clearer.
    """
    return getattr(el, 'name', '') or ''
73+
74+
75+
def _group_bibliography(el):
    """Return bibliography entries of *el* as a list of text fragments.

    Only elements with class 'thebibliography' produce output; each
    ``p.bibitem`` child becomes one fragment.  Anything else yields [].
    """
    if get_class(el) != 'thebibliography':
        return []
    return [get_text(item) for item in el.select('p.bibitem')]
79+
80+
81+
def _group_table(el):
    """Return *el*'s text as a one-fragment list when it has class 'table',
    otherwise an empty list."""
    return [get_text(el)] if get_class(el) == 'table' else []
85+
86+
87+
class ParagraphGrouper:
    """Accumulate consecutive soup elements into paragraph-sized text chunks.

    Elements are fed through collect().  A ``<p>`` normally closes the
    current chunk and starts a new one, except directly after a ``<table>``,
    where the paragraph is merged into the current chunk instead.
    """

    def __init__(self):
        self.els = []             # elements of the chunk currently being built
        self.join_next_p = False  # set after a table: merge the next <p>

    def collect(self, el):
        """Feed one element; return a list of finished chunks (often empty)."""
        if get_name(el) == 'table':
            # the table element itself is not buffered (tables are grouped
            # separately via _group_table); only remember to merge the
            # paragraph that follows it
            self.join_next_p = True
        elif get_name(el) == "p":
            if self.join_next_p:
                self.join_next_p = False
                self.els.append(el)
            else:
                # a fresh paragraph closes the current chunk
                return self.flush(new_els=[el])
        else:
            self.els.append(el)
        return []

    def flush(self, new_els=None):
        """Emit the buffered chunk's text (if any) and restart with *new_els*."""
        text = get_text(*self.els)
        if new_els is None:
            new_els = []
        if isinstance(new_els, Tag):  # allow for one tag to be passed
            new_els = [new_els]
        self.els = new_els
        if text:
            return [text]
        return []

    def reset(self):
        """Drop all buffered state.

        Bug fix: previously ``join_next_p`` survived reset(), so a table seen
        just before reset() could wrongly merge an unrelated paragraph
        afterwards.
        """
        self.els = []
        self.join_next_p = False
118+
119+
120+
# Grouping rules tried for every element in group_content(); each matching
# rule contributes stand-alone text fragments for that element.
_group_el = [
    _group_bibliography,
    _group_table,
]
124+
125+
126+
def group_content(elements):
    """Yield text fragments for *elements*, grouping paragraphs together.

    Bibliographies and tables (matched by the ``_group_el`` rules) are
    emitted as their own fragments, flushing any pending paragraph first;
    everything else is accumulated by a ParagraphGrouper and emitted in
    paragraph-sized chunks.
    """
    grouper = ParagraphGrouper()
    for el in elements:
        special = [frag for rule in _group_el for frag in rule(el)]
        if special:
            # close the paragraph in progress before the special fragments
            yield from grouper.flush()
            yield from special
        else:
            yield from grouper.collect(el)

    # emit whatever is still buffered at the end
    yield from grouper.flush()
139+
140+
141+
def set_ids_by_labels(soup):
    """Copy tex4ht label comments onto their tables as ``id`` attributes.

    tex4ht leaves a comment of the form ``tex4ht:label?: <label>`` directly
    after a caption; every ``<table>`` under that caption's parent gets the
    label as its ``id`` so it can be referenced later.
    """
    prefix = "tex4ht:label?:"
    for caption in soup.select(".caption"):
        sibling = caption.next_sibling
        if not isinstance(sibling, Comment):
            continue
        if not sibling.string.startswith(prefix):
            continue
        label = sibling.string[len(prefix):].strip()
        for table in caption.parent.select("table"):
            table["id"] = label
150+
151+
def read_html(file):
    """Read *file* as UTF-8 text and parse it with BeautifulSoup.

    Uses the stdlib ``html.parser`` backend so no extra parser dependency
    is required.
    """
    # codecs.open is a Python-2 legacy; the built-in open has handled
    # encodings since Python 3 and also gives universal-newline handling
    with open(file, 'r', encoding='UTF-8') as f:
        text = f.read()
    return BeautifulSoup(text, "html.parser")

sota_extractor2/data/elastic.py

Lines changed: 6 additions & 159 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
11
import pandas as pd
22
import re
3-
import numpy as np
4-
import elasticsearch
5-
from bs4 import BeautifulSoup, Comment, Tag
6-
import codecs
7-
import textwrap
3+
from bs4 import BeautifulSoup
84

9-
from datetime import datetime
10-
from elasticsearch_dsl import Document, Date, Nested, Boolean, Object, \
11-
analyzer, InnerDoc, Completion, Keyword, Text, Integer, tokenizer, token_filter
5+
from elasticsearch_dsl import Document, Boolean, Object, \
6+
analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
127

13-
from IPython.display import display, Markdown, Latex
8+
from IPython.display import display, Markdown
149

1510
from elasticsearch_dsl import connections
1611

12+
from sota_extractor2.data.doc_utils import get_text, content_in_section, group_content, set_ids_by_labels, read_html
1713
from .. import config
1814

1915

@@ -26,153 +22,6 @@ def printmd(*args): # fixme: make it work without jupyter notebook
2622
display(Markdown(" ".join(map(str, args))))
2723

2824

29-
def _handle_reference(el):
30-
if el.get('href', "").startswith("#"):
31-
r = str(el.get('href'))
32-
el.clear() # to remove it's content from the descendants iterator
33-
return "xxref-" + r[1:]
34-
35-
36-
def _handle_anchor(el):
37-
if el.get('id', ""):
38-
id_str = el.get('id', "")
39-
el.clear() # to remove it's content from the descendants iterator
40-
return "xxanchor-" + id_str
41-
42-
43-
def _handle_table(el):
44-
if el.name.lower() == 'table':
45-
id_str = el.get('id', "xxunk")
46-
el.clear() # to remove it's content from the descendants iterator
47-
return f"xxtable-xxanchor-" + id_str
48-
49-
50-
_transforms_el = [
51-
_handle_reference,
52-
_handle_table,
53-
_handle_anchor,
54-
]
55-
56-
57-
def transform(el):
58-
if isinstance(el, Tag):
59-
for f in _transforms_el:
60-
r = f(el)
61-
if r is not None:
62-
return transform(r)
63-
elif not isinstance(el, Comment):
64-
return str(el)
65-
return ''
66-
67-
68-
def get_text(*els):
69-
t = " ".join([transform(t)
70-
for el in els for t in getattr(el, 'descendants', [el])])
71-
t = re.sub("^[aA]bstract ?", "", t)
72-
t = re.sub("[ \n\xa0]+", " ", t)
73-
t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
74-
t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
75-
return t.strip()
76-
77-
78-
def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
79-
for el in header.next_siblings:
80-
if getattr(el, 'name', '') in names:
81-
break
82-
if skip_comments and isinstance(el, Comment):
83-
continue
84-
yield el
85-
86-
87-
def get_class(el):
88-
if hasattr(el, 'get'):
89-
# fixme: less convoluted way to return '' if calss is not found
90-
return (el.get('class', [''])+[''])[0]
91-
else:
92-
return ''
93-
94-
95-
def get_name(el):
96-
return hasattr(el, 'name') and el.name or ''
97-
98-
99-
def _group_bibliography(el):
100-
if get_class(el) == 'thebibliography':
101-
return [get_text(i) for i in el.select('p.bibitem')]
102-
return []
103-
104-
105-
def _group_table(el):
106-
if get_class(el) == 'table':
107-
return [get_text(el)]
108-
return []
109-
110-
111-
class ParagraphGrouper:
112-
def __init__(self):
113-
self.els = []
114-
self.join_next_p = False
115-
116-
def collect(self, el):
117-
if get_name(el) == 'table':
118-
self.join_next_p = True
119-
elif get_name(el) == "p":
120-
if self.join_next_p:
121-
self.join_next_p = False
122-
self.els.append(el)
123-
else:
124-
return self.flush(new_els=[el])
125-
else:
126-
self.els.append(el)
127-
return []
128-
129-
def flush(self, new_els=None):
130-
text = get_text(*self.els)
131-
if new_els is None:
132-
new_els = []
133-
if isinstance(new_els, Tag): # allow for one tag to be passed
134-
new_els = [new_els]
135-
self.els = new_els
136-
if text:
137-
return [text]
138-
return []
139-
140-
def reset(self):
141-
self.els = []
142-
143-
144-
_group_el = [
145-
_group_bibliography,
146-
_group_table,
147-
]
148-
149-
150-
def group_content(elements):
151-
par_gruop = ParagraphGrouper()
152-
for el in elements:
153-
fragments = [frag for grouper in _group_el for frag in grouper(el)]
154-
if fragments:
155-
fragments = par_gruop.flush() + fragments
156-
else:
157-
fragments = par_gruop.collect(el)
158-
for frag in fragments:
159-
yield frag
160-
161-
for frag in par_gruop.flush():
162-
yield frag
163-
164-
165-
def set_ids_by_labels(soup):
166-
captions = soup.select(".caption")
167-
prefix = "tex4ht:label?:"
168-
for caption in captions:
169-
el = caption.next_sibling
170-
if isinstance(el, Comment) and el.string.startswith(prefix):
171-
label = el.string[len(prefix):].strip()
172-
for table in caption.parent.select("table"):
173-
table["id"] = label
174-
175-
17625
class Fragments(list):
17726

17827
def get_toc(self):
@@ -335,9 +184,7 @@ def print_section(self, name, clean_up=lambda x: x):
335184

336185
    @classmethod
    def read_html(cls, file):
        """Parse *file* as HTML.

        Thin delegate to ``doc_utils.read_html``; kept as a classmethod for
        backward compatibility with existing callers.
        """
        return read_html(file)
341188

342189
@classmethod
343190
def parse_paper(cls, file):

0 commit comments

Comments
 (0)