
Commit 1e33103

Assemble pipeline
1 parent c04a03c commit 1e33103

21 files changed: +888 -80 lines changed

extract_tables.py

Lines changed: 11 additions & 14 deletions

@@ -13,8 +13,7 @@
 from dataclasses import dataclass
 from typing import Set

-from tabular import Tabular
-
+from sota_extractor2.data.table import Table

 # begin of dirty hack
 # pandas parsing of html tables is really nice
@@ -265,18 +264,13 @@ def html2data(table):
     return data[0] if len(data) == 1 else None


-def save_table(data, filename):
-    data.to_csv(filename, header=None, index=None)
-
-
 def save_tables(data, outdir):
     metadata = []

     for num, table in enumerate(data, 1):
         filename = f"table_{num:02}.csv"
         layout = f"layout_{num:02}.csv"
-        save_table(table.data, outdir / filename)
-        save_table(table.layout, outdir / layout)
+        table.save(outdir, filename, layout)
         metadata.append(dict(filename=filename, layout=layout, caption=table.caption, figure_id=table.figure_id))
     with open(outdir / "metadata.json", "w") as f:
         json.dump(metadata, f)
@@ -341,11 +335,9 @@ def remove_footnotes(soup):
         elem.extract()


-def extract_tables(filename, outdir):
+def extract_tables(filename):
     with open(filename, "rb") as f:
         html = f.read()
-    outdir = Path(outdir)
-    outdir.mkdir(parents=True, exist_ok=True)
     soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
     set_ids_by_labels(soup)
     fix_span_tables(soup)
@@ -381,8 +373,13 @@ def extract_tables(filename, outdir):
         if cap_el is not None:
             caption = clear_ws(cap_el.get_text())
         figure_id = table.get("data-figure-id")
-        data.append(Tabular(tab, layout, caption, figure_id))
+        data.append(Table(f"table_{len(data)+1:02}", tab, layout.applymap(str), caption, figure_id))
+    return data

-    save_tables(data, outdir)
+def extract_tables_cmd(filename, outdir):
+    tables = extract_tables(filename)
+    outdir = Path(outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    save_tables(tables, outdir)

-if __name__ == "__main__": fire.Fire(extract_tables)
+if __name__ == "__main__": fire.Fire(extract_tables_cmd)
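
A minimal usage sketch of the refactored module (the input path is made up): extract_tables now returns Table objects in memory, while extract_tables_cmd keeps the old save-to-directory behaviour for the fire CLI.

    from extract_tables import extract_tables, extract_tables_cmd

    # pure extraction: returns a list of Table objects, writes nothing to disk
    tables = extract_tables("htmls/1234.5678/index.html")
    for table in tables:
        print(table.name, table.caption)

    # CLI wrapper, same as: python extract_tables.py htmls/1234.5678/index.html out/tables
    extract_tables_cmd("htmls/1234.5678/index.html", "out/tables")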

sota_extractor2/config.py

Lines changed: 4 additions & 0 deletions

@@ -22,3 +22,7 @@

 datasets = data/"datasets"
 datasets_structure = datasets/"structure"
+structure_models = datasets / "structure" / "models"
+
+linking_models = datasets / "linking" / "models"
+linking_data = datasets / "linking" / "data"

sota_extractor2/data/elastic.py

Lines changed: 14 additions & 4 deletions

@@ -162,9 +162,10 @@ def from_json(cls, json, paper_id=None):
         return paper

     @classmethod
-    def from_file(cls, path):
+    def from_file(cls, path, paper_id=None):
         path = Path(path)
-        paper_id = path.parent.name
+        if paper_id is None:
+            paper_id = path.parent.name
         with open(path, "rt") as f:
             json = f.read()
         return cls.from_json(json, paper_id)
@@ -187,6 +188,12 @@ def save(self, **kwargs):
         else:
             return super().save(**kwargs)

+    def delete(self, **kwargs):
+        if hasattr(self, 'fragments'):
+            for f in self.fragments:
+                f.delete()
+        return super().delete(**kwargs)
+
     @classmethod
     def parse_html(cls, soup, paper_id):
         put_dummy_anchors(soup)
@@ -254,9 +261,12 @@ def read_html(cls, file):
         return read_html(file)

     @classmethod
-    def parse_paper(cls, file):
+    def parse_paper(cls, file, paper_id=None):
+        file = Path(file)
         soup = cls.read_html(file)
-        return cls.parse_html(soup, file.stem)
+        if paper_id is None:
+            paper_id = file.stem
+        return cls.parse_html(soup, paper_id)


 class Author(InnerDoc):
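
The new optional paper_id parameters let callers index a paper under an explicit id instead of one derived from the file path, and delete() now cleans up the paper's indexed text fragments. A sketch, assuming parse_paper returns the Paper document (the file layout is made up):

    from sota_extractor2.data.elastic import Paper

    # explicit id; with paper_id=None it falls back to file.stem as before
    text = Paper.parse_paper("htmls/1234.5678/index.html", paper_id="1234.5678")

    # removes the paper together with its fragments from the index
    text.delete()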

sota_extractor2/data/structure.py

Lines changed: 46 additions & 26 deletions

@@ -3,8 +3,9 @@
 from collections import namedtuple
 import hashlib
 from fastai.text import progress_bar
-from .elastic import Fragment
+from .elastic import Fragment, setup_default_connection
 from .json import *
+from .table import reference_re, remove_text_styles, remove_references, style_tags_re

 def get_all_tables(papers):
     for paper in papers:
@@ -13,11 +14,18 @@ def get_all_tables(papers):
             table.paper_id = paper.arxiv_id
             yield table

-def consume_cells(*matrix):
+def consume_cells(table):
     Cell = namedtuple('AnnCell', 'row col vals')
-    for row_id, row in enumerate(zip(*matrix)):
-        for col_id, cell_val in enumerate(zip(*row)):
-            yield Cell(row=row_id, col=col_id, vals=cell_val)
+    for row_id, row in enumerate(table.df.values):
+        for col_id, cell in enumerate(row):
+            vals = [
+                remove_text_styles(remove_references(cell.raw_value)),
+                "",
+                cell.refs[0] if cell.refs else "",
+                cell.layout,
+                bool(style_tags_re.search(cell.raw_value))
+            ]
+            yield Cell(row=row_id, col=col_id, vals=vals)


 reference_re = re.compile(r"\[[^]]*\]")
@@ -38,10 +46,12 @@ def empty_fragment(paper_id):
     return fragment


-def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpus_limit=10):
+def fetch_evidence(cell_content, cell_reference, paper_id, table_name, row, col, paper_limit=10, corpus_limit=10):
+    if not filter_cells(cell_content):
+        return [empty_fragment(paper_id)]
     cell_content = clear_cell(cell_content)
     if cell_content == "" and cell_reference == "":
-        return []
+        return [empty_fragment(paper_id)]

     evidence_query = Fragment.search().highlight(
         'text', pre_tags="<b>", post_tags="</b>", fragment_size=400)
@@ -65,8 +75,11 @@ def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpu
     other_fagements = list(evidence_query
                            .exclude('term', paper_id=paper_id)
                            .query('match_phrase', text=query)[:corpus_limit])
-    if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements):
-        print(f"No evidences for '{cell_content}' of {paper_id}")
+
+    ext_id = f"{paper_id}/{table_name}/{row}.{col}"
+    ####print(f"{ext_id} |{cell_content}|: {len(paper_fragments)} paper fragments, {len(reference_fragments)} reference fragments, {len(other_fagements)} other fragments")
+    # if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements):
+    #     print(f"No evidences for '{cell_content}' of {paper_id}")
     if not len(paper_fragments) and not len(reference_fragments):
         paper_fragments = [empty_fragment(paper_id)]
     return paper_fragments + reference_fragments + other_fagements
@@ -86,13 +99,13 @@ def fix_reference_hightlight(s):
     return partial_highlight_re.sub("xxref-", s)


-def create_evidence_records(textfrag, cell, table):
+def create_evidence_records(textfrag, cell, paper, table):
     for text_highlited in textfrag.meta['highlight']['text']:
         text_highlited = fix_reference_hightlight(fix_refs(text_highlited))
         text = highlight_re.sub("", text_highlited)
         text_sha1 = hashlib.sha1(text.encode("utf-8")).hexdigest()

-        cell_ext_id = f"{table.ext_id}/{cell.row}/{cell.col}"
+        cell_ext_id = f"{paper.paper_id}/{table.name}/{cell.row}/{cell.col}"

         yield {"text_sha1": text_sha1,
                "text_highlited": text_highlited,
@@ -103,46 +116,53 @@ def create_evidence_records(textfrag, cell, table):
                "cell_reference": cell.vals[2],
                "cell_layout": cell.vals[3],
                "cell_styles": cell.vals[4],
-               "this_paper": textfrag.paper_id == table.paper_id,
+               "this_paper": textfrag.paper_id == paper.paper_id,
                "row": cell.row,
                "col": cell.col,
-               "row_context": " border ".join([str(s) for s in table.matrix[cell.row]]),
-               "col_context": " border ".join([str(s) for s in table.matrix[:, cell.col]]),
+               "row_context": " border ".join([str(s) for s in table.matrix.values[cell.row]]),
+               "col_context": " border ".join([str(s) for s in table.matrix.values[:, cell.col]]),
                "ext_id": cell_ext_id
                #"table_id":table_id
                }


-def filter_cells(cell):
-    return re.search("[a-zA-Z]{2,}", cell.vals[1]) is not None
+def filter_cells(cell_content):
+    return re.search("[a-zA-Z]{2,}", cell_content) is not None


 interesting_types = ["model-paper", "model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"]


-def evidence_for_table(table, paper_limit=10, corpus_limit=1, limit_type='interesting'):
-    def get_limits(cell_type):
-        if limit_type == 'interesting' and (cell_type.strip() in interesting_types) or (limit_type == 'max'):
-            return dict(paper_limit=1000, corpus_limit=1000)
-        return dict(paper_limit=paper_limit, corpus_limit=corpus_limit)
+def evidence_for_table(paper, table, paper_limit, corpus_limit):
     records = [
         record
-        for cell in consume_cells(table.matrix, table.matrix_gold_tags, table.matrix_references, table.matrix_layout, table.matrix_styles) if filter_cells(cell)
-        for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=table.paper_id, **get_limits(cell.vals[1]))
-        for record in create_evidence_records(evidence, cell, table=table)
+        for cell in consume_cells(table)
+        for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=paper.paper_id, table_name=table.name,
+                                       row=cell.row, col=cell.col, paper_limit=paper_limit, corpus_limit=corpus_limit)
+        for record in create_evidence_records(evidence, cell, paper=paper, table=table)
     ]
     df = pd.DataFrame.from_records(records)
     return df


-def prepare_data(tables, csv_path, limit_type='interesting'):
-    df = pd.concat([evidence_for_table(table,
+def prepare_data(paper, tables, csv_path, limit_type='interesting'):
+    df = pd.concat([evidence_for_table(paper, table,
                                        paper_limit=100,
                                        corpus_limit=20,
                                        limit_type=limit_type) for table in progress_bar(tables)])
     #moved to experiment preprocessing
     #df = df.drop_duplicates(
     #    ["cell_content", "text_highlited", "cell_type", "this_paper"])
     print("Number of text fragments ", len(df))
+
     csv_path.parent.mkdir(parents=True, exist_ok=True)
     df.to_csv(csv_path, index=None)
+
+
+class CellEvidenceExtractor:
+    def __init__(self):
+        # todo: make sure can be called more than once or refactor to singleton
+        setup_default_connection()
+
+    def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
+        return pd.concat([evidence_for_table(paper, table, paper_limit, corpus_limit) for table in tables])
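
CellEvidenceExtractor bundles the whole evidence-retrieval step behind one callable. A sketch under stated assumptions: temp_paper comes from the helper added later in this commit, the path is made up, and a populated Elasticsearch index is required.

    from sota_extractor2.data.structure import CellEvidenceExtractor
    from sota_extractor2.helpers.temp_paper import temp_paper

    paper = temp_paper("htmls/1234.5678/index.html")
    cell_evidences = CellEvidenceExtractor()   # opens the default Elasticsearch connection
    df = cell_evidences(paper, paper.tables)   # one record per (cell, evidence fragment)
    print(len(df), list(df.columns))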

sota_extractor2/data/table.py

Lines changed: 36 additions & 7 deletions

@@ -36,6 +36,12 @@ def extract_references(s):
     return text, refs


+empty_paren_re = re.compile(r"\(\s*\)|\[\s*\]")
+def remove_references(s):
+    s = reference_re.sub("", s)
+    return empty_paren_re.sub("", s)
+
+
 style_tags_re = re.compile(r"</?(bold|italic|red|green|blue)>")
 def remove_text_styles(s):
     return style_tags_re.sub("", s)
@@ -76,10 +82,7 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
         self.old_name = old_name

         if layout is not None:
-            #self.layout = layout
-            for r, row in layout.iterrows():
-                for c, cell in enumerate(row):
-                    self.df.iloc[r,c].layout = cell
+            self.set_layout(layout)

         if annotations is not None:
             self.gold_tags = annotations.gold_tags.strip()
@@ -97,9 +100,7 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
             elif gt_rows > 0:
                 gt_cols = len(tags[0])
             if self.df.shape != (0,0) and self.df.shape == (gt_rows, gt_cols):
-                for r, row in enumerate(tags):
-                    for c, cell in enumerate(row):
-                        self.df.iloc[r,c].gold_tags = cell.strip()
+                self.set_tags(tags)
             else:
                 print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
                 # print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
@@ -112,14 +113,34 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
         self.dataset_text = ''
         self.notes = ''

+    def set_layout(self, layout):
+        for r, row in layout.iterrows():
+            for c, cell in enumerate(row):
+                self.df.iloc[r, c].layout = cell
+
+    def set_tags(self, tags):
+        for r, row in tags.iterrows():
+            for c, cell in enumerate(row):
+                # todo: change gold_tags to tags to avoid confusion
+                self.df.iloc[r,c].gold_tags = cell.strip()
+
     @property
     def matrix(self):
         return self.df.applymap(lambda x: x.value)

+    @property
+    def matrix_layout(self):
+        return self.df.applymap(lambda x: x.layout)
+
     @property
     def matrix_gold_tags(self):
         return self.df.applymap(lambda x: x.gold_tags)

+    # todo: remove gold_tags
+    @property
+    def matrix_tags(self):
+        return self.matrix_gold_tags
+
     @classmethod
     def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=None, guessed_tags=None):
         path = Path(path)
@@ -146,6 +167,14 @@ def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=N
     def display(self):
         display_table(self.df.applymap(lambda x: raw_value_to_html(x.raw_value)).values, self.df.applymap(lambda x: x.gold_tags).values, self.df.applymap(lambda x:x.layout).values)

+    def _save_df(self, df, filename):
+        df.to_csv(filename, header=None, index=None)
+
+    def save(self, path, table_name, layout_name):
+        path = Path(path)
+        self._save_df(self.df.applymap(lambda x: x.raw_value), path / table_name)
+        self._save_df(self.df.applymap(lambda x: x.layout), path / layout_name)
+
 #####
 # this code is used to migrate table annotations from
 # tables parsed by htlatex to tables parsed by
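
Table now owns its serialization, which is what lets save_tables in extract_tables.py delegate to table.save. A sketch, assuming table is one of the objects returned by extract_tables and the directory is made up:

    from pathlib import Path

    outdir = Path("out/tables")
    outdir.mkdir(parents=True, exist_ok=True)
    table.save(outdir, "table_01.csv", "layout_01.csv")  # raw cell values and per-cell layout as two CSVs

    layout = table.matrix_layout  # new property: DataFrame view of each cell's layout tag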
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+import docker
+from pathlib import Path
+
+def ro_bind(path): return dict(bind=path, mode='ro')
+def rw_bind(path): return dict(bind=path, mode='rw')
+
+
+
+class LatexConverter:
+    def __init__(self, base_path):
+        # pull arxivvanity/engrafo image
+        self.client = docker.from_env()
+        self.base_path = Path(base_path)
+
+    def to_html(self, source_dir, output_dir):
+        base = self.base_path
+        volumes = {
+            base / "latex2html.sh": ro_bind("/files/latex2html.sh"),
+            base / "guess_main.py": ro_bind("/files/guess_main.py"),  # todo: run guess_main outside of docker
+            base / "patches": ro_bind("/files/patches"),  # todo: see which patches can be dropped
+            source_dir.resolve(): ro_bind("/files/ro-source"),
+            output_dir.resolve(): rw_bind("/files/htmls")
+        }
+
+        output_dir.mkdir(parents=True, exist_ok=True)
+        filename = "index.html"
+        command = ["/files/latex2html.sh", filename]
+        self.client.containers.run("arxivvanity/engrafo", command, remove=True,
+                                   volumes=volumes)  # todo: check if command as a list protects from shell injection
+        # todo: check for errors
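
A usage sketch for the converter above. The diff header lost the new file's name, so the import path below is a guess; the directories are made up, and a running Docker daemon with the arxivvanity/engrafo image pulled is required.

    from pathlib import Path
    from sota_extractor2.helpers.latex_converter import LatexConverter  # hypothetical module path

    converter = LatexConverter(base_path=Path("docker-scripts"))  # dir with latex2html.sh, guess_main.py, patches
    converter.to_html(Path("sources/1234.5678"), Path("htmls/1234.5678"))
    # on success the rendered paper is at htmls/1234.5678/index.html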

sota_extractor2/helpers/temp_paper.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from ..data.elastic import Paper as PaperText
+from ..data.paper_collection import Paper
+from extract_tables import extract_tables
+import string
+import random
+
+# todo: make sure multithreading/processing won't cause collisions
+def random_id():
+    return "temp_" + ''.join(random.choice(string.ascii_lowercase) for i in range(10))
+
+
+def temp_paper(path):
+    text = PaperText.parse_paper(path, random_id())
+    tables = extract_tables(path)
+    return Paper(paper_id=text.meta['id'], text=text, tables=tables, annotations=None)
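
temp_paper ties the pipeline together: it indexes the HTML under a random temporary id and extracts tables in memory, yielding a Paper object without touching the annotated corpus. A sketch (path made up):

    from sota_extractor2.helpers.temp_paper import temp_paper

    paper = temp_paper("htmls/1234.5678/index.html")
    print(paper.paper_id)    # e.g. "temp_kxqzjwblam", random to avoid collisions
    print(len(paper.tables))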
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+from .taxonomy import Taxonomy
+from .linker import Linker
+from .context_search import ContextSearch, DatasetExtractor
+from .proposals_filters import *
+
+__all__ = ["Taxonomy", "Linker", "ContextSearch", "DatasetExtractor", "ProposalsFilter", "NopFilter",
+           "BestResultFilter", "StructurePredictionFilter", "ConfidenceFilter", "CompoundFilter"]
