Skip to content

Commit a5a776a

Browse files
committed
Migrate structure annotations
* remove empty tables from extraction results * remove undefined LaTeX commands from output cells * remove algorithms and graphics * keep arXiv versions in paper collection * ignore the arXiv version when searching in the paper collection * keep each cell's raw_value to be able to locate references * during migration of structure annotations, match tables using cell contexts (consider the 8-connected neighbourhood of a cell first, then its direct neighbours in row and column, and finally the cell value only)
1 parent 9b98cc7 commit a5a776a

File tree

3 files changed

+182
-26
lines changed

3 files changed

+182
-26
lines changed

extract_tables.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,9 @@ def fix_table(df):
149149
return decouple_layout(df)
150150

151151

152+
def is_table_empty(df):
153+
return (df.applymap(lambda x: x.strip()).values == "").all()
154+
152155
def fix_id(s):
    """Sanitize identifier *s* for use in HTML ids: dots become dashes."""
    return s.replace(".", "-")
154157

@@ -198,6 +201,11 @@ def move_out_styles(table):
198201
wrap_elem_content(elem, f"{b},{a},{header},{colspan},{rowspan};", "")
199202

200203

204+
def remove_ltx_errors(soup):
    """Remove all LaTeXML error markers (<span class="ltx_ERROR">) from *soup*.

    Mutates the BeautifulSoup tree in place; these spans hold undefined
    LaTeX commands that would otherwise leak into extracted cell text.
    """
    for span in soup.select('span.ltx_ERROR'):
        span.extract()
207+
208+
201209
def html2data(table):
202210
data = pd.read_html(str(table), match='')
203211
if len(data) > 1:
@@ -231,6 +239,26 @@ def set_ids_by_labels(soup):
231239
for table in fig.select(".ltx_tabular"):
232240
table["data-figure-id"] = label
233241

242+
243+
alg_id_re = re.compile(r"^alg(orithm)?[0-9]+")

def perhaps_not_tabular(table, float_div):
    """Heuristically decide whether *table* is probably not a real data table.

    Returns True for tabulars that merely lay out graphics, biographies,
    algorithms or listings inside the enclosing float *float_div*;
    False for genuine tables (ltx_table floats) or anything unrecognized.
    """
    # A tag may have no class attribute at all; fall back to an empty list.
    classes = float_div.attrs.get("class") or []
    if 'ltx_table' in classes:
        return False
    if 'ltx_figure' in classes:
        # A tabular wrapping graphics is a figure layout, not a data table.
        if table.find("img", class_="ltx_graphics"):
            return True
    if 'ltx_float' in classes:
        if 'biography' in classes:
            return True
        # BUG FIX: the original tested the bare literals 'ltx_float_algorithm'
        # and 'ltx_lstlisting' (always truthy), so *every* ltx_float was
        # rejected; membership in the class list is what was intended.
        if 'ltx_float_algorithm' in classes:
            return True
        if 'ltx_lstlisting' in classes:
            return True
    # BUG FIX: bs4's `float_div.id` looks up a child <id> tag, not the HTML
    # id attribute; use .get("id") to read the attribute.
    div_id = float_div.get("id")
    if div_id and alg_id_re.match(div_id):
        return True
    return False
261+
234262
def is_figure(tag):
235263
return tag.name == "figure"
236264
# classes = tag.attrs.get("class", [])
@@ -270,6 +298,7 @@ def extract_tables(filename, outdir):
270298
set_ids_by_labels(soup)
271299
fix_span_tables(soup)
272300
fix_th(soup)
301+
remove_ltx_errors(soup)
273302
flatten_tables(soup)
274303
tables = soup.find_all("table", class_="ltx_tabular")
275304

@@ -279,6 +308,8 @@ def extract_tables(filename, outdir):
279308
continue
280309

281310
float_div = table.find_parent(is_figure)
311+
if float_div and perhaps_not_tabular(table, float_div):
312+
continue
282313
remove_footnotes(table)
283314
move_out_references(table)
284315
move_out_styles(table)
@@ -288,6 +319,8 @@ def extract_tables(filename, outdir):
288319
continue
289320

290321
tab, layout = fix_table(tab)
322+
if is_table_empty(tab):
323+
continue
291324

292325
caption = None
293326
if float_div is not None:

sota_extractor2/data/paper_collection.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
class Paper:
1212
def __init__(self, paper_id, text, tables, annotations):
1313
self.paper_id = paper_id
14+
self.arxiv_no_version = remove_arxiv_version(paper_id)
1415
if text is not None:
1516
self.text = text
1617
else:
@@ -32,20 +33,18 @@ def remove_arxiv_version(arxiv_id):
3233
def _load_texts(path, jobs):
    """Read every **/text.json under *path* in parallel.

    Returns a dict keyed by the full (versioned) arXiv id of each paper;
    versions are deliberately kept (matching is version-aware downstream).
    """
    files = list(path.glob("**/text.json"))
    texts = Parallel(n_jobs=jobs, prefer="processes")(
        delayed(PaperText.from_file)(f) for f in files)
    return {text.meta.id: text for text in texts}
3637

3738

3839
def _load_tables(path, annotations, jobs):
    """Read every **/metadata.json under *path* in parallel.

    Each paper directory name is its full (versioned) arXiv id; that id is
    used both to look up *annotations* and as the key of the result dict.
    """
    files = list(path.glob("**/metadata.json"))
    tables = Parallel(n_jobs=jobs, prefer="processes")(
        delayed(read_tables)(f.parent, annotations.get(f.parent.name)) for f in files)
    return {f.parent.name: tbls for f, tbls in zip(files, tables)}
43+
4244

4345
def _load_annotated_papers(path):
    """Load the GraphQL annotations dump at *path*.

    The dump is treated as gzip-compressed iff the file suffix is ".gz".
    Returns a dict keyed by each annotation's full (versioned) arXiv id.
    """
    dump = load_gql_dump(path, compressed=path.suffix == ".gz")["allPapers"]
    annotations = {a.arxiv_id: a for a in dump}
    return annotations
5049

5150

@@ -74,12 +73,18 @@ def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=Tr
7473
papers = [Paper(k, texts.get(k), tables.get(k, []), annotations.get(k)) for k in outer_join]
7574
return cls(papers)
7675

77-
def get_by_id(self, paper_id, ignore_version=True):
    """Return the paper with the given arXiv id, or None if absent.

    With ignore_version=True (default) the version suffix (e.g. "v2") is
    stripped from *paper_id* and papers are matched on their version-less
    id; otherwise the id must match exactly.
    """
    if ignore_version:
        paper_id = remove_arxiv_version(paper_id)
        wanted = lambda p: p.arxiv_no_version == paper_id
    else:
        wanted = lambda p: p.paper_id == paper_id
    # Single scan shared by both modes (the original duplicated the loop).
    return next((p for p in self.data if wanted(p)), None)
8388

8489
@classmethod
8590
def cells_gold_tags_legend(cls):

sota_extractor2/data/table.py

Lines changed: 130 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pandas as pd
2+
import numpy as np
23
import json
34
from pathlib import Path
45
import re
@@ -9,6 +10,7 @@
910
@dataclass
1011
class Cell:
1112
value: str
13+
raw_value: str
1214
gold_tags: str = ''
1315
refs: List[str] = field(default_factory=list)
1416
layout: str = ''
@@ -36,7 +38,7 @@ def extract_references(s):
3638

3739
def str2cell(s):
    """Build a Cell from raw string *s*.

    The raw text is preserved in raw_value so references can be located
    later; extract_references returns the cleaned value and reference ids.
    """
    value, refs = extract_references(s)
    return Cell(value=value, raw_value=s, refs=refs)
4042

4143
def read_str_csv(filename):
4244
try:
@@ -49,34 +51,51 @@ def read_str_csv(filename):
4951

5052

5153
class Table:
    def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, old_name=None, guessed_tags=None):
        """Build a Table of Cell objects from the string DataFrame *df*.

        layout       -- optional DataFrame of per-cell layout strings,
                        copied onto the corresponding Cell objects
        annotations  -- optional gold annotations for this table
                        (gold_tags, dataset_text, notes, matrix_gold_tags)
        old_name     -- name of the matched previously-annotated table
        guessed_tags -- optional DataFrame of per-cell tags recovered by
                        the content-based matcher; takes precedence over
                        annotations.matrix_gold_tags
        """
        self.df = df
        self.caption = caption
        self.figure_id = figure_id
        self.df = df.applymap(str2cell)
        self.old_name = old_name

        if layout is not None:
            # The layout is stored per-cell rather than as a whole frame.
            for r, row in layout.iterrows():
                for c, cell in enumerate(row):
                    self.df.iloc[r, c].layout = cell

        if annotations is not None:
            self.gold_tags = annotations.gold_tags.strip()
            self.dataset_text = annotations.dataset_text.strip()
            self.notes = annotations.notes.strip()
            # Tags guessed from cell contexts win over the stored matrix.
            if guessed_tags is not None:
                tags = guessed_tags.values
            else:
                tags = annotations.matrix_gold_tags
            gt_rows = len(tags)
            if gt_rows == 0 and len(self.df) > 0:
                # No per-cell tags for a non-empty table: drop the old-name
                # link so the table is treated as needing re-annotation.
                self.old_name = None
            elif gt_rows > 0:
                gt_cols = len(tags[0])
                if self.df.shape != (0, 0) and self.df.shape == (gt_rows, gt_cols):
                    for r, row in enumerate(tags):
                        for c, cell in enumerate(row):
                            self.df.iloc[r, c].gold_tags = cell.strip()
                else:
                    # Shape mismatch: only report it when the tags came from
                    # the guesser (stored matrices are known to be stale).
                    if guessed_tags is not None:
                        print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
                    self.old_name = None
        else:
            self.gold_tags = ''
            self.dataset_text = ''
            self.notes = ''
7796

7897
@classmethod
79-
def from_file(cls, path, metadata, annotations=None, match_name=None):
98+
def from_file(cls, path, metadata, annotations=None, match_name=None, guessed_tags=None):
8099
path = Path(path)
81100
filename = path / metadata['filename']
82101
df = read_str_csv(filename)
@@ -89,7 +108,7 @@ def from_file(cls, path, metadata, annotations=None, match_name=None):
89108
table_ann = table_ann[0]
90109
else:
91110
table_ann = None
92-
return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, match_name)
111+
return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, match_name, guessed_tags)
93112

94113
def display(self):
95114
display_table(self.df.applymap(lambda x: x.value).values, self.df.applymap(lambda x: x.gold_tags).values)
@@ -104,11 +123,15 @@ def display(self):
104123
import string
105124
from collections import Counter
106125

126+
# Strip leading "Table 3" / "figure iv" style prefixes before comparison.
# NOTE: raw string fixes the invalid "\s" escape in the original literal.
figure_prefix_re = re.compile(r'^(table|figure)\s+([0-9]+|[ivxl]+)?')
punctuation_table = str.maketrans('', '', string.punctuation)

def normalize_string(s):
    """Normalize *s* for fuzzy caption/cell matching.

    Lowercases, drops a leading table/figure prefix, removes ordinary and
    non-breaking spaces, transliterates to ASCII and strips punctuation.
    None maps to "".
    """
    if s is None:
        return ""

    s = s.strip().lower()
    s = figure_prefix_re.sub('', s).strip()
    return unidecode(s.replace('\xa0', '').replace(' ', '')).translate(punctuation_table)
112135

113136
def _remove_almost_empty_values(d):
114137
return {k:v for k,v in d.items() if len(v) >= 10}
@@ -128,11 +151,106 @@ def _match_tables_by_captions(annotations, metadata):
128151
old_captions_reverse = {v:k for k,v in old_captions.items()}
129152
return {new_name:old_captions_reverse[caption] for new_name, caption in new_captions.items() if caption in old_captions_reverse}
130153

154+
def normalize_cell(s):
    """Normalize a single table cell for content-based matching."""
    # Reference markers are currently kept as-is:
    # s = reference_re.sub(' [] ', s)
    return normalize_string(s)
157+
158+
# begin of guess annotations mapping
159+
# begin of guess annotations mapping
def create_cell_contexts(df):
    """Return per-cell contexts of *df*, ordered most specific first.

    For each cell: its 8-connected neighbourhood including itself
    (9 values, row-major), its row neighbours (left, self, right), its
    column neighbours (above, self, below), and finally the bare cell
    value.  Positions outside the table contribute "".
    """
    cell_context = df.values
    # Pad with one ring of empty strings so edge cells get "" neighbours.
    cells = np.pad(cell_context, 1, mode='constant', constant_values='')

    slices = [slice(None, -2), slice(1, -1), slice(2, None)]

    row_context = np.stack([cells[1:-1, s] for s in slices], axis=-1)
    col_context = np.stack([cells[s, 1:-1] for s in slices], axis=-1)
    box_context = np.stack([cells[s1, s2] for s1 in slices for s2 in slices], axis=-1)
    return box_context, row_context, col_context, cell_context[..., None]
169+
170+
def map_context(context, values):
    """Map each context tuple to the set of values observed with it.

    *context* has shape (rows, cols, ctx_len) and *values* (rows, cols);
    a context mapping to more than one value is ambiguous for the guesser.
    """
    ctx_len = context.shape[-1]
    mapping = {}
    for ctx, val in zip(context.reshape((-1, ctx_len)), values.reshape(-1)):
        mapping.setdefault(tuple(ctx), set()).add(val)
    return mapping
176+
177+
REANNOTATE_TAG = 'reannotate'

def guess_annotations(old_table, gold_tags, new_table):
    """Transfer per-cell *gold_tags* from *old_table* onto *new_table*.

    For every cell of *new_table*, its contexts (8-connected box, row
    neighbours, column neighbours, then the bare value — most specific
    first) are looked up in the context->tags mappings built from
    *old_table*; the first context that maps to exactly one tag wins.
    Cells with no unambiguous match keep REANNOTATE_TAG.

    Returns (matched, df): the number of matched cells and a DataFrame of
    guessed tags with the same shape as *new_table*.
    """
    df = pd.DataFrame().reindex_like(new_table).fillna(REANNOTATE_TAG)
    if old_table.empty:
        return 0, df
    old_contexts = create_cell_contexts(old_table)
    old_mappings = [map_context(ctx, gold_tags.values) for ctx in old_contexts]
    new_contexts = create_cell_contexts(new_table)

    rows, cols = new_table.shape
    matched = 0
    for row in range(rows):
        for col in range(cols):
            # Contexts are ordered from most to least specific.
            for mapping, context in zip(old_mappings, new_contexts):
                ctx = tuple(context[row, col])
                values = mapping.get(ctx, set())
                if len(values) == 1:
                    (val,) = values
                    df.iloc[row, col] = val
                    matched += 1
                    break
    return matched, df
200+
201+
# end of guess annotations mapping
202+
203+
204+
def same_table(old_table, new_table):
    """True when the two DataFrames have identical shape, elements and dtypes."""
    return old_table.equals(new_table)
206+
207+
DEB_PAPER="1607.00036v2"

def deb(path, old_name, old_table, new_name, new_table):
    """Debug helper: dump both tables when matching DEB_PAPER's table_02.csv
    against itself.  No effect for any other paper or table pair.
    """
    if path.name == DEB_PAPER and old_name == "table_02.csv" == new_name:
        print(old_table)
        print(new_table)
213+
214+
def _match_tables_by_content(path, annotations, metadata):
    """Match newly extracted tables to annotated ones by cell content.

    For every new table, guess_annotations scores each annotated (old)
    table by how many cells can be unambiguously tagged from the old
    table's cell contexts; the highest-scoring old table is accepted only
    when more than half of the new table's cells were matched.

    Returns (matched, new_tags): new filename -> old table name, and
    new filename -> DataFrame of guessed per-cell gold tags.
    """
    if annotations is None:
        return {}, {}
    old_tables = {x.name: (pd.DataFrame(x.matrix).applymap(normalize_cell),
                           pd.DataFrame(x.matrix_gold_tags))
                  for x in annotations.table_set}
    new_tables = {m['filename']: Table.from_file(path, m, None, None).df.applymap(lambda c: normalize_cell(c.value))
                  for m in metadata}
    matched = {}
    new_tags = {}
    for new_name, new_table in new_tables.items():
        max_hits = 0
        matched_name = None
        size = np.prod(new_table.shape)
        guessed_tags = None
        for old_name, (old_table, gold_tags) in old_tables.items():
            hits, tags = guess_annotations(old_table, gold_tags, new_table)
            # Strict '>' keeps the first old table on a tie.
            if hits > max_hits:
                max_hits = hits
                matched_name = old_name
                guessed_tags = tags
        # Accept only when most of the new table could be matched.
        if max_hits > size / 2:
            matched[new_name] = matched_name
            new_tags[new_name] = guessed_tags
    return matched, new_tags
131242
####
132243

133244
def read_tables(path, annotations):
    """Read all extracted tables of the paper directory *path*.

    Tables are matched to previously annotated ones by cell content
    (caption-based matching is currently disabled); a conflict between the
    two strategies is reported rather than silently overwritten.
    Returns a list of Table objects, one per entry in metadata.json.
    """
    path = Path(path)
    with open(path / "metadata.json", "r") as f:
        metadata = json.load(f)
    # Caption matching disabled: _match_tables_by_captions(annotations, metadata)
    _matched_names_by_captions = {}
    _matched_names_by_content, _guessed_tags = _match_tables_by_content(path, annotations, metadata)
    _matched_names = _matched_names_by_captions
    for new_name, old_name in _matched_names_by_content.items():
        if new_name in _matched_names and _matched_names[new_name] != old_name:
            print(f"Multiple matches for table {path}/{new_name}: {_matched_names[new_name]} by caption and {old_name} by content")
        else:
            _matched_names[new_name] = old_name
    return [Table.from_file(path, m, annotations,
                            match_name=_matched_names.get(m["filename"]),
                            guessed_tags=_guessed_tags.get(m["filename"]))
            for m in metadata]

0 commit comments

Comments
 (0)