Skip to content

Commit 50981c9

Browse files
committed
Disable structure migration by default
1 parent a5a776a commit 50981c9

File tree

2 files changed: 41 additions and 26 deletions.

2 files changed: 41 additions and 26 deletions.

sota_extractor2/data/paper_collection.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,9 @@ def _load_texts(path, jobs):
3636
return {text.meta.id: text for text in texts}
3737

3838

39-
def _load_tables(path, annotations, jobs):
39+
def _load_tables(path, annotations, jobs, migrate):
4040
files = list(path.glob("**/metadata.json"))
41-
tables = Parallel(n_jobs=jobs, prefer="processes")(delayed(read_tables)(f.parent, annotations.get(f.parent.name)) for f in files)
41+
tables = Parallel(n_jobs=jobs, prefer="processes")(delayed(read_tables)(f.parent, annotations.get(f.parent.name), migrate) for f in files)
4242
return {f.parent.name: tbls for f, tbls in zip(files, tables)}
4343

4444

@@ -53,7 +53,7 @@ def __init__(self, data=None):
5353
super().__init__(data)
5454

5555
@classmethod
56-
def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, jobs=-1):
56+
def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, jobs=-1, migrate=False):
5757
path = Path(path)
5858
if annotations_path is None:
5959
annotations_path = path / "structure-annotations.json"
@@ -64,7 +64,7 @@ def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=Tr
6464

6565
annotations = _load_annotated_papers(annotations_path)
6666
if load_tables:
67-
tables = _load_tables(path, annotations, jobs)
67+
tables = _load_tables(path, annotations, jobs, migrate)
6868
else:
6969
tables = {}
7070
annotations = {}

sota_extractor2/data/table.py

Lines changed: 37 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,14 @@ def read_str_csv(filename):
5151

5252

5353
class Table:
54-
def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, old_name=None, guessed_tags=None):
54+
def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, migrate=False, old_name=None, guessed_tags=None):
5555
self.df = df
5656
self.caption = caption
5757
self.figure_id = figure_id
5858
self.df = df.applymap(str2cell)
59-
self.old_name = old_name
59+
60+
if migrate:
61+
self.old_name = old_name
6062

6163
if layout is not None:
6264
#self.layout = layout
@@ -74,41 +76,49 @@ def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, o
7476
tags = annotations.matrix_gold_tags
7577
gt_rows = len(tags)
7678
if gt_rows == 0 and len(self.df) > 0:
77-
#print(f"Gold tags size mismatch: 0 vs {len(self.df)} in old name {old_name}")
78-
self.old_name = None
79+
print(f"Gold tags size mismatch: 0 vs {len(self.df)} in old name {old_name}")
80+
if migrate:
81+
self.old_name = None
7982
elif gt_rows > 0:
8083
gt_cols = len(tags[0])
8184
if self.df.shape != (0,0) and self.df.shape == (gt_rows, gt_cols):
8285
for r, row in enumerate(tags):
8386
for c, cell in enumerate(row):
8487
self.df.iloc[r,c].gold_tags = cell.strip()
8588
else:
86-
if guessed_tags is not None:
87-
print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
89+
print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
8890
# print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
8991
# print(annotations.matrix_gold_tags)
9092
# print(self.df.applymap(lambda c:c.value))
91-
self.old_name = None
93+
if migrate:
94+
self.old_name = None
9295
else:
9396
self.gold_tags = ''
9497
self.dataset_text = ''
9598
self.notes = ''
9699

97100
@classmethod
98-
def from_file(cls, path, metadata, annotations=None, match_name=None, guessed_tags=None):
101+
def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=None, guessed_tags=None):
99102
path = Path(path)
100103
filename = path / metadata['filename']
101104
df = read_str_csv(filename)
102105
if 'layout' in metadata:
103106
layout = read_str_csv(path / metadata['layout'])
104107
else:
105108
layout = None
106-
if annotations is not None and match_name is not None:
107-
table_ann = annotations.table_set.filter(name=match_name) + [None]
108-
table_ann = table_ann[0]
109+
if annotations is not None:
110+
if not migrate:
111+
# TODO: remove parser after migration is fully finished
112+
table_ann = annotations.table_set.filter(name=metadata['filename'], parser="latexml") + [None]
113+
table_ann = table_ann[0]
114+
elif match_name is not None:
115+
table_ann = annotations.table_set.filter(name=match_name) + [None]
116+
table_ann = table_ann[0]
117+
else:
118+
table_ann = None
109119
else:
110120
table_ann = None
111-
return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, match_name, guessed_tags)
121+
return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, migrate, match_name, guessed_tags)
112122

113123
def display(self):
114124
display_table(self.df.applymap(lambda x: x.value).values, self.df.applymap(lambda x: x.gold_tags).values)
@@ -241,16 +251,21 @@ def _match_tables_by_content(path, annotations, metadata):
241251
return matched, new_tags
242252
####
243253

244-
def read_tables(path, annotations):
254+
def read_tables(path, annotations, migrate=False):
245255
path = Path(path)
246256
with open(path / "metadata.json", "r") as f:
247257
metadata = json.load(f)
248-
_matched_names_by_captions = {} #_match_tables_by_captions(annotations, metadata)
249-
_matched_names_by_content, _guessed_tags = _match_tables_by_content(path, annotations, metadata)
250-
_matched_names = _matched_names_by_captions
251-
for new_name, old_name in _matched_names_by_content.items():
252-
if new_name in _matched_names and _matched_names[new_name] != old_name:
253-
print(f"Multiple matches for table {path}/{new_name}: {_matched_names[new_name]} by caption and {old_name} by content")
254-
else:
255-
_matched_names[new_name] = old_name
256-
return [Table.from_file(path, m, annotations, match_name=_matched_names.get(m["filename"]), guessed_tags=_guessed_tags.get(m["filename"])) for m in metadata]
258+
259+
if migrate:
260+
_matched_names_by_captions = {} #_match_tables_by_captions(annotations, metadata)
261+
_matched_names_by_content, _guessed_tags = _match_tables_by_content(path, annotations, metadata)
262+
_matched_names = _matched_names_by_captions
263+
for new_name, old_name in _matched_names_by_content.items():
264+
if new_name in _matched_names and _matched_names[new_name] != old_name:
265+
print(f"Multiple matches for table {path}/{new_name}: {_matched_names[new_name]} by caption and {old_name} by content")
266+
else:
267+
_matched_names[new_name] = old_name
268+
else:
269+
_matched_names = {}
270+
_guessed_tags = {}
271+
return [Table.from_file(path, m, annotations, migrate=migrate, match_name=_matched_names.get(m["filename"]), guessed_tags=_guessed_tags.get(m["filename"])) for m in metadata]

0 commit comments

Comments
 (0)