Skip to content

Commit 00fb305

Browse files
committed
Add pipeline loggers
1 parent 3df4648 commit 00fb305

17 files changed

+258
-65
lines changed

sota_extractor2/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,7 @@
2424
datasets_structure = datasets/"structure"
2525
structure_models = datasets / "structure" / "models"
2626

27+
mocks = datasets / "mocks"
28+
2729
linking_models = datasets / "linking" / "models"
2830
linking_data = datasets / "linking" / "data"

sota_extractor2/data/paper_collection.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
from joblib import Parallel, delayed
88
from collections import UserList
99
from ..helpers.jupyter import display_table
10+
import string
11+
import random
12+
from extract_tables import extract_tables
13+
1014

1115
class Paper:
1216
def __init__(self, paper_id, text, tables, annotations):
@@ -25,6 +29,27 @@ def __init__(self, paper_id, text, tables, annotations):
2529
self.gold_tags = ''
2630

2731

32+
# todo: make sure multithreading/processing won't cause collisions
33+
def random_id():
    """Return a throwaway paper id: ``"temp_"`` followed by 10 lowercase ASCII letters.

    The letters are drawn with ``secrets.choice`` instead of the module-level
    ``random`` generator. The shared ``random`` state is seedable, so any code
    that calls ``random.seed(<fixed value>)`` for reproducibility (in this
    process or in each worker of a pool) would make every caller produce the
    same sequence of "random" ids. ``secrets`` is not affected by seeding.
    """
    import secrets  # local import keeps this function self-contained

    return "temp_" + ''.join(secrets.choice(string.ascii_lowercase) for _ in range(10))
35+
36+
37+
class TempPaper(Paper):
    """A ``Paper`` built from raw HTML under a freshly generated temporary id.

    Works as a context manager: entering the ``with`` block saves the paper's
    text (temporarily storing the paper in elastic) and leaving the block
    deletes it again.
    """

    def __init__(self, html):
        """Parse ``html`` into text and tables and initialise the base ``Paper``.

        The paper carries no annotations (``annotations=None``).
        """
        temp_id = random_id()
        super().__init__(
            paper_id=temp_id,
            text=PaperText.from_html(html, temp_id),
            tables=extract_tables(html),
            annotations=None,
        )

    def __enter__(self):
        """Save the paper text and return the paper itself."""
        self.text.save()
        return self

    def __exit__(self, exc, value, tb):
        """Delete the saved paper text; any in-flight exception is not suppressed."""
        self.text.delete()
51+
52+
2853
arxiv_version_re = re.compile(r"v\d+$")
2954
def remove_arxiv_version(arxiv_id):
3055
return arxiv_version_re.sub("", arxiv_id)

sota_extractor2/data/structure.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ def fix_reference_hightlight(s):
9999
return partial_highlight_re.sub("xxref-", s)
100100

101101

102+
# Column schema of the evidence DataFrames built in this module. It is passed
# explicitly as ``columns=`` so that frames created from zero records still
# expose every column.
evidence_columns = [
    "text_sha1",
    "text_highlited",
    "text",
    "header",
    "cell_type",
    "cell_content",
    "cell_reference",
    "cell_layout",
    "cell_styles",
    "this_paper",
    "row",
    "col",
    "row_context",
    "col_context",
    "ext_id",
]
104+
105+
102106
def create_evidence_records(textfrag, cell, paper, table):
103107
for text_highlited in textfrag.meta['highlight']['text']:
104108
text_highlited = fix_reference_hightlight(fix_refs(text_highlited))
@@ -141,15 +145,19 @@ def evidence_for_table(paper, table, paper_limit, corpus_limit):
141145
row=cell.row, col=cell.col, paper_limit=paper_limit, corpus_limit=corpus_limit)
142146
for record in create_evidence_records(evidence, cell, paper=paper, table=table)
143147
]
144-
df = pd.DataFrame.from_records(records)
148+
df = pd.DataFrame.from_records(records, columns=evidence_columns)
145149
return df
146150

147151

148152
def prepare_data(paper, tables, csv_path, limit_type='interesting'):
149-
df = pd.concat([evidence_for_table(paper, table,
153+
data = [evidence_for_table(paper, table,
150154
paper_limit=100,
151155
corpus_limit=20,
152-
limit_type=limit_type) for table in progress_bar(tables)])
156+
limit_type=limit_type) for table in progress_bar(tables)]
157+
if len(data):
158+
df = pd.concat(data)
159+
else:
160+
df = pd.DataFrame(columns=evidence_columns)
153161
#moved to experiment preprocessing
154162
#df = df.drop_duplicates(
155163
# ["cell_content", "text_highlited", "cell_type", "this_paper"])
@@ -168,4 +176,4 @@ def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
168176
dfs = [evidence_for_table(paper, table, paper_limit, corpus_limit) for table in tables]
169177
if len(dfs):
170178
return pd.concat(dfs)
171-
return pd.DataFrame()
179+
return pd.DataFrame(columns=evidence_columns)

sota_extractor2/helpers/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
from .temp_paper import TempPaper
21
from .latex_converter import LatexConverter
32
from .unpack import Unpack
43

5-
__all__ = ["TempPaper", "LatexConverter", "Unpack"]
4+
__all__ = ["LatexConverter", "Unpack"]

sota_extractor2/helpers/temp_paper.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

sota_extractor2/helpers/unpack.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,21 @@
44
from pathlib import Path
55
from shutil import copyfileobj
66
from sota_extractor2.errors import UnpackError
7+
from ..pipeline_logger import pipeline_logger
78

89

910
class Unpack:
11+
step = "unpack"
12+
1013
def __init__(self):
1114
self.magic = Magic(mime=True, uncompress=True)
1215

1316
def __call__(self, source, dest):
17+
pipeline_logger(f"{Unpack.step}::call", source=source, dest=dest)
1418
source = Path(source)
1519
dest = Path(dest)
1620
mime = self.magic.from_file(str(source))
21+
pipeline_logger(f"{Unpack.step}::detect_mime", source=source, mime=mime)
1722
if mime == 'application/x-tar':
1823
dest.mkdir(parents=True, exist_ok=True)
1924
with tarfile.open(source, "r:*") as tar:

sota_extractor2/loggers.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import sys
2+
import pandas as pd
3+
from .models.structure.experiment import Experiment, label_map, Labels
4+
from .models.structure.type_predictor import TableType
5+
6+
7+
class BaseLogger:
    """Base class for pipeline listeners.

    Constructing an instance subscribes it to ``pipeline_logger`` under
    ``pattern``. Subclasses must override ``__call__``, which receives the
    step name and that step's keyword payload.
    """

    def __init__(self, pipeline_logger, pattern=".*"):
        """Register this instance as the handler for ``pattern``.

        ``pattern`` is passed through to ``pipeline_logger.register`` unchanged
        (presumably a regex matched against step names -- confirm against the
        pipeline logger implementation).
        """
        pipeline_logger.register(pattern, self)

    def __call__(self, step, **kwargs):
        """Handle one pipeline step; not implemented in the base class."""
        raise NotImplementedError
13+
14+
15+
class StdoutLogger:
    """Pipeline listener that prints every step together with its keyword payload.

    The instance registers itself for all steps (pattern ``".*"``).
    """

    def __init__(self, pipeline_logger, file=None):
        """Subscribe to every pipeline step.

        :param pipeline_logger: object exposing ``register(pattern, handler)``
        :param file: writable text stream to print to. ``None`` (the default)
            means "whatever ``sys.stdout`` is when a step is logged".
        """
        # A ``file=sys.stdout`` default is evaluated once, when the class body is
        # executed, and would keep pointing at that stream even after stdout is
        # redirected (contextlib.redirect_stdout, test capture, notebooks).
        # Storing None defers the lookup to print() at call time.
        self.file = file
        pipeline_logger.register(".*", self)

    def __call__(self, step, **kwargs):
        """Print ``[STEP] <step>: <kwargs>`` to the configured stream."""
        # print() treats file=None as the current sys.stdout.
        print(f"[STEP] {step}: {kwargs}", file=self.file)
22+
23+
24+
class StructurePredictionEvaluator:
    """Pipeline listener that compares predicted table structure with gold annotations.

    Subscribes to the ``type_prediction::predicted`` and
    ``structure_prediction::tables_labelled`` steps. After a paper's tables
    are labelled, ``self.results[paper_id]`` holds two DataFrames:

    * ``'type'``  -- one row per compared table (predicted vs. gold "important" flag)
    * ``'cells'`` -- one row per cell of every table predicted as important

    Both frames always carry their full column set, even when they have no rows.
    """

    # Explicit schemas: a frame built from an empty record list would otherwise
    # have no columns, and ``metrics`` would fail on ``.predicted`` / ``.gold``.
    _type_columns = ["predicted", "gold", "name"]
    _cell_columns = ["predicted", "gold", "ext_id", "content"]

    def __init__(self, pipeline_logger, pc):
        """Register the step handlers.

        :param pipeline_logger: object exposing ``register(pattern, handler)``
        :param pc: iterable of gold papers; matched to processed papers by ``text.title``
        """
        pipeline_logger.register("structure_prediction::tables_labelled", self.on_tables_labelled)
        pipeline_logger.register("type_prediction::predicted", self.on_type_predicted)
        self.pc = pc
        self.results = {}
        self.type_predictions = {}

    def on_type_predicted(self, step, paper, tables, predictions):
        """Remember the table-type predictions of ``paper`` for later comparison."""
        self.type_predictions[paper.paper_id] = predictions

    def on_tables_labelled(self, step, paper, tables):
        """Collect per-table and per-cell (predicted, gold) pairs for ``paper``.

        The comparison is made only when exactly one gold paper has the same
        title; otherwise both result frames are stored empty.
        """
        golds = [p for p in self.pc if p.text.title == paper.text.title]
        paper_id = paper.paper_id
        type_results = []
        cells_results = []
        if len(golds) == 1:
            gold = golds[0]
            predicted_types = self.type_predictions.get(paper_id, [])
            # zip() stops at the shortest sequence, so nothing is compared when
            # no type predictions were recorded for this paper.
            for gold_table, table, table_type in zip(gold.tables, paper.tables, predicted_types):
                is_important = table_type == TableType.SOTA or table_type == TableType.ABLATION
                gold_is_important = "sota" in gold_table.gold_tags or "ablation" in gold_table.gold_tags
                type_results.append({"predicted": is_important, "gold": gold_is_important, "name": table.name})
                if not is_important:
                    continue
                # NOTE(review): assumes the gold table has at least the same
                # shape as the predicted one -- confirm.
                rows, cols = table.df.shape
                for r in range(rows):
                    for c in range(cols):
                        predicted_cell = table.df.iloc[r, c]
                        cells_results.append({
                            "predicted": predicted_cell.gold_tags,
                            "gold": gold_table.df.iloc[r, c].gold_tags,
                            "ext_id": f"{table.name}/{r}.{c}",
                            "content": predicted_cell.value
                        })

        self.results[paper_id] = {
            'type': pd.DataFrame(type_results, columns=self._type_columns),
            'cells': pd.DataFrame(cells_results, columns=self._cell_columns)
        }

    def map_tags(self, tags):
        """Map a Series of tag strings to label values.

        Tags are stripped first; the empty tag maps to ``Labels.EMPTY.value``
        and any tag missing from ``label_map`` maps to ``0``.
        """
        mapping = dict(label_map)
        mapping[""] = Labels.EMPTY.value
        return tags.str.strip().apply(lambda x: mapping.get(x, 0))

    def metrics(self, paper_id):
        """Show structure-prediction results for ``paper_id``.

        Prints a notice and returns ``None`` when nothing was recorded for the
        paper or when there are no cell pairs to evaluate.
        """
        if paper_id not in self.results:
            print(f"No annotations for {paper_id}")
            return
        cells_df = self.results[paper_id]['cells']
        if cells_df.empty:
            # No unambiguous gold paper, or no table was predicted as important.
            print(f"No labelled cells to evaluate for {paper_id}")
            return
        print("Structure prediction:")
        e = Experiment()
        e._set_results(paper_id, self.map_tags(cells_df.predicted), self.map_tags(cells_df.gold))
        e.show_results(paper_id, normalize=True)
78+
79+
80+
class LinkerEvaluator:
    """Pipeline listener that records linking proposals and per-cell top-k matches.

    Subscribes to the ``linking::*`` steps. ``self.proposals`` maps a paper id
    to a deep copy of its proposals; ``self.topk`` maps
    ``(paper_id, table_name, row, col)`` to a deep copy of that cell's top-k
    frame.
    """

    def __init__(self, pipeline_logger, pc):
        """Register the step handlers.

        :param pipeline_logger: object exposing ``register(pattern, handler)``
        :param pc: gold paper collection; kept on the instance for parity with
            ``StructurePredictionEvaluator`` (not used by the handlers below)
        """
        pipeline_logger.register("linking::call", self.on_before_linking)
        pipeline_logger.register("linking::taxonomy_linking::call", self.on_before_taxonomy)
        pipeline_logger.register("linking::taxonomy_linking::topk", self.on_taxonomy_topk)
        pipeline_logger.register("linking::linked", self.on_after_linking)
        self.pc = pc
        self.proposals = {}
        self.topk = {}

    def on_before_linking(self, step, paper, tables):
        """Hook called before linking starts; intentionally a no-op."""
        pass

    def on_after_linking(self, step, paper, tables, proposals):
        """Store a deep copy of ``proposals`` so later mutation cannot affect it."""
        self.proposals[paper.paper_id] = proposals.copy(deep=True)

    def on_before_taxonomy(self, step, ext_id, query, datasets, caption):
        """Hook called before taxonomy linking of a cell; intentionally a no-op."""
        pass

    def on_taxonomy_topk(self, step, ext_id, topk):
        """Store a deep copy of ``topk`` under the cell addressed by ``ext_id``.

        ``ext_id`` has the form ``<paper_id>/<table_name>/<row>.<col>``.
        """
        # Split from the right: only the last two '/' are structural, so a paper
        # id that itself contains '/' is kept intact instead of raising ValueError.
        paper_id, table_name, rc = ext_id.rsplit('/', 2)
        row, col = (int(x) for x in rc.split('.'))
        self.topk[paper_id, table_name, row, col] = topk.copy(deep=True)

    def top_matches(self, paper_id, table_name, row, col):
        """Return the stored top-k frame for a cell; raises ``KeyError`` if none was recorded."""
        return self.topk[(paper_id, table_name, row, col)]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
class LatexConverterMock:
    """Stand-in for a LaTeX converter that always returns the same canned HTML."""

    def __init__(self, mock_file):
        """Load the canned output from ``mock_file``.

        The file is read as UTF-8 explicitly, so the result does not depend on
        the platform's default locale encoding.
        """
        with open(mock_file, "r", encoding="utf-8") as f:
            self.mock = f.read()

    def to_html(self, source_dir):
        """Return the canned content; ``source_dir`` is ignored."""
        return self.mock

sota_extractor2/models/linking/bm25_naive.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,11 @@ def handle_pm(value):
169169
pass
170170
# %%
171171

172+
173+
# Column schema of the proposals DataFrames built in this module. It is passed
# explicitly as ``columns=`` so that a frame with no proposals still exposes
# every column.
proposal_columns = [
    'dataset',
    'metric',
    'task',
    'format',
    'raw_value',
    'model',
    'model_type',
    'cell_ext_id',
    'confidence',
    'parsed',
    'struct_model_type',
    'struct_dataset',
]
175+
176+
172177
def generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, datasets):
173178
# %%
174179
# Proposal generation
@@ -249,9 +254,8 @@ def linked_proposals(proposals):
249254
yield linked
250255

251256
# specify columns in case there's no proposal
252-
columns = ['dataset', 'metric', 'task', 'format', 'raw_value', 'model', 'model_type', 'cell_ext_id', 'confidence', 'parsed',
253-
'struct_model_type', 'struct_dataset']
254-
proposals = pd.DataFrame.from_records(list(linked_proposals(proposals)), columns=columns)
257+
258+
proposals = pd.DataFrame.from_records(list(linked_proposals(proposals)), columns=proposal_columns)
255259

256260
if len(proposals):
257261
proposals["parsed"]=proposals[["raw_value", "format"]].apply(
@@ -274,7 +278,9 @@ def linked_proposals(paper_ext_id, paper, annotated_tables, taxonomy_linking=Mat
274278

275279
if 'sota' in tags and 'no_sota_records' not in tags: # only parse tables that are marked as sota
276280
proposals.append(generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, datasets))
277-
return pd.concat(proposals)
281+
if len(proposals):
282+
return pd.concat(proposals)
283+
return pd.DataFrame(columns=proposal_columns)
278284

279285

280286
def test_link_taxonomy():

sota_extractor2/models/linking/context_search.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,21 @@
99
import pandas as pd
1010
import numpy as np
1111

12+
from sota_extractor2.pipeline_logger import pipeline_logger
13+
1214
metrics = {
1315
'BLEU': ['bleu'],
1416
'BLEU score': ['bleu'],
15-
'Character Error Rate': ['cer'],
17+
'Character Error Rate': ['cer', 'cers'],
1618
'Error': ['error'],
1719
'Exact Match Ratio': ['exact match'],
1820
'F1': ['f1', 'f1 score'],
1921
'F1 score': ['f1', 'f1 score'],
2022
'MAP': ['map'],
21-
'Percentage error': ['wer', 'per', 'word error rate', 'word error rates', 'phoneme error rates',
23+
'Percentage error': ['wer', 'per', 'wers', 'pers', 'word error rate', 'word error rates', 'phoneme error rates',
2224
'phoneme error rate', 'error', 'error rate', 'error rates'],
23-
'Word Error Rate': ['wer', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'],
24-
'Word Error Rate (WER)': ['wer', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'],
25+
'Word Error Rate': ['wer', 'wers', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'],
26+
'Word Error Rate (WER)': ['wer', 'wers', 'word error rate', 'word error rates', 'error', 'error rate', 'error rates'],
2527
'ROUGE-1': ['r1'],
2628
'ROUGE-2': ['r2'],
2729
'ROUGE-F': ['rf'],
@@ -173,10 +175,10 @@ def match(self, contexts):
173175
return zip(keys, probs)
174176

175177
def __call__(self, query, datasets, caption, debug_info=None):
178+
cellstr = debug_info.cell.cell_ext_id
179+
pipeline_logger("linking::taxonomy_linking::call", ext_id=cellstr, query=query, datasets=datasets, caption=caption)
176180
datasets = " ".join(datasets)
177-
cell = debug_info.cell
178181
key = (datasets, caption, query)
179-
cellstr = f"{cell.table_ext_id}/{cell.row}.{cell.col}"
180182
###print(f"[DEBUG] {cellstr}")
181183
###print("[DEBUG]", debug_info)
182184
###print("query:", query, caption)
@@ -226,6 +228,7 @@ def __call__(self, query, datasets, caption, debug_info=None):
226228
else:
227229
print("[EA] No gold sota record found for the cell")
228230
# end of error analysis only
231+
pipeline_logger("linking::taxonomy_linking::topk", ext_id=cellstr, topk=p)
229232
return p.head(1)
230233

231234

0 commit comments

Comments
 (0)