Skip to content

Commit 7baae77

Browse files
committed
Text fragment selection limit increased
1 parent ee17f08 commit 7baae77

File tree

2 files changed

+15
-7
lines changed

2 files changed

+15
-7
lines changed

sota_extractor2/data/structure.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,22 +73,29 @@ def filter_cells(cell):
7373
return re.search("[a-zA-Z]{2,}", cell.vals[1]) is not None
7474

7575

76-
def evidence_for_table(table, paper_limit=10, corpus_limit=1):
76+
interesting_types = ["model-paper", "model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"]
77+
78+
79+
def evidence_for_table(table, paper_limit=10, corpus_limit=1, limit_type='interesting'):
80+
def get_limits(cell_type):
81+
if limit_type == 'interesting' and (cell_type.strip() in interesting_types) or (limit_type == 'max'):
82+
return dict(paper_limit=1000, corpus_limit=1000)
83+
return dict(paper_limit=paper_limit, corpus_limit=corpus_limit)
7784
records = [
7885
record
7986
for cell in consume_cells(table.matrix, table.matrix_gold_tags) if filter_cells(cell)
80-
for evidence in fetch_evidence(cell.vals[0], paper_id=table.paper_id, paper_limit=paper_limit, corpus_limit=corpus_limit)
87+
for evidence in fetch_evidence(cell.vals[0], paper_id=table.paper_id, **get_limits(cell.vals[1]))
8188
for record in create_evidence_records(evidence, cell, table=table)
8289
]
8390
df = pd.DataFrame.from_records(records)
8491
return df
8592

8693

87-
def evidence_for_tables(tables, paper_limit=100, corpus_limit=20):
88-
return pd.concat([evidence_for_table(table, paper_limit=paper_limit, corpus_limit=corpus_limit) for table in progress_bar(tables)])
89-
90-
def prepare_data(tables, csv_path):
91-
df = evidence_for_tables(tables)
94+
def prepare_data(tables, csv_path, limit_type='interesting'):
95+
df = pd.concat([evidence_for_table(table,
96+
paper_limit=100,
97+
corpus_limit=20,
98+
limit_type=limit_type) for table in progress_bar(tables)])
9299
df = df.drop_duplicates(
93100
["cell_content", "text_highlited", "cell_type", "this_paper"])
94101
print("Number of text fragments ", len(df))
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from fastai.text import *

0 commit comments

Comments
 (0)