@@ -73,22 +73,29 @@ def filter_cells(cell):
73
73
return re .search ("[a-zA-Z]{2,}" , cell .vals [1 ]) is not None
74
74
75
75
76
# Cell types for which evidence_for_table lifts the evidence limits when it
# is called with limit_type='interesting'.
interesting_types = [
    "model-paper",
    "model-best",
    "model-competing",
    "dataset",
    "dataset-sub",
    "dataset-task",
]


def evidence_for_table(table, paper_limit=10, corpus_limit=1, limit_type='interesting'):
    """Collect evidence records for the cells of ``table`` into a DataFrame.

    Cells come from ``consume_cells(table.matrix, table.matrix_gold_tags)``
    and are kept only when ``filter_cells`` accepts them.  For every kept
    cell, ``fetch_evidence`` is queried with ``cell.vals[0]`` and the
    table's ``paper_id``; each returned evidence item is expanded with
    ``create_evidence_records``.

    Args:
        table: object exposing ``matrix``, ``matrix_gold_tags`` and
            ``paper_id``.
        paper_limit: ``paper_limit`` forwarded to ``fetch_evidence`` when
            the limits are not lifted for a cell.
        corpus_limit: ``corpus_limit`` forwarded to ``fetch_evidence`` when
            the limits are not lifted for a cell.
        limit_type: ``'max'`` lifts both limits to 1000 for every cell;
            ``'interesting'`` lifts them only for cells whose stripped
            ``cell.vals[1]`` is listed in ``interesting_types``; any other
            value always uses ``paper_limit`` / ``corpus_limit``.

    Returns:
        ``pd.DataFrame`` built with ``DataFrame.from_records`` from all
        collected records.
    """
    lifted_limits = {"paper_limit": 1000, "corpus_limit": 1000}
    regular_limits = {"paper_limit": paper_limit, "corpus_limit": corpus_limit}

    def get_limits(cell_type):
        # 'max' applies to every cell; 'interesting' depends on the cell type.
        if limit_type == 'max':
            lift = True
        elif limit_type == 'interesting':
            lift = cell_type.strip() in interesting_types
        else:
            lift = False
        # Hand out a fresh dict on every call, as keyword arguments source.
        return dict(lifted_limits if lift else regular_limits)

    collected = []
    for cell in consume_cells(table.matrix, table.matrix_gold_tags):
        if not filter_cells(cell):
            continue
        limits = get_limits(cell.vals[1])
        found = fetch_evidence(cell.vals[0], paper_id=table.paper_id, **limits)
        for evidence in found:
            for record in create_evidence_records(evidence, cell, table=table):
                collected.append(record)

    return pd.DataFrame.from_records(collected)
85
92
86
93
87
- def evidence_for_tables (tables , paper_limit = 100 , corpus_limit = 20 ):
88
- return pd .concat ([evidence_for_table (table , paper_limit = paper_limit , corpus_limit = corpus_limit ) for table in progress_bar ( tables )])
89
-
90
- def prepare_data ( tables , csv_path ):
91
- df = evidence_for_tables (tables )
94
+ def prepare_data (tables , csv_path , limit_type = 'interesting' ):
95
+ df = pd .concat ([evidence_for_table (table ,
96
+ paper_limit = 100 ,
97
+ corpus_limit = 20 ,
98
+ limit_type = limit_type ) for table in progress_bar (tables )] )
92
99
df = df .drop_duplicates (
93
100
["cell_content" , "text_highlited" , "cell_type" , "this_paper" ])
94
101
print ("Number of text fragments " , len (df ))
0 commit comments