@@ -20,7 +20,7 @@ def consume_cells(table):
         for col_id, cell in enumerate(row):
             vals = [
                 remove_text_styles(remove_references(cell.raw_value)),
-                "",
+                cell.gold_tags,
                 cell.refs[0] if cell.refs else "",
                 cell.layout,
                 bool(style_tags_re.search(cell.raw_value))
@@ -103,13 +103,13 @@ def fix_reference_hightlight(s):
                     "cell_layout", "cell_styles", "this_paper", "row", "col", "row_context", "col_context", "ext_id"]


-def create_evidence_records(textfrag, cell, paper, table):
+def create_evidence_records(textfrag, cell, paper_id, table):
     for text_highlited in textfrag.meta['highlight']['text']:
         text_highlited = fix_reference_hightlight(fix_refs(text_highlited))
         text = highlight_re.sub("", text_highlited)
         text_sha1 = hashlib.sha1(text.encode("utf-8")).hexdigest()

-        cell_ext_id = f"{paper.paper_id}/{table.name}/{cell.row}/{cell.col}"
+        cell_ext_id = f"{paper_id}/{table.name}/{cell.row}/{cell.col}"

         yield {"text_sha1": text_sha1,
                "text_highlited": text_highlited,
@@ -120,7 +120,7 @@ def create_evidence_records(textfrag, cell, paper, table):
                "cell_reference": cell.vals[2],
                "cell_layout": cell.vals[3],
                "cell_styles": cell.vals[4],
-               "this_paper": textfrag.paper_id == paper.paper_id,
+               "this_paper": textfrag.paper_id == paper_id,
                "row": cell.row,
                "col": cell.col,
                "row_context": " border ".join([str(s) for s in table.matrix.values[cell.row]]),
@@ -137,23 +137,22 @@ def filter_cells(cell_content):
 interesting_types = ["model-paper", "model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"]


-def evidence_for_table(paper, table, paper_limit, corpus_limit):
+def evidence_for_table(paper_id, table, paper_limit, corpus_limit):
     records = [
         record
         for cell in consume_cells(table)
-        for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=paper.paper_id, table_name=table.name,
+        for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=paper_id, table_name=table.name,
                                        row=cell.row, col=cell.col, paper_limit=paper_limit, corpus_limit=corpus_limit)
-        for record in create_evidence_records(evidence, cell, paper=paper, table=table)
+        for record in create_evidence_records(evidence, cell, paper_id=paper_id, table=table)
     ]
     df = pd.DataFrame.from_records(records, columns=evidence_columns)
     return df


-def prepare_data(paper, tables, csv_path, limit_type='interesting'):
-    data = [evidence_for_table(paper, table,
+def prepare_data(tables, csv_path):
+    data = [evidence_for_table(table.paper_id, table,
                                paper_limit=100,
-                               corpus_limit=20,
-                               limit_type=limit_type) for table in progress_bar(tables)]
+                               corpus_limit=20) for table in progress_bar(tables)]
     if len(data):
         df = pd.concat(data)
     else:
@@ -173,7 +172,7 @@ def __init__(self):
         setup_default_connection()

     def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
-        dfs = [evidence_for_table(paper, table, paper_limit, corpus_limit) for table in tables]
+        dfs = [evidence_for_table(paper.paper_id, table, paper_limit, corpus_limit) for table in tables]
         if len(dfs):
             return pd.concat(dfs)
         return pd.DataFrame(columns=evidence_columns)
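Taken together, these hunks thread a plain paper_id string through the evidence pipeline instead of a full paper object: create_evidence_records and evidence_for_table now take paper_id, prepare_data reads the id from each table, and the callable extractor forwards paper.paper_id. A minimal usage sketch under the new signatures; everything outside the diff (the extractor class name, how tables and paper are obtained) is assumed rather than shown here:

    # Hypothetical driver code; only the call signatures come from this commit.
    df = prepare_data(tables, csv_path)                  # each table must expose .paper_id and .name

    finder = EvidenceFinder()                            # class name assumed; the diff only shows __init__/__call__
    evidence_df = finder(paper, tables, paper_limit=30, corpus_limit=10)  # forwards paper.paper_id internally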