3
3
from collections import namedtuple
4
4
import hashlib
5
5
from fastai .text import progress_bar
6
- from .elastic import Fragment
6
+ from .elastic import Fragment , setup_default_connection
7
7
from .json import *
8
+ from .table import reference_re , remove_text_styles , remove_references , style_tags_re
8
9
9
10
def get_all_tables (papers ):
10
11
for paper in papers :
@@ -13,11 +14,18 @@ def get_all_tables(papers):
13
14
table .paper_id = paper .arxiv_id
14
15
yield table
15
16
16
def consume_cells(table):
    """Yield one annotated cell for every position of ``table.df``.

    Each yielded ``AnnCell`` carries its ``row``/``col`` indices plus a
    ``vals`` list of:
      [cleaned text (styles/references stripped), "" placeholder,
       first reference id (or ""), cell layout,
       whether the raw value contained style tags].
    """
    Cell = namedtuple('AnnCell', 'row col vals')
    for r, row_cells in enumerate(table.df.values):
        for c, cell in enumerate(row_cells):
            raw = cell.raw_value
            first_ref = cell.refs[0] if cell.refs else ""
            vals = [
                remove_text_styles(remove_references(raw)),
                "",
                first_ref,
                cell.layout,
                bool(style_tags_re.search(raw)),
            ]
            yield Cell(row=r, col=c, vals=vals)
21
29
22
30
23
31
reference_re = re .compile (r"\[[^]]*\]" )
@@ -38,10 +46,12 @@ def empty_fragment(paper_id):
38
46
return fragment
39
47
40
48
41
- def fetch_evidence (cell_content , cell_reference , paper_id , paper_limit = 10 , corpus_limit = 10 ):
49
+ def fetch_evidence (cell_content , cell_reference , paper_id , table_name , row , col , paper_limit = 10 , corpus_limit = 10 ):
50
+ if not filter_cells (cell_content ):
51
+ return [empty_fragment (paper_id )]
42
52
cell_content = clear_cell (cell_content )
43
53
if cell_content == "" and cell_reference == "" :
44
- return []
54
+ return [empty_fragment ( paper_id ) ]
45
55
46
56
evidence_query = Fragment .search ().highlight (
47
57
'text' , pre_tags = "<b>" , post_tags = "</b>" , fragment_size = 400 )
@@ -65,8 +75,11 @@ def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpu
65
75
other_fagements = list (evidence_query
66
76
.exclude ('term' , paper_id = paper_id )
67
77
.query ('match_phrase' , text = query )[:corpus_limit ])
68
- if not len (paper_fragments ) and not len (reference_fragments ) and not len (other_fagements ):
69
- print (f"No evidences for '{ cell_content } ' of { paper_id } " )
78
+
79
+ ext_id = f"{ paper_id } /{ table_name } /{ row } .{ col } "
80
+ ####print(f"{ext_id} |{cell_content}|: {len(paper_fragments)} paper fragments, {len(reference_fragments)} reference fragments, {len(other_fagements)} other fragments")
81
+ # if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements):
82
+ # print(f"No evidences for '{cell_content}' of {paper_id}")
70
83
if not len (paper_fragments ) and not len (reference_fragments ):
71
84
paper_fragments = [empty_fragment (paper_id )]
72
85
return paper_fragments + reference_fragments + other_fagements
@@ -86,13 +99,13 @@ def fix_reference_hightlight(s):
86
99
return partial_highlight_re .sub ("xxref-" , s )
87
100
88
101
89
- def create_evidence_records (textfrag , cell , table ):
102
+ def create_evidence_records (textfrag , cell , paper , table ):
90
103
for text_highlited in textfrag .meta ['highlight' ]['text' ]:
91
104
text_highlited = fix_reference_hightlight (fix_refs (text_highlited ))
92
105
text = highlight_re .sub ("" , text_highlited )
93
106
text_sha1 = hashlib .sha1 (text .encode ("utf-8" )).hexdigest ()
94
107
95
- cell_ext_id = f"{ table .ext_id } /{ cell .row } /{ cell .col } "
108
+ cell_ext_id = f"{ paper . paper_id } / { table .name } /{ cell .row } /{ cell .col } "
96
109
97
110
yield {"text_sha1" : text_sha1 ,
98
111
"text_highlited" : text_highlited ,
@@ -103,46 +116,53 @@ def create_evidence_records(textfrag, cell, table):
103
116
"cell_reference" : cell .vals [2 ],
104
117
"cell_layout" : cell .vals [3 ],
105
118
"cell_styles" : cell .vals [4 ],
106
- "this_paper" : textfrag .paper_id == table .paper_id ,
119
+ "this_paper" : textfrag .paper_id == paper .paper_id ,
107
120
"row" : cell .row ,
108
121
"col" : cell .col ,
109
- "row_context" : " border " .join ([str (s ) for s in table .matrix [cell .row ]]),
110
- "col_context" : " border " .join ([str (s ) for s in table .matrix [:, cell .col ]]),
122
+ "row_context" : " border " .join ([str (s ) for s in table .matrix . values [cell .row ]]),
123
+ "col_context" : " border " .join ([str (s ) for s in table .matrix . values [:, cell .col ]]),
111
124
"ext_id" : cell_ext_id
112
125
#"table_id":table_id
113
126
}
114
127
115
128
116
def filter_cells(cell_content):
    """Return True when *cell_content* contains two or more consecutive ASCII letters."""
    match = re.search("[a-zA-Z]{2,}", cell_content)
    return match is not None
118
131
119
132
120
133
interesting_types = ["model-paper" , "model-best" , "model-competing" , "dataset" , "dataset-sub" , "dataset-task" ]
121
134
122
135
123
def evidence_for_table(paper, table, paper_limit, corpus_limit):
    """Collect evidence records for every cell of *table* and return them as a DataFrame.

    For each cell, fetches matching text fragments (up to *paper_limit* from
    the paper itself / its references, *corpus_limit* from the rest of the
    corpus) and expands them into flat evidence records.
    """
    collected = []
    for cell in consume_cells(table):
        fragments = fetch_evidence(
            cell.vals[0], cell.vals[2],
            paper_id=paper.paper_id, table_name=table.name,
            row=cell.row, col=cell.col,
            paper_limit=paper_limit, corpus_limit=corpus_limit)
        for fragment in fragments:
            collected.extend(
                create_evidence_records(fragment, cell, paper=paper, table=table))
    return pd.DataFrame.from_records(collected)
136
146
137
147
138
def prepare_data(paper, tables, csv_path, limit_type='interesting'):
    """Gather text evidence for all *tables* of *paper* and write it to *csv_path*.

    Parameters
    ----------
    paper : object exposing ``paper_id``; forwarded to ``evidence_for_table``.
    tables : iterable of table objects to process.
    csv_path : pathlib.Path of the output CSV; parent directories are created.
    limit_type : kept only for backward compatibility with existing callers;
        ``evidence_for_table`` no longer has limit-type logic, so it is unused.
    """
    # BUG FIX: evidence_for_table(paper, table, paper_limit, corpus_limit)
    # no longer accepts a limit_type keyword — passing it raised TypeError.
    df = pd.concat([evidence_for_table(paper, table,
                                       paper_limit=100,
                                       corpus_limit=20)
                    for table in progress_bar(tables)])
    # moved to experiment preprocessing:
    # df = df.drop_duplicates(
    #     ["cell_content", "text_highlited", "cell_type", "this_paper"])
    print("Number of text fragments ", len(df))

    csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=None)
160
+
161
+
162
class CellEvidenceExtractor:
    """Callable that collects cell evidence for all tables of a paper."""

    def __init__(self):
        # TODO: make sure this can be called more than once, or refactor to a singleton
        setup_default_connection()

    def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
        """Return the concatenated evidence DataFrames of every table."""
        frames = [evidence_for_table(paper, table, paper_limit, corpus_limit)
                  for table in tables]
        return pd.concat(frames)
0 commit comments