@@ -20,7 +20,23 @@ def consume_cells(*matrix):
20
20
yield Cell (row = row_id , col = col_id , vals = cell_val )
21
21
22
22
23
- def fetch_evidence (cell_content , paper_id , paper_limit = 10 , corpus_limit = 10 ):
23
# Square-bracketed spans, e.g. citation markers such as "[12]".
reference_re = re.compile(r"\[[^]]*\]")
# The literal annotations "(our)" and "(ours)"; clear_cell does not use it.
ours_re = re.compile(r"\(ours?\)")
# Any parenthesised span that contains no ")" of its own.
all_parens_re = re.compile(r"\([^)]*\)")


def clear_cell(s):
    """Return *s* without bracketed or parenthesised spans.

    Every match of ``reference_re`` is removed first, then every match of
    ``all_parens_re``; the result is stripped of leading and trailing
    whitespace. Whitespace left in the middle of the string by a removed
    span is kept as is.
    """
    without_refs = reference_re.sub("", s)
    without_parens = all_parens_re.sub("", without_refs)
    return without_parens.strip()
33
+
34
+
35
+ def fetch_evidence (cell_content , cell_reference , paper_id , paper_limit = 10 , corpus_limit = 10 ):
36
+ cell_content = clear_cell (cell_content )
37
+ if cell_content == "" and cell_reference == "" :
38
+ return []
39
+
24
40
evidence_query = Fragment .search ().highlight (
25
41
'text' , pre_tags = "<b>" , post_tags = "</b>" , fragment_size = 400 )
26
42
cell_content = cell_content .replace ("\xa0 " , " " )
@@ -31,10 +47,21 @@ def fetch_evidence(cell_content, paper_id, paper_limit=10, corpus_limit=10):
31
47
paper_fragments = list (evidence_query
32
48
.filter ('term' , paper_id = paper_id )
33
49
.query ('match_phrase' , text = query )[:paper_limit ])
50
+ if cell_reference != "" :
51
+ reference_fragments = list (evidence_query
52
+ .filter ('term' , paper_id = paper_id )
53
+ .query ('match_phrase' , text = {
54
+ "query" : cell_reference ,
55
+ "slop" : 1
56
+ })[:paper_limit ])
57
+ else :
58
+ reference_fragments = []
34
59
other_fagements = list (evidence_query
35
60
.exclude ('term' , paper_id = paper_id )
36
61
.query ('match_phrase' , text = query )[:corpus_limit ])
37
- return paper_fragments + other_fagements
62
+ if not len (paper_fragments ) and not len (reference_fragments ) and not len (other_fagements ):
63
+ print (f"No evidences for '{ cell_content } ' of { paper_id } " )
64
+ return paper_fragments + reference_fragments + other_fagements
38
65
39
66
# Matches an unresolved-reference placeholder: either the literal "(?)" or a
# run of "?" preceded by whitespace and followed by whitespace or end of text.
# Raw string: "\(", "\?" and "\s" are invalid escapes in a normal literal and
# trigger a SyntaxWarning on current Python versions.
fix_refs_re = re.compile(r'\(\?\)|\s[?]+(\s|$)')
40
67
@@ -44,29 +71,34 @@ def fix_refs(text):
44
71
45
72
46
73
# Opening or closing <b> highlight tag.
highlight_re = re.compile("</?b>")
# A highlighted "xxref" followed by "-" where the text after the dash is not
# itself highlighted (no "<b>" immediately after the dash).
partial_highlight_re = re.compile(r"\<b\>xxref\</b\>-(?!\<b\>)")


def fix_reference_hightlight(s):
    """Drop the highlight from "xxref" when only that prefix was highlighted.

    Each ``<b>xxref</b>-`` that is not directly followed by ``<b>`` is
    replaced with plain ``xxref-``; all other text is returned unchanged.
    """
    unhighlighted = partial_highlight_re.sub("xxref-", s)
    return unhighlighted
47
79
48
80
49
81
def create_evidence_records(textfrag, cell, table):
    """Yield one evidence record per highlighted snippet of *textfrag*.

    The snippets are read from ``textfrag.meta['highlight']['text']``. Each
    one is passed through ``fix_refs`` and ``fix_reference_hightlight``; the
    ``<b>`` tags are then removed to obtain the plain text, whose UTF-8
    SHA-1 hex digest is stored alongside it.

    ``cell.vals`` is read positionally: index 0 is stored (after
    ``fix_refs``) as ``cell_content``, index 1 as ``cell_type`` and index 2
    as ``cell_reference``. ``this_paper`` is true when the fragment and the
    table carry the same ``paper_id``.
    """
    for snippet in textfrag.meta['highlight']['text']:
        highlighted = fix_reference_hightlight(fix_refs(snippet))
        plain = highlight_re.sub("", highlighted)
        digest = hashlib.sha1(plain.encode("utf-8")).hexdigest()

        # Identifier of the cell within its table: <table ext_id>/<row>/<col>.
        cell_ext_id = f"{table.ext_id}/{cell.row}/{cell.col}"

        record = {
            "text_sha1": digest,
            "text_highlited": highlighted,
            "text": plain,
            "header": textfrag.header,
            "cell_type": cell.vals[1],
            "cell_content": fix_refs(cell.vals[0]),
            "cell_reference": cell.vals[2],
            "this_paper": textfrag.paper_id == table.paper_id,
            "row": cell.row,
            "col": cell.col,
            "ext_id": cell_ext_id,
        }
        yield record
70
102
71
103
72
104
def filter_cells (cell ):
@@ -83,8 +115,8 @@ def get_limits(cell_type):
83
115
return dict (paper_limit = paper_limit , corpus_limit = corpus_limit )
84
116
records = [
85
117
record
86
- for cell in consume_cells (table .matrix , table .matrix_gold_tags ) if filter_cells (cell )
87
- for evidence in fetch_evidence (cell .vals [0 ], paper_id = table .paper_id , ** get_limits (cell .vals [1 ]))
118
+ for cell in consume_cells (table .matrix , table .matrix_gold_tags , table . matrix_references ) if filter_cells (cell )
119
+ for evidence in fetch_evidence (cell .vals [0 ], cell . vals [ 2 ], paper_id = table .paper_id , ** get_limits (cell .vals [1 ]))
88
120
for record in create_evidence_records (evidence , cell , table = table )
89
121
]
90
122
df = pd .DataFrame .from_records (records )
0 commit comments