10
10
import re
11
11
import pandas as pd
12
12
import numpy as np
13
+ import json
13
14
import ahocorasick
14
15
from numba import njit , typed , types
16
+ from pathlib import Path
15
17
16
18
from axcell .pipeline_logger import pipeline_logger
17
19
18
20
from axcell .models .linking import manual_dicts
19
21
from collections import Counter
20
22
23
+
21
24
def dummy_item(reason):
    """Build a one-row placeholder result frame.

    Dataset, task and metric all carry *reason*; evidence is empty and
    confidence is zero, marking the row as a non-match.
    """
    placeholder = {
        "dataset": [reason],
        "task": [reason],
        "metric": [reason],
        "evidence": [""],
        "confidence": [0.0],
    }
    return pd.DataFrame(placeholder)
23
26
@@ -28,7 +31,9 @@ class EvidenceFinder:
28
31
end_letter_re = re .compile (r"\w\b" )
29
32
letter_re = re .compile (r"\w" )
30
33
31
- def __init__ (self , taxonomy ):
34
+ def __init__ (self , taxonomy , abbreviations_path = None , use_manual_dicts = False ):
35
+ self .abbreviations_path = abbreviations_path
36
+ self .use_manual_dicts = use_manual_dicts
32
37
self ._init_structs (taxonomy )
33
38
34
39
@staticmethod
@@ -58,6 +63,14 @@ def make_trie(names):
58
63
trie .make_automaton ()
59
64
return trie
60
65
66
+ @staticmethod
67
+ def get_auto_evidences (name , abbreviations , abbrvs_trie ):
68
+ frags = EvidenceFinder .find_names (normalize_dataset_ws (name ), abbrvs_trie )
69
+ evidences = []
70
+ for f in frags :
71
+ evidences .extend (abbreviations [f ])
72
+ return list (set (evidences ))
73
+
61
74
@staticmethod
62
75
def find_names (text , names_trie ):
63
76
text = text .lower ()
@@ -84,15 +97,30 @@ def find_tasks(self, text):
84
97
85
98
    def init_evidence_dicts(self, taxonomy):
        """Build the task/dataset/metric evidence dictionaries for *taxonomy*.

        Starts from the taxonomy's own names, optionally augments them with
        hand-curated synonyms (``use_manual_dicts``) and with abbreviation
        expansions loaded from a JSON file (``abbreviations_path``), then
        appends evaluation-split keywords to every dataset's evidence list.
        """
        # Base evidences derived directly from the taxonomy.
        self.tasks, self.datasets, self.metrics = EvidenceFinder.get_basic_dicts(taxonomy)

        # Optionally fold in the hand-curated synonym dictionaries.
        if self.use_manual_dicts:
            EvidenceFinder.merge_evidences(self.tasks, manual_dicts.tasks)
            EvidenceFinder.merge_evidences(self.datasets, manual_dicts.datasets)
            EvidenceFinder.merge_evidences(self.metrics, manual_dicts.metrics)

        # Optionally expand dataset/metric names via an abbreviations JSON
        # file (presumably mapping abbreviation fragments to their
        # expansions — see get_auto_evidences).
        if self.abbreviations_path is not None:
            with Path(self.abbreviations_path).open('rt') as f:
                abbreviations = json.load(f)
            # Trie over abbreviation keys for fast substring matching.
            abbrvs_trie = EvidenceFinder.make_trie(list(abbreviations.keys()))

            ds_auto = {x: EvidenceFinder.get_auto_evidences(x, abbreviations, abbrvs_trie) for x in taxonomy.datasets}
            ms_auto = {x: EvidenceFinder.get_auto_evidences(x, abbreviations, abbrvs_trie) for x in taxonomy.metrics}

            EvidenceFinder.merge_evidences(self.datasets, ds_auto)
            EvidenceFinder.merge_evidences(self.metrics, ms_auto)

        # Tag each dataset with the split keywords it implies: names
        # containing 'val' get validation synonyms, all others get 'test'.
        self.datasets = {k: (v + ['test'] if 'val' not in k else v + ['validation', 'dev', 'development']) for k, v in
                         self.datasets.items()}
        # Manual-dict mode additionally hard-codes evidences for the
        # LibriSpeech dev splits, which the generic rules above miss.
        if self.use_manual_dicts:
            self.datasets.update({
                'LibriSpeech dev-clean': ['libri speech dev clean', 'libri speech', 'dev', 'clean', 'dev clean', 'development'],
                'LibriSpeech dev-other': ['libri speech dev other', 'libri speech', 'dev', 'other', 'dev other', 'development', 'noisy'],
            })
96
124
97
125
def _init_structs (self , taxonomy ):
98
126
self .init_evidence_dicts (taxonomy )
@@ -163,7 +191,10 @@ def _to_typed_list(iterable):
163
191
164
192
165
193
class ContextSearch :
166
- def __init__ (self , taxonomy , evidence_finder , context_noise = (0.5 , 0.1 , 0.2 , 0.2 , 0.1 ), metric_noise = None , task_noise = None ,
194
+ def __init__ (self , taxonomy , evidence_finder ,
195
+ context_noise = (0.99 , 1.0 , 1.0 , 0.25 , 0.01 ),
196
+ metric_noise = (0.99 , 1.0 , 1.0 , 0.25 , 0.01 ),
197
+ task_noise = (0.1 , 1.0 , 1.0 , 0.1 , 0.1 ),
167
198
ds_pb = 0.001 , ms_pb = 0.01 , ts_pb = 0.01 , debug_gold_df = None ):
168
199
merged_p = \
169
200
get_probs ({k : Counter ([normalize_cell (normalize_dataset (x )) for x in v ]) for k , v in evidence_finder .datasets .items ()})[1 ]
0 commit comments