Add separate noise weights for metrics

mkardas · mkardas · commit ac2a5b5bc3ee · 2019-10-25T11:13:21.000+02:00
* add separate noise weights for metrics
* add noise dataset and metric probabilities
* make format parsing less sensitive to whitespace
* show non-parsed gold sota records
diff --git a/sota_extractor2/config.py b/sota_extractor2/config.py
@@ -28,3 +28,5 @@
 
 linking_models = datasets / "linking" / "models"
 linking_data = datasets / "linking" / "data"
+
+autodict = linking_data / "autodict"
diff --git a/sota_extractor2/loggers.py b/sota_extractor2/loggers.py
@@ -151,6 +151,7 @@ def __init__(self, pipeline_logger):
         pipeline_logger.register("linking::linked", self.on_after_linking)
         self.proposals = {}
         self.topk = {}
+        self.queries = {}
 
     def on_before_linking(self, step, paper, tables):
         pass
@@ -159,7 +160,7 @@ def on_after_linking(self, step, paper, tables, proposals):
         self.proposals[paper.paper_id] = proposals.copy(deep=True)
 
     def on_before_taxonomy(self, step, ext_id, query, datasets, caption):
-        pass
+        self.queries[ext_id] = (query, datasets, caption)
 
     def on_taxonomy_topk(self, step, ext_id, topk):
         paper_id, table_name, rc = ext_id.split('/')
diff --git a/sota_extractor2/models/linking/acronym_extractor.py b/sota_extractor2/models/linking/acronym_extractor.py
@@ -1,6 +1,6 @@
 import spacy
 from scispacy.abbreviation import AbbreviationDetector
-from .utils import normalize_cell, normalize_dataset_ws
+from .utils import normalize_cell, normalize_dataset
 
 class AcronymExtractor:
     def __init__(self):
@@ -14,7 +14,7 @@ def __call__(self, text):
         abbrvs = {}
         for abrv in doc._.abbreviations:
             # abbrvs.setdefault(normalize_cell(str(abrv)), Counter())[str(abrv._.long_form)] += 1
-            norm = normalize_cell(normalize_dataset_ws(str(abrv)))
+            norm = normalize_cell(normalize_dataset(str(abrv)))
             if norm != '':
-                abbrvs[norm] = normalize_cell(normalize_dataset_ws(str(abrv._.long_form)))
+                abbrvs[norm] = normalize_cell(normalize_dataset(str(abrv._.long_form)))
         return abbrvs
diff --git a/sota_extractor2/models/linking/context_search.py b/sota_extractor2/models/linking/context_search.py
@@ -3,7 +3,7 @@
 
 from sota_extractor2.models.linking.acronym_extractor import AcronymExtractor
 from sota_extractor2.models.linking.probs import get_probs, reverse_probs
-from sota_extractor2.models.linking.utils import normalize_dataset_ws, normalize_cell, normalize_cell_ws
+from sota_extractor2.models.linking.utils import normalize_dataset, normalize_cell, normalize_cell_ws
 from scipy.special import softmax
 import re
 import pandas as pd
@@ -108,16 +108,16 @@
     'Rain100L': ['rain100l'],
     'Rain12': ['rain12'],
     'Rain800': ['rain800'],
-    'Rain1400': ['rain1400'], 
-    'Real Rain': ['real rain'],    
-    'Rain in Surveillance': ['ris'],   
-    'Rain in Driving': ['rid'],   
+    'Rain1400': ['rain1400'],
+    'Real Rain': ['real rain'],
+    'Rain in Surveillance': ['ris'],
+    'Rain in Driving': ['rid'],
     'DID-MDN': ['did-mdn'],
     'SOTS': ['sots'],
     'Test 1': ['test 1'],
     'RainSynLight25': ['rainsynlight25'],
-    'RainSynComplex25': ['rainsyncomplex25'],    
-    'NTURain': ['nturain'],    
+    'RainSynComplex25': ['rainsyncomplex25'],
+    'NTURain': ['nturain'],
     'RainSynAll100': ['rainsynall100'],
     'SPA-DATA': ['spa-data'],
     'LasVR': ['lasvar'],
@@ -143,8 +143,8 @@
 #     return re.compile(r'(?:^|\s+)' + escaped_ws_re.sub(r'\\s*', re.escape(name.strip())) + r'(?:$|\s+)', re.I)
 
 #all_datasets = set(k for k,v in merged_p.items() if k != '' and not re.match("^\d+$", k) and v.get('NOMATCH', 0.0) < 0.9)
-all_datasets = set(y for x in datasets.values() for y in x)
-all_metrics = set(y for x in metrics.values() for y in x)
+all_datasets = set(normalize_cell_ws(normalize_dataset(y)) for x in datasets.values() for y in x)
+all_metrics = set(normalize_cell_ws(y) for x in metrics.values() for y in x)
 #all_metrics = set(metrics_p.keys())
 
 # all_datasets_re = {x:name_to_re(x) for x in all_datasets}
@@ -201,7 +201,7 @@ def dummy_item(reason):
 
 
 @njit
-def compute_logprobs(taxonomy, reverse_merged_p, reverse_metrics_p, dss, mss, noise, logprobs):
+def compute_logprobs(taxonomy, reverse_merged_p, reverse_metrics_p, dss, mss, noise, ms_noise, ds_pb, ms_pb, logprobs):
     empty = typed.Dict.empty(types.unicode_type, types.float64)
     for i, (task, dataset, metric) in enumerate(taxonomy):
         logprob = 0.0
@@ -213,19 +213,19 @@ def compute_logprobs(taxonomy, reverse_merged_p, reverse_metrics_p, dss, mss, no
             #                         ds = long_form
             #                         break
             # if merged_p[ds].get('NOMATCH', 0.0) < 0.5:
-            logprob += np.log(noise * 0.001 + (1 - noise) * short_probs.get(ds, 0.0))
+            logprob += np.log(noise * ds_pb + (1 - noise) * short_probs.get(ds, 0.0))
         for ms in mss:
-            logprob += np.log(noise * 0.01 + (1 - noise) * met_probs.get(ms, 0.0))
+            logprob += np.log(ms_noise * ms_pb + (1 - ms_noise) * met_probs.get(ms, 0.0))
         logprobs[i] += logprob
         #logprobs[(dataset, metric)] = logprob
 
 
 class ContextSearch:
-    def __init__(self, taxonomy, context_noise=(0.5, 0.2, 0.1), debug_gold_df=None):
+    def __init__(self, taxonomy, context_noise=(0.5, 0.2, 0.1), metrics_noise=None, ds_pb=0.001, ms_pb=0.01, debug_gold_df=None):
         merged_p = \
-        get_probs({k: Counter([normalize_cell(normalize_dataset_ws(x)) for x in v]) for k, v in datasets.items()})[1]
+        get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in datasets.items()})[1]
         metrics_p = \
-        get_probs({k: Counter([normalize_cell(normalize_dataset_ws(x)) for x in v]) for k, v in metrics.items()})[1]
+        get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in metrics.items()})[1]
 
 
         self.queries = {}
@@ -235,6 +235,9 @@ def __init__(self, taxonomy, context_noise=(0.5, 0.2, 0.1), debug_gold_df=None):
             self._taxonomy.append(t)
         self.extract_acronyms = AcronymExtractor()
         self.context_noise = context_noise
+        self.metrics_noise = metrics_noise if metrics_noise else context_noise
+        self.ds_pb = ds_pb
+        self.ms_pb = ms_pb
         self.reverse_merged_p = self._numba_update_nested_dict(reverse_probs(merged_p))
         self.reverse_metrics_p = self._numba_update_nested_dict(reverse_probs(metrics_p))
         self.debug_gold_df = debug_gold_df
@@ -253,10 +256,10 @@ def _numba_extend_list(self, lst):
             l.append(x)
         return l
 
-    def compute_context_logprobs(self, context, noise, logprobs):
+    def compute_context_logprobs(self, context, noise, ms_noise, logprobs):
         context = context or ""
         abbrvs = self.extract_acronyms(context)
-        context = normalize_cell_ws(normalize_dataset_ws(context))
+        context = normalize_cell_ws(normalize_dataset(context))
         dss = set(find_datasets(context)) | set(abbrvs.keys())
         mss = set(find_metrics(context))
         dss -= mss
@@ -266,15 +269,16 @@ def compute_context_logprobs(self, context, noise, logprobs):
         ###print("mss", mss)
         dss = self._numba_extend_list(dss)
         mss = self._numba_extend_list(mss)
-        compute_logprobs(self._taxonomy, self.reverse_merged_p, self.reverse_metrics_p, dss, mss, noise, logprobs)
+        compute_logprobs(self._taxonomy, self.reverse_merged_p, self.reverse_metrics_p,
+                         dss, mss, noise, ms_noise, self.ds_pb, self.ms_pb, logprobs)
 
     def match(self, contexts):
         assert len(contexts) == len(self.context_noise)
         n = len(self._taxonomy)
         context_logprobs = np.zeros(n)
 
-        for context, noise in zip(contexts, self.context_noise):
-            self.compute_context_logprobs(context, noise, context_logprobs)
+        for context, noise, ms_noise in zip(contexts, self.context_noise, self.metrics_noise):
+            self.compute_context_logprobs(context, noise, ms_noise, context_logprobs)
         keys = self.taxonomy.taxonomy
         logprobs = context_logprobs
         #keys, logprobs = zip(*context_logprobs.items())
@@ -293,7 +297,7 @@ def __call__(self, query, datasets, caption, debug_info=None):
             # print(self.queries[key])
             # for context in key:
             #     abbrvs = self.extract_acronyms(context)
-            #     context = normalize_cell_ws(normalize_dataset_ws(context))
+            #     context = normalize_cell_ws(normalize_dataset(context))
             #     dss = set(find_datasets(context)) | set(abbrvs.keys())
             #     mss = set(find_metrics(context))
             #     dss -= mss
@@ -353,4 +357,4 @@ def from_paper(self, paper):
         return self(text)
 
     def __call__(self, text):
-        return find_datasets(normalize_cell_ws(normalize_dataset_ws(text)))
+        return find_datasets(normalize_cell_ws(normalize_dataset(text)))
diff --git a/sota_extractor2/models/linking/execution.py b/sota_extractor2/models/linking/execution.py
@@ -1,5 +1,6 @@
 import pandas as pd
 from django.db import connection
+from IPython.core.display import display
 
 from sota_extractor2.models.linking.metrics import Metrics
 from sota_extractor2.models.linking.format import extract_value
@@ -36,6 +37,13 @@ def fetch_gold_sota_records():
     gold_sota_records["parsed"] = gold_sota_records[["raw_value", "format"]].apply(
         lambda row: float(extract_value(row.raw_value, row.format)), axis=1)
 
+    unparsed = gold_sota_records[gold_sota_records["parsed"] != gold_sota_records["parsed"]]
+    if len(unparsed):
+        print("Found unparsed values")
+        display(unparsed.style.format({'cell_ext_id':
+            lambda x: f'<a target="labeler" href="http://10.0.1.145:8001/paper/{x}">{x}</a>'})
+        )
+
     gold_sota_records = gold_sota_records[gold_sota_records["parsed"] == gold_sota_records["parsed"]]
 
     strip_cols=["task", "dataset", "format", "metric",  "raw_value", "model", "model_type"]
diff --git a/sota_extractor2/models/linking/format.py b/sota_extractor2/models/linking/format.py
@@ -12,7 +12,10 @@ def format_to_regexp(format):
     fn=lambda x: x
     for i, s in enumerate(placeholders):
         if i % 2 == 0:
-            regexp += escaped_whitespace_re.sub(r"\\s+", re.escape(s))
+            if s.strip() == "":
+                regexp += escaped_whitespace_re.sub(r"\\s+", re.escape(s))
+            else:
+                regexp += escaped_whitespace_re.sub(r"\\s*", re.escape(s))
         elif s.strip() == "":
             regexp += float_value_nc.pattern
         else:
@@ -29,6 +32,6 @@ def extract_value(cell_value, format):
     cell_value = re.sub(r"\s+%", "%", cell_value)
     regexp, fn = format_to_regexp(format)
     match = regexp.match(cell_value.strip())
-    if match is None:
+    if match is None or not len(match.groups()):
         return Decimal('NaN')
     return fn(Decimal(match.group(1)))
diff --git a/sota_extractor2/models/linking/probs.py b/sota_extractor2/models/linking/probs.py
@@ -1,6 +1,19 @@
 from collections import Counter
 
+
 def get_probs(occurrences):
+    """
+    Computes conditional probabilities based on frequency of co-occurrences
+
+    Parameters
+    ----------
+    occurrences: occurrences[x][y] = number of times (X=x and Y=y) was observed
+
+    Returns
+    -------
+    probs : probs[x][y] = Pr(Y=y | X=x)
+    reverse_probs : reverse_probs[y][x] = Pr(X=x | Y=y)
+    """
     probs = {}
     reverse_probs = {}
     y_occ = Counter()
@@ -27,7 +40,7 @@ def reverse_probs(probs):
 
     Returns
     -------
-    reverse : reverse[y][x] = Pr(X=x | Y=y) assuming X and Y are uniform
+    reverse : reverse[y][x] = Pr(X=x | Y=y) assuming X is uniform
     """
     reverse = {}
     for x, probs_x in probs.items():
diff --git a/sota_extractor2/models/linking/proposals_filters.py b/sota_extractor2/models/linking/proposals_filters.py
@@ -86,8 +86,8 @@ def __init__(self, confidence=-1):
         self.confidence = confidence
 
     def _filter(self, proposals):
-        which = proposals.confidence > self.confidence
-        reason = "confidence " + proposals[~which].confidence.round(2).astype(str) + f" <= {self.confidence}"
+        which = proposals.confidence >= self.confidence
+        reason = "confidence " + proposals[~which].confidence.round(2).astype(str) + f" < {self.confidence}"
         return which, reason[~which]
 
     def log(self, **kwargs):
diff --git a/sota_extractor2/models/linking/utils.py b/sota_extractor2/models/linking/utils.py
@@ -15,7 +15,7 @@ def remove_parens(text):
     return parens_re.sub("", text)
 
 def clean_name(name):
-    return remove_parens(name.replace('\xa0', ' ').strip()).strip()
+    return remove_parens(unidecode(name).strip()).strip()
 
 def clean_cell(cell):
     return strip_nonalnum(clean_name(cell))