Return the top-k best proposals per cell (topk defaults to 1)

Marcin Kardas · Marcin Kardas · commit b0d0daef1412 · 2020-01-16T18:31:04.000Z
diff --git a/sota_extractor2/models/linking/bm25_naive.py b/sota_extractor2/models/linking/bm25_naive.py
@@ -174,7 +174,7 @@ def handle_pm(value):
                     'confidence', 'parsed', 'struct_model_type', 'struct_dataset']
 
 
-def generate_proposals_for_table(table_ext_id,  matrix, structure, desc, taxonomy_linking, datasets):
+def generate_proposals_for_table(table_ext_id,  matrix, structure, desc, taxonomy_linking, datasets, topk=1):
     # %%
     # Proposal generation
     def consume_cells(matrix):
@@ -217,11 +217,6 @@ def annotations(r, c, type='model'):
 
     def linked_proposals(proposals):
         for prop in proposals:
-            df = taxonomy_linking(prop.dataset, datasets, desc, debug_info=prop)
-            assert len(df) == 1
-
-            metric = df['metric'][0]
-
             # heuristyic to handle accuracy vs error
             first_num = (list(handle_pm(prop.raw_value)) + [0])[0]
             format = "{x}"
@@ -234,24 +229,27 @@ def linked_proposals(proposals):
             if '%' in prop.raw_value:
                 format += '%'
 
-            # if ("error" in metric or "Error" in metric) and (first_num > 0.5):
-            if (metric.strip().lower() == "error") and (first_num > 0.5):
-                metric = "Accuracy"
-
-            linked = {
-                'dataset': df['dataset'][0],
-                'metric': metric,
-                'task': df['task'][0],
-                'format': format,
-                'raw_value': prop.raw_value,
-                'model': prop.model_name,
-                'model_type': prop.model_type,
-                'cell_ext_id': prop.cell.cell_ext_id,
-                'confidence': df['confidence'][0],
-                'struct_model_type': prop.model_type,
-                'struct_dataset': prop.dataset
-            }
-            yield linked
+            df = taxonomy_linking(prop.dataset, datasets, desc, topk=topk, debug_info=prop)
+            for _, row in df.iterrows():
+                metric = row['metric']
+                # if ("error" in metric or "Error" in metric) and (first_num > 0.5):
+                if (metric.strip().lower() == "error") and (first_num > 0.5):
+                    metric = "Accuracy"
+
+                linked = {
+                    'dataset': row['dataset'],
+                    'metric': metric,
+                    'task': row['task'],
+                    'format': format,
+                    'raw_value': prop.raw_value,
+                    'model': prop.model_name,
+                    'model_type': prop.model_type,
+                    'cell_ext_id': prop.cell.cell_ext_id,
+                    'confidence': row['confidence'],
+                    'struct_model_type': prop.model_type,
+                    'struct_dataset': prop.dataset
+                }
+                yield linked
 
     # specify columns in case there's no proposal
 
@@ -264,7 +262,7 @@ def linked_proposals(proposals):
 
 
 def linked_proposals(paper_ext_id, paper, annotated_tables, taxonomy_linking=MatchSearch(),
-                     dataset_extractor=None):
+                     dataset_extractor=None, topk=1):
     #                     dataset_extractor=DatasetExtractor()):
     proposals = []
     datasets = dataset_extractor.from_paper(paper)
@@ -277,7 +275,11 @@ def linked_proposals(paper_ext_id, paper, annotated_tables, taxonomy_linking=Mat
         table_ext_id = f"{paper_ext_id}/{table.name}"
 
         if 'sota' in tags and 'no_sota_records' not in tags: # only parse tables that are marked as sota
-            proposals.append(generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, datasets))
+            proposals.append(
+                generate_proposals_for_table(
+                    table_ext_id, matrix, structure, desc, taxonomy_linking, datasets, topk=topk
+                )
+            )
     if len(proposals):
         return pd.concat(proposals)
     return pd.DataFrame(columns=proposal_columns)
diff --git a/sota_extractor2/models/linking/context_search.py b/sota_extractor2/models/linking/context_search.py
@@ -308,7 +308,7 @@ def match(self, contexts):
         probs = softmax(np.array(logprobs))
         return zip(keys, probs)
 
-    def __call__(self, query, datasets, caption, debug_info=None):
+    def __call__(self, query, datasets, caption, topk=1, debug_info=None):
         cellstr = debug_info.cell.cell_ext_id
         pipeline_logger("linking::taxonomy_linking::call", ext_id=cellstr, query=query, datasets=datasets, caption=caption)
         datasets = " ".join(datasets)
@@ -331,10 +331,10 @@ def __call__(self, query, datasets, caption, debug_info=None):
             p = self.queries[key]
         else:
             dist = self.match(key)
-            topk = sorted(dist, key=lambda x: x[1], reverse=True)[0:5]
+            top_results = sorted(dist, key=lambda x: x[1], reverse=True)[:max(topk, 5)]
 
             entries = []
-            for it, prob in topk:
+            for it, prob in top_results:
                 task, dataset, metric = it
                 entry = dict(task=task, dataset=dataset, metric=metric)
                 entry.update({"evidence": "", "confidence": prob})
@@ -363,8 +363,8 @@ def __call__(self, query, datasets, caption, debug_info=None):
             else:
                 print("[EA] No gold sota record found for the cell")
         # end of error analysis only
-        pipeline_logger("linking::taxonomy_linking::topk", ext_id=cellstr, topk=p)
-        return p.head(1)
+        pipeline_logger("linking::taxonomy_linking::topk", ext_id=cellstr, topk=p.head(5))
+        return p.head(topk)
 
 
 # todo: compare regex approach (old) with find_datasets(.) (current)
diff --git a/sota_extractor2/models/linking/linker.py b/sota_extractor2/models/linking/linker.py
@@ -10,10 +10,18 @@ def __init__(self, name, taxonomy_linking, dataset_extractor):
         self.dataset_extractor = dataset_extractor
         self.__name__ = name
 
-    def __call__(self, paper, tables):
+    def __call__(self, paper, tables, topk=1):
         pipeline_logger(f"{Linker.step}::call", paper=paper, tables=tables)
         proposals = linked_proposals(paper.paper_id, paper, tables,
                                      taxonomy_linking=self.taxonomy_linking,
-                                     dataset_extractor=self.dataset_extractor).set_index('cell_ext_id')
-        pipeline_logger(f"{Linker.step}::linked", paper=paper, tables=tables, proposals=proposals)
+                                     dataset_extractor=self.dataset_extractor,
+                                     topk=topk)
+
+        if topk == 1:
+            proposals = proposals.set_index('cell_ext_id')
+            best = proposals
+        else:
+            best = proposals.groupby('cell_ext_id').head(1).set_index('cell_ext_id')
+
+        pipeline_logger(f"{Linker.step}::linked", paper=paper, tables=tables, proposals=best)
         return proposals