paperswithcode
diff --git a/sota_extractor2/config.py
Lines changed: 1 addition & 1 deletion b/sota_extractor2/config.py
Lines changed: 1 addition & 1 deletion
diff --git a/sota_extractor2/data/paper_collection.py
Lines changed: 6 additions & 2 deletions b/sota_extractor2/data/paper_collection.py
Lines changed: 6 additions & 2 deletions
diff --git a/sota_extractor2/data/table.py
Lines changed: 27 additions & 3 deletions b/sota_extractor2/data/table.py
Lines changed: 27 additions & 3 deletions
diff --git a/sota_extractor2/helpers/explainers.py
Lines changed: 160 additions & 8 deletions b/sota_extractor2/helpers/explainers.py
Lines changed: 160 additions & 8 deletions
diff --git a/sota_extractor2/helpers/jupyter.py
Lines changed: 15 additions & 5 deletions b/sota_extractor2/helpers/jupyter.py
Lines changed: 15 additions & 5 deletions
@@ -10,7 +10,7 @@
 
 # otherwise use this files
 data = Path("/mnt/efs/pwc/data")
-goldtags_dump = data / "dumps" / "goldtags-2019.09.13_0219.json.gz"
+goldtags_dump = data / "dumps" / "goldtags-2019.10.15_2227.json.gz"
 
 
 elastic = dict(hosts=['localhost'], timeout=20)
 
@@ -73,8 +73,12 @@ def _load_tables(path, annotations, jobs, migrate):
     return {f.parent.name: tbls for f, tbls in zip(files, tables)}
 
 
-def _load_annotated_papers(path):
-    dump = load_gql_dump(path, compressed=path.suffix == ".gz")["allPapers"]
+def _load_annotated_papers(data_or_path):
+    """Build a lookup of annotated papers keyed by arXiv id.
+
+    ``data_or_path`` is either a dict, which is passed to ``load_gql_dump``
+    as uncompressed, or an object exposing ``.suffix`` (e.g. a
+    ``pathlib.Path``), which is treated as compressed when the suffix is
+    ".gz".  Only the "allPapers" entry of the loaded dump is used.
+
+    Every paper is registered twice: under the key produced by
+    ``remove_arxiv_version`` (presumably the id without its version
+    suffix -- confirm against that helper) and under its exact ``arxiv_id``.
+    """
+    if isinstance(data_or_path, dict):
+        compressed = False
+    else:
+        compressed = data_or_path.suffix == ".gz"
+    dump = load_gql_dump(data_or_path, compressed=compressed)["allPapers"]
     annotations = {remove_arxiv_version(a.arxiv_id): a for a in dump}
+    # exact ids are added second, so they win whenever both keys coincide
     annotations.update({a.arxiv_id: a for a in dump})
     return annotations
 
@@ -5,7 +5,7 @@
 import re
 from dataclasses import dataclass, field
 from typing import List
-from ..helpers.jupyter import display_table
+from ..helpers.jupyter import display_html, table_to_html
 from copy import deepcopy
 
 
@@ -21,6 +21,7 @@ class Cell:
 reference_re = re.compile(r"<ref id='([^']*)'>(.*?)</ref>")
 num_re = re.compile(r"^\d+$")
 
+
 def extract_references(s):
     parts = reference_re.split(s)
     refs = parts[1::3]
@@ -89,10 +90,16 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
         if layout is not None:
             self.set_layout(layout)
 
+        self._set_annotations(annotations, migrate=migrate, old_name=old_name, guessed_tags=guessed_tags)
+
+    def _set_annotations(self, annotations, migrate=False, old_name=None, guessed_tags=None):
         if annotations is not None:
             self.gold_tags = annotations.gold_tags.strip()
             self.dataset_text = annotations.dataset_text.strip()
             self.notes = annotations.notes.strip()
+
+            sota_records = json.loads(annotations.cells_sota_records)
+
             if guessed_tags is not None:
                 tags = guessed_tags.values
             else:
@@ -117,14 +124,24 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
             self.gold_tags = ''
             self.dataset_text = ''
             self.notes = ''
+            sota_records = {}
+
+        sota_records = pd.DataFrame(sota_records.values(), index=sota_records.keys(),
+                                    columns=['task', 'dataset', 'metric', 'format', 'model', 'value'])
+        sota_records.index = self.name + "/" + sota_records.index
+        sota_records.index.rename("cell_ext_id", inplace=True)
+        sota_records.rename(columns={"value": "raw_value"}, inplace=True)
+
+        self.sota_records = sota_records.replace("", np.nan).dropna(subset=["model", "metric", "task", "dataset"])
+
 
     def set_layout(self, layout):
+        """Copy every entry of ``layout`` onto the matching cell of ``self.df``.
+
+        ``layout`` must provide ``iterrows()``.  The row label it yields and
+        the enumerated column position are used as positional (``iloc``)
+        coordinates -- NOTE(review): this presumes ``layout`` has a default
+        integer row index; confirm against the callers.
+        """
         for r, row in layout.iterrows():
             for c, cell in enumerate(row):
                 self.df.iloc[r, c].layout = cell
 
     def set_tags(self, tags):
-        for r, row in tags.iterrows():
+        for r, row in enumerate(tags):
             for c, cell in enumerate(row):
                 # todo: change gold_tags to tags to avoid confusion
                 self.df.iloc[r,c].gold_tags = cell.strip()
@@ -133,6 +150,10 @@ def set_tags(self, tags):
     def matrix(self):
         return self.df.applymap(lambda x: x.value)
 
+    @property
+    def matrix_html(self):
+        """Frame shaped like ``self.df`` holding each cell's raw value converted by ``raw_value_to_html``."""
+        def to_html(cell):
+            return raw_value_to_html(cell.raw_value)
+
+        return self.df.applymap(to_html)
+
     @property
     def matrix_layout(self):
+        """Frame shaped like ``self.df`` holding the ``layout`` attribute of every cell."""
         return self.df.applymap(lambda x: x.layout)
@@ -169,8 +190,11 @@ def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=N
             table_ann = None
         return cls(metadata['filename'], df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, migrate, match_name, guessed_tags)
 
+    def _repr_html_(self):
+        """Return the table's HTML representation built by ``table_to_html``."""
+        cell_html = self.matrix_html.values
+        cell_tags = self.matrix_tags.values
+        cell_layout = self.matrix_layout.values
+        return table_to_html(cell_html, cell_tags, cell_layout)
+
     def display(self):
+        """Show the table's ``_repr_html_`` output through ``display_html``."""
-        display_table(self.df.applymap(lambda x: raw_value_to_html(x.raw_value)).values, self.df.applymap(lambda x: x.gold_tags).values, self.df.applymap(lambda x:x.layout).values)
+        display_html(self._repr_html_())
 
     def _save_df(self, df, filename):
+        """Write ``df`` to ``filename`` as CSV with neither a header row nor an index column."""
         df.to_csv(filename, header=None, index=None)
 
@@ -1,25 +1,98 @@
+from sota_extractor2.models.linking.metrics import Metrics
 from ..models.structure import TableType
 from ..loggers import StructurePredictionEvaluator, LinkerEvaluator, FilteringEvaluator
 import pandas as pd
+import numpy as np
+from ..helpers.jupyter import table_to_html
+from sota_extractor2.models.linking.format import extract_value
 
 
-class TableTypeExplainer:
+class Reason:
+    """Empty base class shared by the explanation objects defined in this module."""
+
+
+class IrrelevantTable(Reason):
+    """Reason returned by ``Explainer.explain`` for a table typed as ``TableType.IRRELEVANT``.
+
+    Keeps the paper, the table, the predicted table type and the per-type
+    probabilities, the latter stored as a frame with the columns "type" and
+    "probability".
+    """
+
     def __init__(self, paper, table, table_type, probs):
         self.paper = paper
         self.table = table
         self.table_type = table_type
         self.probs = pd.DataFrame(probs, columns=["type", "probability"])
 
     def __str__(self):
-        return f"Table {self.table.name} was labelled as {self.table_type}."
+        return f"Table {self.table.name} was labelled as {self.table_type.name}."
+
+    def _repr_html_(self):
+        """Return the HTML explanation.
+
+        Three fragments are concatenated: the ``__str__`` sentence, the table
+        caption, and the probability frame with "probability" formatted to
+        two decimals.
+        """
+        # NOTE(review): the caption is interpolated into the markup without
+        # HTML escaping -- confirm captions are trusted or already escaped.
+        prediction = f'<div>{self}</div>'
+        caption = f'<div>Caption: {self.table.caption}</div>'
+        probs = self.probs.style.format({"probability": "{:.2f}"})._repr_html_()
+        return prediction + caption + probs
+
+
+class MislabeledCell(Reason):
+    """Reason describing a single table cell.
+
+    Stores the paper, the table, the cell coordinates (``row``, ``col``) and
+    the ``probs`` value handed to the constructor, so that none of the
+    arguments is silently dropped.
+    """
+
+    def __init__(self, paper, table, row, col, probs):
+        self.paper = paper
+        self.table = table
+        # keep the cell position and its probabilities; previously these
+        # arguments were accepted but discarded
+        self.row = row
+        self.col = col
+        self.probs = probs
+
+
+class TableExplanation:
+    """HTML explanation of the linking proposals made for one table.
+
+    Attributes:
+        paper, table, table_type: objects the explanation refers to; ``table``
+            must expose ``name``, ``caption``, ``matrix_html``, ``matrix_tags``
+            and ``matrix_layout``, and ``table_type`` must expose ``name``.
+        proposals: frame indexed by cell id (the last "/"-separated part is
+            "<row>.<col>") with at least the columns dataset, metric, task,
+            model, parsed and confidence.
+        reasons: mapping-like object with an ``index`` (as used by
+            ``Index.isin``) from cell id to the textual reason a proposal was
+            rejected.
+        topk: mapping from ``(row, col)`` to the top-k candidates of a cell.
+    """
+
+    def __init__(self, paper, table, table_type, proposals, reasons, topk):
+        self.paper = paper
+        self.table = table
+        self.table_type = table_type
+        self.proposals = proposals
+        self.reasons = reasons
+        self.topk = topk
+
+    def _format_tooltip(self, proposal):
+        """Return the multi-line tooltip text describing one proposal."""
+        return f"dataset: {proposal.dataset}\n" \
+            f"metric: {proposal.metric}\n" \
+            f"task: {proposal.task}\n" \
+            f"score: {proposal.parsed}\n" \
+            f"confidence: {proposal.confidence:0.2f}"
+
+    def _format_topk(self, topk):
+        """Placeholder: top-k candidates are not rendered yet, so this is always empty."""
+        return ""
 
-    def display(self):
-        print(self)
-        self.probs.display()
+    def _repr_html_(self):
+        """Return the table rendered as HTML with proposals highlighted.
+
+        Accepted proposals get the CSS class 'final-proposal' and a tooltip
+        with their details; rejected ones get the rejection reason as tooltip.
+        When at least one proposal was accepted, a "Proposals" frame listing
+        the accepted ones is appended.
+        """
+        matrix = self.table.matrix_html.values
+        # Start from empty strings: np.zeros_like(..., dtype=object) would
+        # fill the arrays with the integer 0, which table_to_html would then
+        # emit as CSS class "0" and tooltip "0" on every untouched cell.
+        predictions = np.full(matrix.shape, "", dtype=object)
+        tooltips = np.full(matrix.shape, "", dtype=object)
+        for cell_ext_id, proposal in self.proposals.iterrows():
+            # only the trailing "<row>.<col>" part of the id is needed here
+            rc = cell_ext_id.rsplit("/", 1)[-1]
+            row, col = [int(x) for x in rc.split('.')]
+            if cell_ext_id in self.reasons:
+                reason = self.reasons[cell_ext_id]
+                tooltips[row, col] = reason
+                if reason.startswith("replaced by "):
+                    tooltips[row, col] += "\n\n" + self._format_tooltip(proposal)
+                elif reason.startswith("confidence "):
+                    # a cell may have no top-k entry; do not fail on it
+                    tooltips[row, col] += "\n\n" + self._format_topk(self.topk.get((row, col)))
+            else:
+                predictions[row, col] = 'final-proposal'
+                tooltips[row, col] = self._format_tooltip(proposal)
+
+        table_type_html = f'<div>Table {self.table.name} was labelled as {self.table_type.name}.</div>'
+        caption_html = f'<div>Caption: {self.table.caption}</div>'
+        table_html = table_to_html(matrix,
+                                   self.table.matrix_tags.values,
+                                   self.table.matrix_layout.values,
+                                   predictions,
+                                   tooltips)
+        html = table_type_html + caption_html + table_html
+        # proposals without a rejection reason are the accepted ones
+        proposals = self.proposals[~self.proposals.index.isin(self.reasons.index)]
+        if len(proposals):
+            proposals = proposals[["dataset", "metric", "task", "model", "parsed"]]\
+                .reset_index(drop=True).rename(columns={"parsed": "score"})
+            html2 = proposals._repr_html_()
+            return f"<div><div>{html}</div><div>Proposals</div><div>{html2}</div></div>"
+        return html
 
 
 class Explainer:
+    _sota_record_columns = ['task', 'dataset', 'metric', 'format', 'model', 'model_type', 'raw_value', 'parsed']
+
     def __init__(self, pipeline_logger, paper_collection):
+        self.paper_collection = paper_collection
         self.spe = StructurePredictionEvaluator(pipeline_logger, paper_collection)
         self.le = LinkerEvaluator(pipeline_logger, paper_collection)
         self.fe = FilteringEvaluator(pipeline_logger)
@@ -29,15 +102,94 @@ def explain(self, paper, cell_ext_id):
         if paper.paper_id != paper_id:
             return "No such cell"
 
-        row, col = [int(x) for x in rc.split('.')]
-
         table_type, probs = self.spe.get_table_type_predictions(paper_id, table_name)
 
         if table_type == TableType.IRRELEVANT:
-            return TableTypeExplainer(paper, paper.table_by_name(table_name), table_type, probs)
+            return IrrelevantTable(paper, paper.table_by_name(table_name), table_type, probs)
+
+        all_proposals = self.le.proposals[paper_id]
+        reasons = self.fe.reason
+        table_ext_id = f"{paper_id}/{table_name}"
+        table_proposals = all_proposals[all_proposals.index.str.startswith(table_ext_id+"/")]
+        topk = {(row, col): topk for (pid, tn, row, col), topk in self.le.topk.items()
+                if (pid, tn) == (paper_id, table_name)}
+
+        return TableExplanation(paper, paper.table_by_name(table_name), table_type, table_proposals, reasons, topk)
+
+        row, col = [int(x) for x in rc.split('.')]
 
         reason = self.fe.reason.get(cell_ext_id)
         if reason is None:
             pass
         else:
             return reason
+
+    def _get_table_sota_records(self, table):
+        """Return the gold SOTA records of ``table`` enriched with cell data.
+
+        For every record of ``table.sota_records`` (indexed by
+        "<table name>/<row>.<col>") the referenced cell is looked up to fill
+        in ``raw_value`` (taken from ``table.matrix``) and ``model_type`` (the
+        first tag starting with "model" found in the cell's column, falling
+        back to its row).  ``parsed`` is the float produced by
+        ``extract_value`` for the raw value and the record's format; records
+        whose parsed value is NaN are dropped.
+
+        Returns a frame with the columns of ``_sota_record_columns``; it is
+        empty (index named "cell_ext_id") when the table has no records.
+        """
+        if not len(table.sota_records):
+            empty = pd.DataFrame(columns=self._sota_record_columns)
+            empty.index.rename("cell_ext_id", inplace=True)
+            return empty
+
+        def first_model(cell_tags):
+            # first tag starting with 'model', or '' when there is none
+            return next((tag for tag in cell_tags if tag.startswith('model')), '')
+
+        matrix = table.matrix.values
+        tags = table.matrix_tags
+        model_type_col = tags.apply(first_model)
+        model_type_row = tags.T.apply(first_model)
+        sota_records = table.sota_records.copy()
+
+        # Collect the per-record values and assign them as whole columns.
+        # Writing to the rows yielded by iterrows() is not guaranteed to
+        # reach the frame (pandas may hand out copies).
+        model_types = []
+        raw_values = []
+        for cell_ext_id in sota_records.index:
+            # split from the right so a "/" inside the table name is harmless
+            rc = cell_ext_id.rsplit('/', 1)[-1]
+            row, col = [int(x) for x in rc.split('.')]
+            model_types.append(model_type_col[col] or model_type_row[row])
+            raw_values.append(matrix[row, col])
+        sota_records['model_type'] = model_types
+        sota_records['raw_value'] = raw_values
+
+        sota_records["parsed"] = sota_records[["raw_value", "format"]].apply(
+            lambda row: float(extract_value(row.raw_value, row.format)), axis=1)
+
+        # drop the records whose value could not be parsed (NaN)
+        sota_records = sota_records[sota_records["parsed"].notna()]
+
+        strip_cols = ["task", "dataset", "format", "metric", "raw_value", "model", "model_type"]
+        sota_records = sota_records.transform(
+            lambda x: x.str.strip() if x.name in strip_cols else x)
+        return sota_records[self._sota_record_columns]
+
+    def _get_sota_records(self, paper):
+        """Return the SOTA records of all tables of ``paper`` in one frame.
+
+        The per-table records are concatenated and their index is prefixed
+        with "<paper_id>/"; the index is named "cell_ext_id".  A paper without
+        tables yields an empty frame with the ``_sota_record_columns`` columns.
+        """
+        if len(paper.tables):
+            per_table = [self._get_table_sota_records(tbl) for tbl in paper.tables]
+            combined = pd.concat(per_table)
+            combined.index = paper.paper_id + "/" + combined.index
+            combined.index.rename("cell_ext_id", inplace=True)
+            return combined
+
+        placeholder = pd.DataFrame(columns=self._sota_record_columns)
+        placeholder.index.rename("cell_ext_id", inplace=True)
+        return placeholder
+
+    def linking_metrics(self, experiment_name="unk"):
+        """Build a ``Metrics`` object comparing gold records with accepted proposals.
+
+        Proposals that have an entry in the filtering reasons are left out.
+        Gold records are gathered for every paper that has proposals and can
+        be found in the paper collection; ids of papers that cannot be found
+        are printed.  Gold and predicted records are outer-joined on their
+        index with the suffixes "_gold" / "_pred", the columns are sorted and
+        missing values are replaced by 'not-present'.
+        """
+        paper_ids = list(self.le.proposals.keys())
+
+        predicted = pd.concat(self.le.proposals.values())
+        rejected = predicted.index.isin(self.fe.reason.index)
+        predicted = predicted[~rejected]
+
+        # split the ids into papers present in the collection and missing ones
+        found = []
+        missing = []
+        for paper_id in paper_ids:
+            paper = self.paper_collection.get_by_id(paper_id)
+            if paper is None:
+                missing.append(paper_id)
+            else:
+                found.append(paper)
+        if missing:
+            print("Missing papers in paper collection:")
+            print(", ".join(missing))
+
+        if len(found):
+            gold = pd.concat([self._get_sota_records(paper) for paper in found])
+        else:
+            gold = pd.DataFrame(columns=self._sota_record_columns)
+            gold.index.rename("cell_ext_id", inplace=True)
+
+        merged = gold.merge(predicted, 'outer', left_index=True, right_index=True, suffixes=['_gold', '_pred'])
+        merged = merged.reindex(sorted(merged.columns), axis=1)
+        merged = merged.fillna('not-present')
+        if "experiment_name" in merged.columns:
+            del merged["experiment_name"]
+
+        return Metrics(merged, experiment_name=experiment_name)
@@ -1,5 +1,8 @@
 from IPython.core.display import display, HTML
 from .table_style import table_style
+import numpy as np
+
+
 def set_seed(seed, name):
     import torch
     import numpy as np
@@ -9,11 +12,11 @@ def set_seed(seed, name):
     torch.backends.cudnn.benchmark = False
     np.random.seed(seed)
 
-def display_html(s): return display(HTML(s))
 
+def display_html(s):
+    """Wrap the string ``s`` in ``HTML`` and pass it to ``display``, returning the result."""
+    rendered = HTML(s)
+    return display(rendered)
 
 
-def display_table(table, structure=None, layout=None):
+def table_to_html(table, structure=None, layout=None, predictions=None, tooltips=None):
     """
         matrix - 2d ndarray with cell values
         strucutre - 2d ndarray with structure annotation
@@ -24,15 +27,22 @@ def display_table(table, structure=None, layout=None):
         matrix = table
     if structure is None: structure = table.matrix_gold_tags
     if layout is None: layout = np.zeros_like(matrix, dtype=str)
+    if predictions is None: predictions = np.zeros_like(matrix, dtype=str)
+    if tooltips is None: tooltips = np.zeros_like(matrix, dtype=str)
     html = []
     html.append(table_style)
     html.append('<div class="tableWrapper">')
     html.append("<table>")
-    for row,struc_row, layout_row in zip(matrix, structure, layout):
+    for row,struc_row, layout_row, preds_row, tt_row in zip(matrix, structure, layout, predictions, tooltips):
         html.append("<tr>")
-        for cell,struct,layout in zip(row,struc_row,layout_row):
-            html.append(f'<td class="{struct} {layout}">{cell}</td>')
+        for cell,struct,layout,preds, tt in zip(row,struc_row,layout_row,preds_row, tt_row):
+            html.append(f'<td class="{struct} {layout} {preds}" title="{tt}">{cell}</td>')
         html.append("</tr>")
     html.append("</table>")
     html.append('</div>')
+    return "\n".join(html)
+
+
+def display_table(table, structure=None, layout=None):
+    """Render a table with ``table_to_html`` and show it with ``display_html``.
+
+    All three arguments are forwarded unchanged to ``table_to_html``.
+    """
+    # table_to_html already returns one joined string; joining it again with
+    # "\n" would insert a newline between every single character.
+    html = table_to_html(table, structure, layout)
+    display_html(html)