paperswithcode
diff --git a/label_tables.py b/label_tables.py
Lines changed: 68 additions & 21 deletions
diff --git a/sota_extractor2/config.py b/sota_extractor2/config.py
Lines changed: 1 addition & 1 deletion
diff --git a/sota_extractor2/data/paper_collection.py b/sota_extractor2/data/paper_collection.py
Lines changed: 4 additions & 3 deletions
diff --git a/sota_extractor2/data/structure.py b/sota_extractor2/data/structure.py
Lines changed: 3 additions & 1 deletion
diff --git a/sota_extractor2/data/table.py b/sota_extractor2/data/table.py
Lines changed: 8 additions & 0 deletions
diff --git a/sota_extractor2/models/linking/bm25_naive.py b/sota_extractor2/models/linking/bm25_naive.py
Lines changed: 80 additions & 24 deletions
@@ -9,7 +9,9 @@
 import sys
 from decimal import Decimal, ROUND_DOWN, ROUND_HALF_UP, InvalidOperation
 from collections import Counter, namedtuple
-
+from joblib import delayed, Parallel
+from sota_extractor2.data.paper_collection import PaperCollection, remove_arxiv_version
+from functools import reduce
 
 arxiv_url_re = re.compile(r"^https?://(www.)?arxiv.org/(abs|pdf|e-print)/(?P<arxiv_id>\d{4}\.[^./]*)(\.pdf)?$")
 
@@ -33,6 +35,8 @@ def get_table(filename):
         return pd.DataFrame()
 
 
+# all_metadata[arxiv_id] = {'table_01.csv': 'Table 1: ...', ...}
+# all_tables[arxiv_id] = {'table_01.csv': DataFrame(...), ...}
 def get_tables(tables_dir):
     tables_dir = Path(tables_dir)
     all_metadata = {}
@@ -223,19 +227,20 @@ def mark_strings(table, tags, values):
                     if match_str(real, s):
                         cell_tags += f"{beg}{s}{end}"
     return cell_tags
-    
+
 
 metatables = {}
-def match_many(output_dir, task_name, dataset_name, metric_name, tables, values):
+def match_many(task_name, dataset_name, metric_name, tables, values):
+    """Tag every table in *tables* against *values* and collect the results.
+
+    *tables* is indexed first by arxiv id and then by table name. Each table
+    is passed to ``mark_with_all_comparators`` together with the task,
+    dataset and metric names and *values*.
+
+    Returns a dict keyed by ``(arxiv_id, table)``; tags produced for a key
+    that is already present are combined with ``+=``.
+    """
+    metatables = {}
     for arxiv_id in tables:
         for table in tables[arxiv_id]:
             tags = mark_with_all_comparators(task_name, dataset_name, metric_name, arxiv_id, tables[arxiv_id][table], values)
-            global metatables
             key = (arxiv_id, table)
             if key in metatables:
                 metatables[key] += tags
             else:
                 metatables[key] = tags
+    return metatables
 
 
 def normalize_metric(value):
@@ -256,6 +261,26 @@ def normalize_table(table):
     return table.applymap(normalize_cell)
 
 
+celltags_re = re.compile(r"<hit><sota>(?P<sota>.*?)</sota><paper>(?P<paper>.*?)</paper><model>(?P<model>.*?)</model><metric>(?P<metric>.*?)</metric><dataset>(?P<dataset>.*?)</dataset><task>(?P<task>.*?)</task>(?P<this_paper><this_paper/>)?<comparator>(?P<comparator>.*?)</comparator><matched_cell>(?P<matched_cell>.*?)</matched_cell><matched_str>(?P<matched_str>.*?)</matched_str></hit>")
+def parse_celltags(v):
+    """Extract every ``<hit>...</hit>`` record encoded in the cell-tag string *v*.
+
+    Returns a list with one dict per match of ``celltags_re``, keyed by the
+    pattern's named groups. ``this_paper`` is replaced by a bool that is True
+    when the optional ``<this_paper/>`` marker was matched.
+    """
+    return [
+        # re-assigning an existing key keeps its position in the dict
+        {**hit.groupdict(), 'this_paper': hit.group('this_paper') is not None}
+        for hit in celltags_re.finditer(v)
+    ]
+
+
+def celltags_to_json(df):
+    """Turn a DataFrame of cell-tag strings into a list of JSON-ready dicts.
+
+    Cells equal to the empty string are skipped. Every other cell yields a
+    dict with ``row`` (the label produced by ``df.iterrows()``), ``col``
+    (the position of the cell within its row) and ``hits`` (the result of
+    ``parse_celltags`` for that cell).
+    """
+    return [
+        dict(row=r, col=c, hits=parse_celltags(cell))
+        for r, cells in df.iterrows()
+        for c, cell in enumerate(cells)
+        if cell != ""
+    ]
+
+
+
 # for each task with sota row
 #     arxivs <- list of papers related to the task
 #     for each (dataset_name, metric_name) of the task:
@@ -269,40 +294,62 @@ def normalize_table(table):
 #                 if table.arxiv_id == paper_id: mark with this-tag
 PaperResult = namedtuple("PaperResult", ["arxiv_id", "model", "value", "normalized"])
 
+arxivs_by_metrics = {}
+tables = {}
+
+def match_for(task, dataset, metric):
+    """Run ``match_many`` for a single (task, dataset, metric) combination.
+
+    Reads the module-level ``arxivs_by_metrics`` and ``tables``; only papers
+    that have an entry in ``tables`` are handed to ``match_many``.
+    """
+    results = arxivs_by_metrics[(task, dataset, metric)]
+    paper_tables = {}
+    for result in results:
+        if result.arxiv_id in tables:
+            paper_tables[result.arxiv_id] = tables[result.arxiv_id]
+    return match_many(task, dataset, metric, paper_tables, results)
+
 
-def label_tables(tasksfile, tables_dir):
-    output_dir = Path(tables_dir)
+def label_tables(tasksfile, papers_dir, output, jobs=-1):
+    """Match SOTA results against paper tables and dump the cell tags as JSON.
+
+    Steps, in order:
+      1. read the tasks with ``get_sota_tasks(tasksfile)``;
+      2. load papers from *papers_dir* via ``PaperCollection.from_files``
+         (texts and annotations are not loaded) and normalize their tables;
+      3. group ``PaperResult`` records by (task name, dataset name, metric)
+         for every SOTA row whose ``paper_url`` matches ``arxiv_url_re``;
+      4. run ``match_for`` for each group with joblib's multiprocessing
+         backend and merge the returned dicts;
+      5. write ``{arxiv_id: {tablename: celltags_to_json(...)}}`` to *output*.
+
+    *jobs* is forwarded to both ``PaperCollection.from_files`` and
+    ``Parallel(n_jobs=...)``. The function rebinds the module-level ``tables``
+    and adds to the module-level ``arxivs_by_metrics``.
+    """
+    print("Reading PwC entries...", file=sys.stderr)
     tasks = get_sota_tasks(tasksfile)
-    metadata, tables = get_tables(tables_dir)
+    print("Reading tables from files...", file=sys.stderr)
+    pc = PaperCollection.from_files(papers_dir, load_texts=False, load_annotations=False, jobs=jobs)
 
-    arxivs_by_metrics = {}
+    # share data between processes to avoid costly joblib serialization
+    global arxivs_by_metrics, tables
 
-    tables = {arxiv_id: {tab: normalize_table(tables[arxiv_id][tab]) for tab in tables[arxiv_id]} for arxiv_id in tables}
+    print("Normalizing tables...", file=sys.stderr)
+    tables = {p.arxiv_no_version: {tab.name: normalize_table(tab.matrix) for tab in p.tables} for p in pc}
 
+    print("Aggregating papers...", file=sys.stderr)
     for task in tasks:
         for dataset in task.datasets:
             for row in dataset.sota.rows:
                 match = arxiv_url_re.match(row.paper_url)
                 if match is not None:
-                    arxiv_id = match.group("arxiv_id")
+                    arxiv_id = remove_arxiv_version(match.group("arxiv_id"))
                     for metric in row.metrics:
                         arxivs_by_metrics.setdefault((task.name, dataset.name, metric), set()).add(
                             PaperResult(arxiv_id=arxiv_id, model=row.model_name, value=row.metrics[metric],
                                 normalized=normalize_metric(row.metrics[metric])
                             )
                         )
 
-    for task, dataset, metric in arxivs_by_metrics:
-        records = arxivs_by_metrics[(task, dataset, metric)]
-        tabs = {r.arxiv_id: tables[r.arxiv_id] for r in records if r.arxiv_id in tables}
-        match_many(output_dir, task, dataset, metric, tabs, records)
-
-    global metatables
-
-    for (arxiv_id, table), best in metatables.items():
-        out = output_dir / arxiv_id
-        out.mkdir(parents=True, exist_ok=True)
-        best.to_csv(out / table.replace("table", "celltags"), header=None, index=None)
+    print("Matching results...", file=sys.stderr)
+    metatables_list = Parallel(n_jobs=jobs, backend="multiprocessing")(
+        [delayed(match_for)(task, dataset, metric)
+         for task, dataset, metric in arxivs_by_metrics])
+
+    print("Aggregating results...", file=sys.stderr)
+    metatables = {}
+    for mt in metatables_list:
+        for k, v in mt.items():
+            # the same (arxiv_id, table) key can come back from several jobs
+            metatables[k] = metatables.get(k, "") + v
+    grouped_metatables = {}
+    for (arxiv_id, tablename), df in metatables.items():
+        grouped_metatables.setdefault(arxiv_id, {})[tablename] = celltags_to_json(df)
+
+    with open(output, 'wt') as f:
+        json.dump(grouped_metatables, f)
+    # print("Saving matches...", file=sys.stderr)
+    # for (arxiv_id, table), best in metatables.items():
+    #     out = output_dir / arxiv_id
+    #     out.mkdir(parents=True, exist_ok=True)
+    #     best.to_csv(out / table.replace("table", "celltags"), header=None, index=None)
 
 
 if __name__ == "__main__": fire.Fire(label_tables)
@@ -10,7 +10,7 @@
 
 # otherwise use these files
 data = Path("/mnt/efs/pwc/data")
-goldtags_dump = data / "dumps" / "goldtags-2019.08.06_0835.json.gz"
+goldtags_dump = data / "dumps" / "goldtags-2019.09.13_0219.json.gz"
 
 
 elastic = dict(hosts=['localhost'], timeout=20)
 
@@ -54,7 +54,7 @@ def __init__(self, data=None):
         super().__init__(data)
 
     @classmethod
-    def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, jobs=-1, migrate=False):
+    def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
         path = Path(path)
         if annotations_path is None:
             annotations_path = path / "structure-annotations.json"
@@ -63,12 +63,13 @@ def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=Tr
         else:
             texts = {}
 
-        annotations = _load_annotated_papers(annotations_path)
+        annotations = {}
         if load_tables:
+            if load_annotations:
+                annotations = _load_annotated_papers(annotations_path)
             tables = _load_tables(path, annotations, jobs, migrate)
         else:
             tables = {}
-            annotations = {}
         outer_join = set(texts).union(set(tables))
 
         papers = [Paper(k, texts.get(k), tables.get(k, []), annotations.get(k)) for k in outer_join]
 
@@ -101,6 +101,8 @@ def create_evidence_records(textfrag, cell, table):
                "cell_type": cell.vals[1],
                "cell_content": fix_refs(cell.vals[0]),
                "cell_reference": cell.vals[2],
+               "cell_layout": cell.vals[3],
+               "cell_styles": cell.vals[4],
                "this_paper": textfrag.paper_id == table.paper_id,
                "row": cell.row,
                "col": cell.col,
@@ -125,7 +127,7 @@ def get_limits(cell_type):
         return dict(paper_limit=paper_limit, corpus_limit=corpus_limit)
     records = [
         record
-            for cell in consume_cells(table.matrix, table.matrix_gold_tags, table.matrix_references) if filter_cells(cell)
+            for cell in consume_cells(table.matrix, table.matrix_gold_tags, table.matrix_references, table.matrix_layout, table.matrix_styles) if filter_cells(cell)
             for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=table.paper_id, **get_limits(cell.vals[1]))
             for record in create_evidence_records(evidence, cell, table=table)
     ]
 
@@ -112,6 +112,14 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
             self.dataset_text = ''
             self.notes = ''
 
+    @property
+    def matrix(self):
+        """DataFrame shaped like ``self.df`` holding each cell's ``value``."""
+        def cell_value(cell):
+            return cell.value
+
+        return self.df.applymap(cell_value)
+
+    @property
+    def matrix_gold_tags(self):
+        """DataFrame shaped like ``self.df`` holding each cell's ``gold_tags``."""
+        def cell_gold_tags(cell):
+            return cell.gold_tags
+
+        return self.df.applymap(cell_gold_tags)
+
     @classmethod
     def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=None, guessed_tags=None):
         path = Path(path)
 
@@ -5,6 +5,10 @@
 import pandas as pd
 from elasticsearch import Elasticsearch, client
 import logging
+#from .extractors import DatasetExtractor
+import spacy
+from scispacy.abbreviation import AbbreviationDetector
+from sota_extractor2.models.linking.format import extract_value
 
 
 @dataclass()
@@ -84,9 +88,36 @@ def __init__(self, mkquery=mkquery_ngrams, es=None):
         self.log = logging.getLogger(__name__)
         self.mkquery = mkquery
 
-    def preproc(self, val):
+        self.nlp = spacy.load("en_core_web_sm")
+        abbreviation_pipe = AbbreviationDetector(self.nlp)
+        self.nlp.add_pipe(abbreviation_pipe)
+        self.nlp.disable_pipes("tagger", "ner", "parser")
+
+    def match_abrv(self, dataset, datasets):
+        """Return the single entry of *datasets* that *dataset* abbreviates.
+
+        For each candidate the text ``"! <candidate> (<dataset>)"`` is run
+        through ``self.nlp`` and the detected abbreviations are inspected. A
+        candidate counts when an abbreviation's text equals *dataset* and its
+        long form equals the candidate.
+
+        Returns the long form when exactly one distinct candidate matched,
+        otherwise ``None``. Unique and ambiguous matches are printed.
+        """
+        matches = []
+        for candidate in datasets:
+            # the leading "!" works around a scispacy error
+            doc = self.nlp(f"! {candidate} ({dataset})")
+            matches.extend(
+                str(found._.long_form)
+                for found in doc._.abbreviations
+                if str(found) == dataset and str(found._.long_form) == candidate
+            )
+        unique = list(set(matches))
+        if not unique:
+            return None
+        if len(unique) > 1:
+            print(f"Multiple abrvs. for {dataset}: {unique}")
+            return None
+        print(f"abrv. for {dataset}: {unique[0]}")
+        return unique[0]
+
+    def preproc(self, val, datasets=None):
         val = val.strip(',- ')
         val = re.sub("dataset", '', val, flags=re.I)
+        if datasets:
+            abrv = self.match_abrv(val, datasets)
+            if abrv:
+                val += " " + abrv
         #         if self.case:
         #             val += (" " +re.sub("([a-z])([A-Z])", r'\1 \2', val)
         #                     +" " +re.sub("([a-zA-Z])([0-9])", r'\1 \2', val)
@@ -99,9 +130,11 @@ def search(self, query, explain_doc_id=None):
             return self.es.explain('et_taxonomy', doc_type='doc', id=explain_doc_id, body=body)
         return self.es.search('et_taxonomy', doc_type='doc', body=body)["hits"]
 
-    def __call__(self, query):
+    def __call__(self, query, datasets, caption):
         split_re = re.compile('([^a-zA-Z0-9])')
-        query = self.preproc(query).strip()
+        query = self.preproc(query, datasets).strip()
+        if caption:
+            query += " " + self.preproc(caption).strip()[:400]
         results = self.search(query)
         hits = results["hits"][:3]
         df = pd.DataFrame.from_records([
@@ -136,7 +169,7 @@ def handle_pm(value):
                 pass
             # %%
 
-def generate_proposals_for_table(table_ext_id,  matrix, structure, desc, taxonomy_linking):
+def generate_proposals_for_table(table_ext_id,  matrix, structure, desc, taxonomy_linking, datasets):
     # %%
     # Proposal generation
     def consume_cells(matrix):
@@ -170,30 +203,37 @@ def annotations(r, c, type='model'):
         for r, c, val in consume_cells(matrix)
         if structure[r, c] == '' and number_re.match(matrix[r, c].strip())]
 
+    # def empty_proposal(cell_ext_id, reason):
+    #     np = "not-present"
+    #     return dict(
+    #         dataset=np, metric=np, task=np, format=np, raw_value=np, model=np,
+    #         model_type=np, cell_ext_id=cell_ext_id, confidence=-1, debug_reason=reason
+    #     )
+
     def linked_proposals(proposals):
         for prop in proposals:
-            if prop.dataset == '' or prop.model_type == '':
-                continue
-            if 'dev' in prop.dataset.lower() or 'train' in prop.dataset.lower():
-                continue
-
-            df = taxonomy_linking(prop.dataset)
-            if not len(df):
-                continue
+            df = taxonomy_linking(prop.dataset, datasets, desc, debug_info=prop)
+            assert len(df) == 1
 
             metric = df['metric'][0]
 
             # heuristic to handle accuracy vs error
             first_num = (list(handle_pm(prop.raw_value)) + [0])[0]
             format = "{x}"
-            if first_num > 1:
-                first_num /= 100
-                format = "{x/100}"
-
-            if ("error" in metric or "Error" in metric) and (first_num > 0.5):
+            # if first_num > 1:
+            #     first_num /= 100
+            #     format = "{x/100}"
+            if first_num < 1 and '%' not in prop.raw_value:
+                first_num *= 100
+                format = "{100*x}"
+            if '%' in prop.raw_value:
+                format += '%'
+
+            # if ("error" in metric or "Error" in metric) and (first_num > 0.5):
+            if (metric.strip().lower() == "error") and (first_num > 0.5):
                 metric = "Accuracy"
 
-            yield {
+            linked = {
                 'dataset': df['dataset'][0],
                 'metric': metric,
                 'task': df['task'][0],
@@ -203,22 +243,38 @@ def linked_proposals(proposals):
                 'model_type': prop.model_type,
                 'cell_ext_id': prop.cell.cell_ext_id,
                 'confidence': df['confidence'][0],
+                'struct_model_type': prop.model_type,
+                'struct_dataset': prop.dataset
             }
+            yield linked
+
+    # specify columns in case there's no proposal
+    columns = ['dataset', 'metric', 'task', 'format', 'raw_value', 'model', 'model_type', 'cell_ext_id', 'confidence', 'parsed',
+               'struct_model_type', 'struct_dataset']
+    proposals = pd.DataFrame.from_records(list(linked_proposals(proposals)), columns=columns)
+
+    if len(proposals):
+        proposals["parsed"]=proposals[["raw_value", "format"]].apply(
+            lambda row: float(extract_value(row.raw_value, row.format)), axis=1)
+    return proposals
 
-    return list(linked_proposals(proposals))
 
-def linked_proposals(paper_ext_id, tables, structure_annotator, taxonomy_linking=MatchSearch()):
+def linked_proposals(paper_ext_id, paper, tables, structure_annotator, taxonomy_linking=MatchSearch(),
+                     dataset_extractor=None):
+    #                     dataset_extractor=DatasetExtractor()):
+    """Generate linked result proposals for all SOTA tables of one paper.
+
+    Each table is annotated with ``structure_annotator(paper, table)``, which
+    returns ``(structure, tags)``. Only tables whose tags contain ``'sota'``
+    and not ``'no_sota_records'`` are passed to
+    ``generate_proposals_for_table``.
+
+    Returns the per-table DataFrames concatenated with ``pd.concat``, or an
+    empty DataFrame when no table qualified.
+
+    NOTE(review): the ``taxonomy_linking`` default is built once, when this
+    ``def`` is executed, and is shared by every call.
+    """
     proposals = []
+    # dataset_extractor defaults to None: fall back to "no known datasets"
+    # instead of failing on None.from_paper
+    datasets = dataset_extractor.from_paper(paper) if dataset_extractor is not None else []
+    print(f"Extracted datasets: {datasets}")
     for idx, table in enumerate(tables):
         matrix = np.array(table.matrix)
-        structure, tags = structure_annotator(table)
+        structure, tags = structure_annotator(paper, table)
         structure = np.array(structure)
-        desc = table.desc
+        desc = table.caption
         table_ext_id = f"{paper_ext_id}/{table.name}"
 
         if 'sota' in tags and 'no_sota_records' not in tags: # only parse tables that are marked as sota
-            proposals += list(generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking))
-    return pd.DataFrame.from_records(proposals)
+            proposals.append(generate_proposals_for_table(table_ext_id, matrix, structure, desc, taxonomy_linking, datasets))
+    # pd.concat raises ValueError when given an empty list
+    if not proposals:
+        return pd.DataFrame()
+    return pd.concat(proposals)
 
 
 def test_link_taxonomy():