
Commit 184224c

Merge pull request #2 from paperswithcode/pipeline
Pipeline
2 parents 8385aaf + 5026084 · commit 184224c

40 files changed: +2823 −126 lines changed

environment.yml

Lines changed: 1 addition & 0 deletions
@@ -16,3 +16,4 @@ dependencies:
 - elasticsearch-dsl=7.0.0
 - ipython=7.5.0
 - joblib=0.13.2
+- python-magic=0.4.15

extract_tables.py

Lines changed: 13 additions & 16 deletions
@@ -13,8 +13,7 @@
 from dataclasses import dataclass
 from typing import Set
 
-from tabular import Tabular
-
+from sota_extractor2.data.table import Table
 
 # begin of dirty hack
 # pandas parsing of html tables is really nice
@@ -265,18 +264,13 @@ def html2data(table):
     return data[0] if len(data) == 1 else None
 
 
-def save_table(data, filename):
-    data.to_csv(filename, header=None, index=None)
-
-
 def save_tables(data, outdir):
     metadata = []
 
     for num, table in enumerate(data, 1):
         filename = f"table_{num:02}.csv"
         layout = f"layout_{num:02}.csv"
-        save_table(table.data, outdir / filename)
-        save_table(table.layout, outdir / layout)
+        table.save(outdir, filename, layout)
         metadata.append(dict(filename=filename, layout=layout, caption=table.caption, figure_id=table.figure_id))
     with open(outdir / "metadata.json", "w") as f:
         json.dump(metadata, f)
@@ -341,11 +335,7 @@ def remove_footnotes(soup):
         elem.extract()
 
 
-def extract_tables(filename, outdir):
-    with open(filename, "rb") as f:
-        html = f.read()
-    outdir = Path(outdir)
-    outdir.mkdir(parents=True, exist_ok=True)
+def extract_tables(html):
     soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
     set_ids_by_labels(soup)
     fix_span_tables(soup)
@@ -381,8 +371,15 @@ def extract_tables(filename, outdir):
         if cap_el is not None:
             caption = clear_ws(cap_el.get_text())
         figure_id = table.get("data-figure-id")
-        data.append(Tabular(tab, layout, caption, figure_id))
+        data.append(Table(f"table_{len(data)+1:02}", tab, layout.applymap(str), caption, figure_id))
+    return data
 
-    save_tables(data, outdir)
+def extract_tables_cmd(filename, outdir):
+    with open(filename, "rb") as f:
+        html = f.read()
+    tables = extract_tables(html)
+    outdir = Path(outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    save_tables(tables, outdir)
 
-if __name__ == "__main__": fire.Fire(extract_tables)
+if __name__ == "__main__": fire.Fire(extract_tables_cmd)
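
The refactor splits extraction into a pure function, extract_tables(html), which returns Table objects, and a thin CLI wrapper, extract_tables_cmd, that handles file I/O. A minimal usage sketch, assuming a locally saved HTML render (the file and directory names are placeholders):

# programmatic use of the new API
with open("paper.html", "rb") as f:      # placeholder input file
    html = f.read()
tables = extract_tables(html)            # list of Table objects
for table in tables:
    print(table.name, table.caption)

# equivalent fire-based CLI call; the wrapper reads the file and writes CSVs + metadata.json:
#   python extract_tables.py paper.html out_tables/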

label_tables.py

Lines changed: 68 additions & 21 deletions
@@ -9,7 +9,9 @@
 import sys
 from decimal import Decimal, ROUND_DOWN, ROUND_HALF_UP, InvalidOperation
 from collections import Counter, namedtuple
-
+from joblib import delayed, Parallel
+from sota_extractor2.data.paper_collection import PaperCollection, remove_arxiv_version
+from functools import reduce
 
 arxiv_url_re = re.compile(r"^https?://(www.)?arxiv.org/(abs|pdf|e-print)/(?P<arxiv_id>\d{4}\.[^./]*)(\.pdf)?$")
 
@@ -33,6 +35,8 @@ def get_table(filename):
         return pd.DataFrame()
 
 
+# all_metadata[arxiv_id] = {'table_01.csv': 'Table 1: ...', ...}
+# all_tables[arxiv_id] = {'table_01.csv': DataFrame(...), ...}
 def get_tables(tables_dir):
     tables_dir = Path(tables_dir)
     all_metadata = {}
@@ -223,19 +227,20 @@ def mark_strings(table, tags, values):
             if match_str(real, s):
                 cell_tags += f"{beg}{s}{end}"
     return cell_tags
-
+
 
 metatables = {}
-def match_many(output_dir, task_name, dataset_name, metric_name, tables, values):
+def match_many(task_name, dataset_name, metric_name, tables, values):
+    metatables = {}
     for arxiv_id in tables:
         for table in tables[arxiv_id]:
             tags = mark_with_all_comparators(task_name, dataset_name, metric_name, arxiv_id, tables[arxiv_id][table], values)
-            global metatables
             key = (arxiv_id, table)
             if key in metatables:
                 metatables[key] += tags
             else:
                 metatables[key] = tags
+    return metatables
 
 
 def normalize_metric(value):
@@ -256,6 +261,26 @@ def normalize_table(table):
     return table.applymap(normalize_cell)
 
 
+celltags_re = re.compile(r"<hit><sota>(?P<sota>.*?)</sota><paper>(?P<paper>.*?)</paper><model>(?P<model>.*?)</model><metric>(?P<metric>.*?)</metric><dataset>(?P<dataset>.*?)</dataset><task>(?P<task>.*?)</task>(?P<this_paper><this_paper/>)?<comparator>(?P<comparator>.*?)</comparator><matched_cell>(?P<matched_cell>.*?)</matched_cell><matched_str>(?P<matched_str>.*?)</matched_str></hit>")
+def parse_celltags(v):
+    r = []
+    for m in celltags_re.finditer(v):
+        d = m.groupdict()
+        d['this_paper'] = d['this_paper'] is not None
+        r.append(d)
+    return r
+
+
+def celltags_to_json(df):
+    tags = []
+    for r, row in df.iterrows():
+        for c, cell in enumerate(row):
+            if cell != "":
+                tags.append(dict(row=r, col=c, hits=parse_celltags(cell)))
+    return tags
+
+
+
 # for each task with sota row
 # arxivs <- list of papers related to the task
 # for each (dataset_name, metric_name) of the task:
@@ -269,40 +294,62 @@ def normalize_table(table):
 #             if table.arxiv_id == paper_id: mark with this-tag
 PaperResult = namedtuple("PaperResult", ["arxiv_id", "model", "value", "normalized"])
 
+arxivs_by_metrics = {}
+tables = {}
+
+def match_for(task, dataset, metric):
+    records = arxivs_by_metrics[(task, dataset, metric)]
+    tabs = {r.arxiv_id: tables[r.arxiv_id] for r in records if r.arxiv_id in tables}
+    return match_many(task, dataset, metric, tabs, records)
+
 
-def label_tables(tasksfile, tables_dir):
-    output_dir = Path(tables_dir)
+def label_tables(tasksfile, papers_dir, output, jobs=-1):
+    print("Reading PwC entries...", file=sys.stderr)
     tasks = get_sota_tasks(tasksfile)
-    metadata, tables = get_tables(tables_dir)
+    print("Reading tables from files...", file=sys.stderr)
+    pc = PaperCollection.from_files(papers_dir, load_texts=False, load_annotations=False, jobs=jobs)
 
-    arxivs_by_metrics = {}
+    # share data between processes to avoid costly joblib serialization
+    global arxivs_by_metrics, tables
 
-    tables = {arxiv_id: {tab: normalize_table(tables[arxiv_id][tab]) for tab in tables[arxiv_id]} for arxiv_id in tables}
+    print("Normalizing tables...", file=sys.stderr)
+    tables = {p.arxiv_no_version: {tab.name: normalize_table(tab.matrix) for tab in p.tables} for p in pc}
 
+    print("Aggregating papers...", file=sys.stderr)
    for task in tasks:
        for dataset in task.datasets:
            for row in dataset.sota.rows:
                match = arxiv_url_re.match(row.paper_url)
                if match is not None:
-                    arxiv_id = match.group("arxiv_id")
+                    arxiv_id = remove_arxiv_version(match.group("arxiv_id"))
                    for metric in row.metrics:
                        arxivs_by_metrics.setdefault((task.name, dataset.name, metric), set()).add(
                            PaperResult(arxiv_id=arxiv_id, model=row.model_name, value=row.metrics[metric],
                                        normalized=normalize_metric(row.metrics[metric])
                            )
                        )
 
-    for task, dataset, metric in arxivs_by_metrics:
-        records = arxivs_by_metrics[(task, dataset, metric)]
-        tabs = {r.arxiv_id: tables[r.arxiv_id] for r in records if r.arxiv_id in tables}
-        match_many(output_dir, task, dataset, metric, tabs, records)
-
-    global metatables
-
-    for (arxiv_id, table), best in metatables.items():
-        out = output_dir / arxiv_id
-        out.mkdir(parents=True, exist_ok=True)
-        best.to_csv(out / table.replace("table", "celltags"), header=None, index=None)
+    print("Matching results...", file=sys.stderr)
+    metatables_list = Parallel(n_jobs=jobs, backend="multiprocessing")(
+        [delayed(match_for)(task, dataset, metric)
+         for task, dataset, metric in arxivs_by_metrics])
+
+    print("Aggregating results...", file=sys.stderr)
+    metatables = {}
+    for mt in metatables_list:
+        for k, v in mt.items():
+            metatables[k] = metatables.get(k, "") + v
+    grouped_metatables = {}
+    for (arxiv_id, tablename), df in metatables.items():
+        grouped_metatables.setdefault(arxiv_id, {})[tablename] = celltags_to_json(df)
+
+    with open(output, 'wt') as f:
+        json.dump(grouped_metatables, f)
+    # print("Saving matches...", file=sys.stderr)
+    # for (arxiv_id, table), best in metatables.items():
+    #     out = output_dir / arxiv_id
+    #     out.mkdir(parents=True, exist_ok=True)
+    #     best.to_csv(out / table.replace("table", "celltags"), header=None, index=None)
 
 
 if __name__ == "__main__": fire.Fire(label_tables)
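
label_tables now writes a single JSON file instead of per-paper celltags CSVs: a dict keyed by versionless arXiv id, then by table name, holding the cell hits produced by celltags_to_json. A hedged sketch of the new CLI call and of reading the output (file names are placeholders; the key names follow parse_celltags and celltags_to_json):

# sketch of the new invocation:
#   python label_tables.py evaluation-tables.json papers/ matches.json --jobs 8

import json

with open("matches.json") as f:          # placeholder path
    matches = json.load(f)

for arxiv_id, paper_tables in matches.items():
    for table_name, cells in paper_tables.items():
        for cell in cells:
            for hit in cell["hits"]:     # one dict per celltags_re match
                print(arxiv_id, table_name, cell["row"], cell["col"],
                      hit["task"], hit["dataset"], hit["metric"], hit["this_paper"])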

sota_extractor2/config.py

Lines changed: 9 additions & 1 deletion
@@ -10,7 +10,7 @@
 
 # otherwise use this files
 data = Path("/mnt/efs/pwc/data")
-goldtags_dump = data / "dumps" / "goldtags-2019.08.06_0835.json.gz"
+goldtags_dump = data / "dumps" / "goldtags-2019.10.15_2227.json.gz"
 
 
 elastic = dict(hosts=['localhost'], timeout=20)
@@ -22,3 +22,11 @@
 
 datasets = data/"datasets"
 datasets_structure = datasets/"structure"
+structure_models = datasets / "structure" / "models"
+
+mocks = datasets / "mocks"
+
+linking_models = datasets / "linking" / "models"
+linking_data = datasets / "linking" / "data"
+
+autodict = linking_data / "autodict"

sota_extractor2/data/elastic.py

Lines changed: 20 additions & 4 deletions
@@ -1,3 +1,4 @@
+from bs4 import BeautifulSoup
 import pandas as pd
 import re
 
@@ -162,9 +163,10 @@ def from_json(cls, json, paper_id=None):
         return paper
 
     @classmethod
-    def from_file(cls, path):
+    def from_file(cls, path, paper_id=None):
         path = Path(path)
-        paper_id = path.parent.name
+        if paper_id is None:
+            paper_id = path.parent.name
         with open(path, "rt") as f:
             json = f.read()
         return cls.from_json(json, paper_id)
@@ -187,6 +189,12 @@ def save(self, **kwargs):
         else:
             return super().save(**kwargs)
 
+    def delete(self, **kwargs):
+        if hasattr(self, 'fragments'):
+            for f in self.fragments:
+                f.delete()
+        return super().delete(**kwargs)
+
     @classmethod
     def parse_html(cls, soup, paper_id):
         put_dummy_anchors(soup)
@@ -254,9 +262,17 @@ def read_html(cls, file):
         return read_html(file)
 
     @classmethod
-    def parse_paper(cls, file):
+    def from_html(cls, html, paper_id):
+        soup = BeautifulSoup(html, "html.parser")
+        return cls.parse_html(soup, paper_id)
+
+    @classmethod
+    def parse_paper(cls, file, paper_id=None):
+        file = Path(file)
         soup = cls.read_html(file)
-        return cls.parse_html(soup, file.stem)
+        if paper_id is None:
+            paper_id = file.stem
+        return cls.parse_html(soup, paper_id)
 
 
 class Author(InnerDoc):
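
The new from_html classmethod parses an in-memory HTML string under an explicit paper_id, while parse_paper keeps reading from disk but now accepts an optional paper_id override instead of always using the file stem. A small usage sketch, assuming these methods live on the PaperText document class that paper_collection.py calls (paths and ids below are placeholders):

# parse a stored HTML render; paper_id defaults to the file stem
text = PaperText.parse_paper("renders/1234.56789.html")

# parse an already-loaded HTML string under an explicit id
with open("renders/1234.56789.html", "rb") as f:
    html = f.read()
text = PaperText.from_html(html, "1234.56789")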

sota_extractor2/data/paper_collection.py

Lines changed: 43 additions & 5 deletions
@@ -7,6 +7,10 @@
 from joblib import Parallel, delayed
 from collections import UserList
 from ..helpers.jupyter import display_table
+import string
+import random
+from extract_tables import extract_tables
+
 
 class Paper:
     def __init__(self, paper_id, text, tables, annotations):
@@ -24,6 +28,33 @@ def __init__(self, paper_id, text, tables, annotations):
         else:
             self.gold_tags = ''
 
+    def table_by_name(self, name):
+        for table in self.tables:
+            if table.name == name:
+                return table
+        return None
+
+
+# todo: make sure multithreading/processing won't cause collisions
+def random_id():
+    return "temp_" + ''.join(random.choice(string.ascii_lowercase) for i in range(10))
+
+
+class TempPaper(Paper):
+    """Similar to Paper, but can be used as context manager, temporarily saving the paper to elastic"""
+    def __init__(self, html):
+        paper_id = random_id()
+        text = PaperText.from_html(html, paper_id)
+        tables = extract_tables(html)
+        super().__init__(paper_id=paper_id, text=text, tables=tables, annotations=None)
+
+    def __enter__(self):
+        self.text.save()
+        return self
+
+    def __exit__(self, exc, value, tb):
+        self.text.delete()
+
 
 arxiv_version_re = re.compile(r"v\d+$")
 def remove_arxiv_version(arxiv_id):
@@ -42,8 +73,12 @@ def _load_tables(path, annotations, jobs, migrate):
     return {f.parent.name: tbls for f, tbls in zip(files, tables)}
 
 
-def _load_annotated_papers(path):
-    dump = load_gql_dump(path, compressed=path.suffix == ".gz")["allPapers"]
+def _load_annotated_papers(data_or_path):
+    if isinstance(data_or_path, dict):
+        compressed = False
+    else:
+        compressed = data_or_path.suffix == ".gz"
+    dump = load_gql_dump(data_or_path, compressed=compressed)["allPapers"]
     annotations = {remove_arxiv_version(a.arxiv_id): a for a in dump}
     annotations.update({a.arxiv_id: a for a in dump})
     return annotations
@@ -54,21 +89,24 @@ def __init__(self, data=None):
         super().__init__(data)
 
     @classmethod
-    def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, jobs=-1, migrate=False):
+    def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
         path = Path(path)
         if annotations_path is None:
             annotations_path = path / "structure-annotations.json"
+        else:
+            annotations_path = Path(annotations_path)
         if load_texts:
             texts = _load_texts(path, jobs)
         else:
             texts = {}
 
-        annotations = _load_annotated_papers(annotations_path)
+        annotations = {}
         if load_tables:
+            if load_annotations:
+                annotations = _load_annotated_papers(annotations_path)
             tables = _load_tables(path, annotations, jobs, migrate)
         else:
             tables = {}
-            annotations = {}
         outer_join = set(texts).union(set(tables))
 
         papers = [Paper(k, texts.get(k), tables.get(k, []), annotations.get(k)) for k in outer_join]
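
TempPaper ties the pieces together: it parses the HTML into a PaperText, extracts tables with extract_tables, and keeps the text indexed in Elasticsearch only for the lifetime of a with-block (the new delete() override also removes the text's fragments). A hedged usage sketch, assuming an Elasticsearch instance configured as in sota_extractor2.config (the input path and table name are placeholders):

with open("paper.html", "rb") as f:      # placeholder path
    html = f.read()

with TempPaper(html) as paper:
    # the text is saved to elastic under a random temp_* id on __enter__
    table = paper.table_by_name("table_01")   # names follow extract_tables' table_{num:02} scheme
    print(table.caption if table is not None else "no such table")
# __exit__ deletes the temporary text again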
