
Commit 1e33103

Assemble pipeline
1 parent c04a03c commit 1e33103

21 files changed: +888 -80 lines changed

extract_tables.py

Lines changed: 11 additions & 14 deletions

@@ -13,8 +13,7 @@
 from dataclasses import dataclass
 from typing import Set

-from tabular import Tabular
-
+from sota_extractor2.data.table import Table

 # begin of dirty hack
 # pandas parsing of html tables is really nice
@@ -265,18 +264,13 @@ def html2data(table):
     return data[0] if len(data) == 1 else None


-def save_table(data, filename):
-    data.to_csv(filename, header=None, index=None)
-
-
 def save_tables(data, outdir):
     metadata = []

     for num, table in enumerate(data, 1):
         filename = f"table_{num:02}.csv"
         layout = f"layout_{num:02}.csv"
-        save_table(table.data, outdir / filename)
-        save_table(table.layout, outdir / layout)
+        table.save(outdir, filename, layout)
         metadata.append(dict(filename=filename, layout=layout, caption=table.caption, figure_id=table.figure_id))
     with open(outdir / "metadata.json", "w") as f:
         json.dump(metadata, f)
@@ -341,11 +335,9 @@ def remove_footnotes(soup):
         elem.extract()


-def extract_tables(filename, outdir):
+def extract_tables(filename):
     with open(filename, "rb") as f:
         html = f.read()
-    outdir = Path(outdir)
-    outdir.mkdir(parents=True, exist_ok=True)
     soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
     set_ids_by_labels(soup)
     fix_span_tables(soup)
@@ -381,8 +373,13 @@ def extract_tables(filename, outdir):
         if cap_el is not None:
             caption = clear_ws(cap_el.get_text())
         figure_id = table.get("data-figure-id")
-        data.append(Tabular(tab, layout, caption, figure_id))
+        data.append(Table(f"table_{len(data)+1:02}", tab, layout.applymap(str), caption, figure_id))
+    return data

-    save_tables(data, outdir)
+def extract_tables_cmd(filename, outdir):
+    tables = extract_tables(filename)
+    outdir = Path(outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    save_tables(tables, outdir)

-if __name__ == "__main__": fire.Fire(extract_tables)
+if __name__ == "__main__": fire.Fire(extract_tables_cmd)
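
A minimal usage sketch of the refactored module (the input path is made up): extract_tables now returns Table objects in memory, while extract_tables_cmd keeps the old save-to-directory behaviour for the fire CLI.

    from extract_tables import extract_tables, extract_tables_cmd

    # pure extraction: returns a list of Table objects, writes nothing to disk
    tables = extract_tables("htmls/1234.5678/index.html")
    for table in tables:
        print(table.name, table.caption)

    # CLI wrapper, same as: python extract_tables.py htmls/1234.5678/index.html out/tables
    extract_tables_cmd("htmls/1234.5678/index.html", "out/tables")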

sota_extractor2/config.py

Lines changed: 4 additions & 0 deletions

@@ -22,3 +22,7 @@

 datasets = data/"datasets"
 datasets_structure = datasets/"structure"
+structure_models = datasets / "structure" / "models"
+
+linking_models = datasets / "linking" / "models"
+linking_data = datasets / "linking" / "data"

sota_extractor2/data/elastic.py

Lines changed: 14 additions & 4 deletions

@@ -162,9 +162,10 @@ def from_json(cls, json, paper_id=None):
         return paper

     @classmethod
-    def from_file(cls, path):
+    def from_file(cls, path, paper_id=None):
         path = Path(path)
-        paper_id = path.parent.name
+        if paper_id is None:
+            paper_id = path.parent.name
         with open(path, "rt") as f:
             json = f.read()
         return cls.from_json(json, paper_id)
@@ -187,6 +188,12 @@ def save(self, **kwargs):
         else:
             return super().save(**kwargs)

+    def delete(self, **kwargs):
+        if hasattr(self, 'fragments'):
+            for f in self.fragments:
+                f.delete()
+        return super().delete(**kwargs)
+
     @classmethod
     def parse_html(cls, soup, paper_id):
         put_dummy_anchors(soup)
@@ -254,9 +261,12 @@ def read_html(cls, file):
         return read_html(file)

     @classmethod
-    def parse_paper(cls, file):
+    def parse_paper(cls, file, paper_id=None):
+        file = Path(file)
         soup = cls.read_html(file)
-        return cls.parse_html(soup, file.stem)
+        if paper_id is None:
+            paper_id = file.stem
+        return cls.parse_html(soup, paper_id)


 class Author(InnerDoc):
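
The new optional paper_id parameters let callers index a paper under an explicit id instead of one derived from the file path, and delete() now cleans up the paper's indexed text fragments. A sketch, assuming parse_paper returns the Paper document (the file layout is made up):

    from sota_extractor2.data.elastic import Paper

    # explicit id; with paper_id=None it falls back to file.stem as before
    text = Paper.parse_paper("htmls/1234.5678/index.html", paper_id="1234.5678")

    # removes the paper together with its fragments from the index
    text.delete()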

sota_extractor2/data/structure.py

Lines changed: 46 additions & 26 deletions

@@ -3,8 +3,9 @@
 from collections import namedtuple
 import hashlib
 from fastai.text import progress_bar
-from .elastic import Fragment
+from .elastic import Fragment, setup_default_connection
 from .json import *
+from .table import reference_re, remove_text_styles, remove_references, style_tags_re

 def get_all_tables(papers):
     for paper in papers:
@@ -13,11 +14,18 @@ def get_all_tables(papers):
             table.paper_id = paper.arxiv_id
             yield table

-def consume_cells(*matrix):
+def consume_cells(table):
     Cell = namedtuple('AnnCell', 'row col vals')
-    for row_id, row in enumerate(zip(*matrix)):
-        for col_id, cell_val in enumerate(zip(*row)):
-            yield Cell(row=row_id, col=col_id, vals=cell_val)
+    for row_id, row in enumerate(table.df.values):
+        for col_id, cell in enumerate(row):
+            vals = [
+                remove_text_styles(remove_references(cell.raw_value)),
+                "",
+                cell.refs[0] if cell.refs else "",
+                cell.layout,
+                bool(style_tags_re.search(cell.raw_value))
+            ]
+            yield Cell(row=row_id, col=col_id, vals=vals)


 reference_re = re.compile(r"\[[^]]*\]")
@@ -38,10 +46,12 @@ def empty_fragment(paper_id):
     return fragment


-def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpus_limit=10):
+def fetch_evidence(cell_content, cell_reference, paper_id, table_name, row, col, paper_limit=10, corpus_limit=10):
+    if not filter_cells(cell_content):
+        return [empty_fragment(paper_id)]
     cell_content = clear_cell(cell_content)
     if cell_content == "" and cell_reference == "":
-        return []
+        return [empty_fragment(paper_id)]

     evidence_query = Fragment.search().highlight(
         'text', pre_tags="<b>", post_tags="</b>", fragment_size=400)
@@ -65,8 +75,11 @@ def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpu
     other_fagements = list(evidence_query
                            .exclude('term', paper_id=paper_id)
                            .query('match_phrase', text=query)[:corpus_limit])
-    if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements):
-        print(f"No evidences for '{cell_content}' of {paper_id}")
+
+    ext_id = f"{paper_id}/{table_name}/{row}.{col}"
+    ####print(f"{ext_id} |{cell_content}|: {len(paper_fragments)} paper fragments, {len(reference_fragments)} reference fragments, {len(other_fagements)} other fragments")
+    # if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements):
+    #     print(f"No evidences for '{cell_content}' of {paper_id}")
     if not len(paper_fragments) and not len(reference_fragments):
         paper_fragments = [empty_fragment(paper_id)]
     return paper_fragments + reference_fragments + other_fagements
@@ -86,13 +99,13 @@ def fix_reference_hightlight(s):
     return partial_highlight_re.sub("xxref-", s)


-def create_evidence_records(textfrag, cell, table):
+def create_evidence_records(textfrag, cell, paper, table):
     for text_highlited in textfrag.meta['highlight']['text']:
         text_highlited = fix_reference_hightlight(fix_refs(text_highlited))
         text = highlight_re.sub("", text_highlited)
         text_sha1 = hashlib.sha1(text.encode("utf-8")).hexdigest()

-        cell_ext_id = f"{table.ext_id}/{cell.row}/{cell.col}"
+        cell_ext_id = f"{paper.paper_id}/{table.name}/{cell.row}/{cell.col}"

         yield {"text_sha1": text_sha1,
                "text_highlited": text_highlited,
@@ -103,46 +116,53 @@ def create_evidence_records(textfrag, cell, table):
                "cell_reference": cell.vals[2],
                "cell_layout": cell.vals[3],
                "cell_styles": cell.vals[4],
-               "this_paper": textfrag.paper_id == table.paper_id,
+               "this_paper": textfrag.paper_id == paper.paper_id,
                "row": cell.row,
                "col": cell.col,
-               "row_context": " border ".join([str(s) for s in table.matrix[cell.row]]),
-               "col_context": " border ".join([str(s) for s in table.matrix[:, cell.col]]),
+               "row_context": " border ".join([str(s) for s in table.matrix.values[cell.row]]),
+               "col_context": " border ".join([str(s) for s in table.matrix.values[:, cell.col]]),
                "ext_id": cell_ext_id
                #"table_id":table_id
                }


-def filter_cells(cell):
-    return re.search("[a-zA-Z]{2,}", cell.vals[1]) is not None
+def filter_cells(cell_content):
+    return re.search("[a-zA-Z]{2,}", cell_content) is not None


 interesting_types = ["model-paper", "model-best", "model-competing", "dataset", "dataset-sub", "dataset-task"]


-def evidence_for_table(table, paper_limit=10, corpus_limit=1, limit_type='interesting'):
-    def get_limits(cell_type):
-        if limit_type == 'interesting' and (cell_type.strip() in interesting_types) or (limit_type == 'max'):
-            return dict(paper_limit=1000, corpus_limit=1000)
-        return dict(paper_limit=paper_limit, corpus_limit=corpus_limit)
+def evidence_for_table(paper, table, paper_limit, corpus_limit):
     records = [
         record
-        for cell in consume_cells(table.matrix, table.matrix_gold_tags, table.matrix_references, table.matrix_layout, table.matrix_styles) if filter_cells(cell)
-        for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=table.paper_id, **get_limits(cell.vals[1]))
-        for record in create_evidence_records(evidence, cell, table=table)
+        for cell in consume_cells(table)
+        for evidence in fetch_evidence(cell.vals[0], cell.vals[2], paper_id=paper.paper_id, table_name=table.name,
+                                       row=cell.row, col=cell.col, paper_limit=paper_limit, corpus_limit=corpus_limit)
+        for record in create_evidence_records(evidence, cell, paper=paper, table=table)
     ]
     df = pd.DataFrame.from_records(records)
     return df


-def prepare_data(tables, csv_path, limit_type='interesting'):
-    df = pd.concat([evidence_for_table(table,
+def prepare_data(paper, tables, csv_path, limit_type='interesting'):
+    df = pd.concat([evidence_for_table(paper, table,
                                        paper_limit=100,
                                        corpus_limit=20,
                                        limit_type=limit_type) for table in progress_bar(tables)])
     #moved to experiment preprocessing
     #df = df.drop_duplicates(
     #    ["cell_content", "text_highlited", "cell_type", "this_paper"])
     print("Number of text fragments ", len(df))
+
     csv_path.parent.mkdir(parents=True, exist_ok=True)
     df.to_csv(csv_path, index=None)
+
+
+class CellEvidenceExtractor:
+    def __init__(self):
+        # todo: make sure can be called more than once or refactor to singleton
+        setup_default_connection()
+
+    def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
+        return pd.concat([evidence_for_table(paper, table, paper_limit, corpus_limit) for table in tables])
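
CellEvidenceExtractor bundles the whole evidence-retrieval step behind one callable. A sketch under stated assumptions: temp_paper comes from the helper added later in this commit, the path is made up, and a populated Elasticsearch index is required.

    from sota_extractor2.data.structure import CellEvidenceExtractor
    from sota_extractor2.helpers.temp_paper import temp_paper

    paper = temp_paper("htmls/1234.5678/index.html")
    cell_evidences = CellEvidenceExtractor()   # opens the default Elasticsearch connection
    df = cell_evidences(paper, paper.tables)   # one record per (cell, evidence fragment)
    print(len(df), list(df.columns))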

sota_extractor2/data/table.py

Lines changed: 36 additions & 7 deletions

@@ -36,6 +36,12 @@ def extract_references(s):
     return text, refs


+empty_paren_re = re.compile(r"\(\s*\)|\[\s*\]")
+def remove_references(s):
+    s = reference_re.sub("", s)
+    return empty_paren_re.sub("", s)
+
+
 style_tags_re = re.compile(r"</?(bold|italic|red|green|blue)>")
 def remove_text_styles(s):
     return style_tags_re.sub("", s)
@@ -76,10 +82,7 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
         self.old_name = old_name

         if layout is not None:
-            #self.layout = layout
-            for r, row in layout.iterrows():
-                for c, cell in enumerate(row):
-                    self.df.iloc[r,c].layout = cell
+            self.set_layout(layout)

         if annotations is not None:
             self.gold_tags = annotations.gold_tags.strip()
@@ -97,9 +100,7 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
             elif gt_rows > 0:
                 gt_cols = len(tags[0])
             if self.df.shape != (0,0) and self.df.shape == (gt_rows, gt_cols):
-                for r, row in enumerate(tags):
-                    for c, cell in enumerate(row):
-                        self.df.iloc[r,c].gold_tags = cell.strip()
+                self.set_tags(tags)
             else:
                 print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
                 # print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
@@ -112,14 +113,34 @@ def __init__(self, name, df, layout, caption=None, figure_id=None, annotations=N
         self.dataset_text = ''
         self.notes = ''

+    def set_layout(self, layout):
+        for r, row in layout.iterrows():
+            for c, cell in enumerate(row):
+                self.df.iloc[r, c].layout = cell
+
+    def set_tags(self, tags):
+        for r, row in tags.iterrows():
+            for c, cell in enumerate(row):
+                # todo: change gold_tags to tags to avoid confusion
+                self.df.iloc[r,c].gold_tags = cell.strip()
+
     @property
     def matrix(self):
         return self.df.applymap(lambda x: x.value)

+    @property
+    def matrix_layout(self):
+        return self.df.applymap(lambda x: x.layout)
+
     @property
     def matrix_gold_tags(self):
         return self.df.applymap(lambda x: x.gold_tags)

+    # todo: remove gold_tags
+    @property
+    def matrix_tags(self):
+        return self.matrix_gold_tags
+
     @classmethod
     def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=None, guessed_tags=None):
         path = Path(path)
@@ -146,6 +167,14 @@ def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=N
     def display(self):
         display_table(self.df.applymap(lambda x: raw_value_to_html(x.raw_value)).values, self.df.applymap(lambda x: x.gold_tags).values, self.df.applymap(lambda x:x.layout).values)

+    def _save_df(self, df, filename):
+        df.to_csv(filename, header=None, index=None)
+
+    def save(self, path, table_name, layout_name):
+        path = Path(path)
+        self._save_df(self.df.applymap(lambda x: x.raw_value), path / table_name)
+        self._save_df(self.df.applymap(lambda x: x.layout), path / layout_name)
+
 #####
 # this code is used to migrate table annotations from
 # tables parsed by htlatex to tables parsed by
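
Table now owns its serialization, which is what lets save_tables in extract_tables.py delegate to table.save. A sketch, assuming table is one of the objects returned by extract_tables and the directory is made up:

    from pathlib import Path

    outdir = Path("out/tables")
    outdir.mkdir(parents=True, exist_ok=True)
    table.save(outdir, "table_01.csv", "layout_01.csv")  # raw cell values and per-cell layout as two CSVs

    layout = table.matrix_layout  # new property: DataFrame view of each cell's layout tag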
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+import docker
+from pathlib import Path
+
+def ro_bind(path): return dict(bind=path, mode='ro')
+def rw_bind(path): return dict(bind=path, mode='rw')
+
+
+
+class LatexConverter:
+    def __init__(self, base_path):
+        # pull arxivvanity/engrafo image
+        self.client = docker.from_env()
+        self.base_path = Path(base_path)
+
+    def to_html(self, source_dir, output_dir):
+        base = self.base_path
+        volumes = {
+            base / "latex2html.sh": ro_bind("/files/latex2html.sh"),
+            base / "guess_main.py": ro_bind("/files/guess_main.py"),  # todo: run guess_main outside of docker
+            base / "patches": ro_bind("/files/patches"),  # todo: see which patches can be dropped
+            source_dir.resolve(): ro_bind("/files/ro-source"),
+            output_dir.resolve(): rw_bind("/files/htmls")
+        }
+
+        output_dir.mkdir(parents=True, exist_ok=True)
+        filename = "index.html"
+        command = ["/files/latex2html.sh", filename]
+        self.client.containers.run("arxivvanity/engrafo", command, remove=True,
+                                   volumes=volumes)  # todo: check if command as a list protects from shell injection
+        # todo: check for errors
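
A usage sketch for the converter above. The diff header lost the new file's name, so the import path below is a guess; the directories are made up, and a running Docker daemon with the arxivvanity/engrafo image pulled is required.

    from pathlib import Path
    from sota_extractor2.helpers.latex_converter import LatexConverter  # hypothetical module path

    converter = LatexConverter(base_path=Path("docker-scripts"))  # dir with latex2html.sh, guess_main.py, patches
    converter.to_html(Path("sources/1234.5678"), Path("htmls/1234.5678"))
    # on success the rendered paper is at htmls/1234.5678/index.html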

sota_extractor2/helpers/temp_paper.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from ..data.elastic import Paper as PaperText
+from ..data.paper_collection import Paper
+from extract_tables import extract_tables
+import string
+import random
+
+# todo: make sure multithreading/processing won't cause collisions
+def random_id():
+    return "temp_" + ''.join(random.choice(string.ascii_lowercase) for i in range(10))
+
+
+def temp_paper(path):
+    text = PaperText.parse_paper(path, random_id())
+    tables = extract_tables(path)
+    return Paper(paper_id=text.meta['id'], text=text, tables=tables, annotations=None)
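
temp_paper ties the pipeline together: it indexes the HTML under a random temporary id and extracts tables in memory, yielding a Paper object without touching the annotated corpus. A sketch (path made up):

    from sota_extractor2.helpers.temp_paper import temp_paper

    paper = temp_paper("htmls/1234.5678/index.html")
    print(paper.paper_id)    # e.g. "temp_kxqzjwblam", random to avoid collisions
    print(len(paper.tables))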
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+from .taxonomy import Taxonomy
+from .linker import Linker
+from .context_search import ContextSearch, DatasetExtractor
+from .proposals_filters import *
+
+__all__ = ["Taxonomy", "Linker", "ContextSearch", "DatasetExtractor", "ProposalsFilter", "NopFilter",
+           "BestResultFilter", "StructurePredictionFilter", "ConfidenceFilter", "CompoundFilter"]
