Skip to content

Commit 3df4648

Browse files
committed
Make pipeline more robust
* add a temporary paper type that automatically deletes a paper from Elasticsearch
* add an unpack step
* add custom exceptions
1 parent 1e33103 commit 3df4648

File tree

12 files changed

+131
-32
lines changed

12 files changed

+131
-32
lines changed

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ dependencies:
1616
- elasticsearch-dsl=7.0.0
1717
- ipython=7.5.0
1818
- joblib=0.13.2
19+
- python-magic=0.4.15

extract_tables.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -335,9 +335,7 @@ def remove_footnotes(soup):
335335
elem.extract()
336336

337337

338-
def extract_tables(filename):
339-
with open(filename, "rb") as f:
340-
html = f.read()
338+
def extract_tables(html):
341339
soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
342340
set_ids_by_labels(soup)
343341
fix_span_tables(soup)
@@ -377,7 +375,9 @@ def extract_tables(filename):
377375
return data
378376

379377
def extract_tables_cmd(filename, outdir):
    """Command-line entry point: extract tables from an HTML file.

    Reads *filename* as raw bytes, runs the table extraction, and saves
    the resulting tables under *outdir* (created if missing).
    """
    html = Path(filename).read_bytes()
    tables = extract_tables(html)
    out_path = Path(outdir)
    out_path.mkdir(parents=True, exist_ok=True)
    save_tables(tables, out_path)

sota_extractor2/data/elastic.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from bs4 import BeautifulSoup
12
import pandas as pd
23
import re
34

@@ -260,6 +261,11 @@ def print_section(self, name, clean_up=lambda x: x):
260261
def read_html(cls, file):
261262
return read_html(file)
262263

264+
@classmethod
def from_html(cls, html, paper_id):
    """Build a paper document directly from an in-memory HTML string.

    Counterpart to the file-based constructors: parses *html* with
    BeautifulSoup and delegates to ``parse_html``.
    """
    parsed = BeautifulSoup(html, "html.parser")
    return cls.parse_html(parsed, paper_id)
263269
@classmethod
264270
def parse_paper(cls, file, paper_id=None):
265271
file = Path(file)

sota_extractor2/data/structure.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,7 @@ def __init__(self):
165165
setup_default_connection()
166166

167167
def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
    """Collect evidence rows for every table of *paper*.

    Returns one concatenated DataFrame, or an empty DataFrame when
    *tables* is empty — ``pd.concat`` raises on an empty list, so the
    guard keeps callers from having to special-case papers without
    tables.
    """
    dfs = [evidence_for_table(paper, table, paper_limit, corpus_limit)
           for table in tables]
    # Idiomatic emptiness check (truthiness) instead of len(dfs).
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs)

sota_extractor2/errors.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
class PipelineError(Exception):
    """Base class for all errors raised by the extraction pipeline."""


class UnpackError(PipelineError):
    """Raised when a downloaded source file cannot be unpacked."""


class LatexConversionError(PipelineError):
    """Raised when LaTeX-to-HTML conversion fails."""

sota_extractor2/helpers/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .temp_paper import TempPaper
2+
from .latex_converter import LatexConverter
3+
from .unpack import Unpack
4+
5+
__all__ = ["TempPaper", "LatexConverter", "Unpack"]
Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,27 @@
11
import docker
2+
from docker.errors import ContainerError, ImageNotFound
23
from pathlib import Path
4+
from tempfile import TemporaryDirectory
5+
6+
from sota_extractor2.errors import LatexConversionError
7+
38

49
def ro_bind(path):
    """Docker volume spec mounting *path* read-only."""
    return {"bind": path, "mode": "ro"}
5-
def rw_bind(path): return dict(bind=path, mode='rw')
610

711

12+
def rw_bind(path):
    """Docker volume spec mounting *path* read-write."""
    return {"bind": path, "mode": "rw"}
13+
814

915
class LatexConverter:
1016
def __init__(self, base_path):
    """Remember the script directory and connect to the local Docker daemon.

    NOTE(review): the original comment mentions pulling the
    arxivvanity/engrafo image, but nothing here pulls it — presumably it
    must already be present locally; confirm.
    """
    self.base_path = Path(base_path)
    self.client = docker.from_env()
1420

15-
def to_html(self, source_dir, output_dir):
21+
def latex2html(self, source_dir, output_dir):
1622
base = self.base_path
23+
source_dir = Path(source_dir)
24+
output_dir = Path(output_dir)
1725
volumes = {
1826
base / "latex2html.sh": ro_bind("/files/latex2html.sh"),
1927
base / "guess_main.py": ro_bind("/files/guess_main.py"), # todo: run guess_main outside of docker
@@ -25,6 +33,28 @@ def to_html(self, source_dir, output_dir):
2533
output_dir.mkdir(parents=True, exist_ok=True)
2634
filename = "index.html"
2735
command = ["/files/latex2html.sh", filename]
28-
self.client.containers.run("arxivvanity/engrafo", command, remove=True,
29-
volumes=volumes) # todo: check if command as a list protects from shell injection
30-
# todo: check for errors
36+
self.client.containers.run("arxivvanity/engrafo", command, remove=True, volumes=volumes)
37+
38+
# todo: check for errors
39+
40+
def clean_html(self, path):
    """Render the HTML file at *path* in headless Chromium and return the DOM.

    The file is mounted read-only into a chrome container and the
    post-render DOM is dumped to stdout, which we capture and decode.
    The outer ``timeout`` kills the browser after 20s as a safety net.
    """
    html_file = Path(path)
    mounts = {
        html_file.resolve(): ro_bind("/files/index.html"),
    }
    command = ("timeout -t 20 -s KILL chromium-browser --headless"
               " --disable-gpu --disable-software-rasterizer --no-sandbox"
               " --timeout=30000 --dump-dom /files/index.html")
    dom = self.client.containers.run("zenika/alpine-chrome:73", command,
                                     remove=True, entrypoint="",
                                     volumes=mounts)
    return dom.decode('utf-8')
52+
53+
def to_html(self, source_dir):
    """Convert a LaTeX source directory to a single cleaned HTML string.

    Runs the engrafo conversion into a throwaway directory, then renders
    the result through headless Chromium.  Docker container failures from
    either step are re-raised as ``LatexConversionError`` so callers can
    handle conversion problems uniformly.
    """
    with TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        try:
            self.latex2html(source_dir, tmp_path)
            return self.clean_html(tmp_path / "index.html")
        except ContainerError as err:
            raise LatexConversionError from err

sota_extractor2/helpers/temp_paper.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,23 @@
44
import string
55
import random
66

7+
78
# todo: make sure multithreading/processing won't cause collisions
def random_id():
    """Return a throwaway id of the form ``temp_`` + 10 random lowercase letters."""
    letters = (random.choice(string.ascii_lowercase) for _ in range(10))
    return "temp_" + "".join(letters)
1011

1112

12-
def temp_paper(path):
13-
text = PaperText.parse_paper(path, random_id())
14-
tables = extract_tables(path)
15-
return Paper(paper_id=text.meta['id'], text=text, tables=tables, annotations=None)
13+
class TempPaper(Paper):
    """A Paper that exists in Elasticsearch only for the duration of a ``with`` block.

    On entry the paper text is saved (indexed); on exit it is deleted
    again, regardless of whether the block raised.
    """

    def __init__(self, html):
        temp_id = random_id()
        paper_text = PaperText.from_html(html, temp_id)
        paper_tables = extract_tables(html)
        super().__init__(paper_id=temp_id, text=paper_text,
                         tables=paper_tables, annotations=None)

    def __enter__(self):
        # Index the text so evidence queries can see this paper.
        self.text.save()
        return self

    def __exit__(self, exc, value, tb):
        # Always remove the temporary document; exceptions still propagate
        # (nothing is returned, i.e. None).
        self.text.delete()

sota_extractor2/helpers/unpack.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from magic import Magic
2+
import tarfile
3+
import gzip
4+
from pathlib import Path
5+
from shutil import copyfileobj
6+
from sota_extractor2.errors import UnpackError
7+
8+
9+
class Unpack:
    """Unpack a downloaded paper source into a directory of LaTeX files.

    Sources are either a (possibly compressed) tar archive or a single
    gzipped ``.tex`` file; anything else raises ``UnpackError``.
    """

    def __init__(self):
        # uncompress=True makes libmagic look inside a gzip wrapper, so a
        # gzipped tarball is still reported as application/x-tar and a
        # gzipped .tex file as text/x-tex.
        self.magic = Magic(mime=True, uncompress=True)

    def __call__(self, source, dest):
        """Extract *source* into directory *dest* (created if missing).

        Raises:
            UnpackError: if the detected mime type is not supported.
        """
        source = Path(source)
        dest = Path(dest)
        mime = self.magic.from_file(str(source))
        if mime == 'application/x-tar':
            dest.mkdir(parents=True, exist_ok=True)
            with tarfile.open(source, "r:*") as tar:
                # NOTE(review): extractall on untrusted archives permits
                # path traversal ("tar slip") — consider validating member
                # paths before extraction.
                tar.extractall(dest)
        elif mime == 'text/x-tex':
            dest.mkdir(parents=True, exist_ok=True)
            # Bug fix: the destination must be opened for binary writing.
            # The original used the default read mode, so copyfileobj
            # failed (missing file / unwritable handle) instead of
            # writing main.tex.
            with gzip.open(source, "rb") as src, open(dest / "main.tex", "wb") as dst:
                copyfileobj(src, dst)
        else:
            raise UnpackError(f"Cannot unpack file of type {mime}")

sota_extractor2/models/linking/context_search.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ def compute_context_logprobs(self, context, noise, logprobs):
158158
dss = set(find_datasets(context)) | set(abbrvs.keys())
159159
mss = set(find_metrics(context))
160160
dss -= mss
161-
print("dss", dss)
162-
print("mss", mss)
161+
###print("dss", dss)
162+
###print("mss", mss)
163163
self.compute_logprobs(dss, mss, abbrvs, noise, logprobs)
164164

165165
def match(self, contexts):
@@ -177,9 +177,9 @@ def __call__(self, query, datasets, caption, debug_info=None):
177177
cell = debug_info.cell
178178
key = (datasets, caption, query)
179179
cellstr = f"{cell.table_ext_id}/{cell.row}.{cell.col}"
180-
print(f"[DEBUG] {cellstr}")
181-
print("[DEBUG]", debug_info)
182-
print("query:", query, caption)
180+
###print(f"[DEBUG] {cellstr}")
181+
###print("[DEBUG]", debug_info)
182+
###print("query:", query, caption)
183183
if key in self.queries:
184184
# print(self.queries[key])
185185
for context in key:
@@ -188,10 +188,10 @@ def __call__(self, query, datasets, caption, debug_info=None):
188188
dss = set(find_datasets(context)) | set(abbrvs.keys())
189189
mss = set(find_metrics(context))
190190
dss -= mss
191-
print("dss", dss)
192-
print("mss", mss)
191+
###print("dss", dss)
192+
###print("mss", mss)
193193

194-
print("Taking result from cache")
194+
###print("Taking result from cache")
195195
p = self.queries[key]
196196
else:
197197
dist = self.match(key)
@@ -212,7 +212,7 @@ def __call__(self, query, datasets, caption, debug_info=None):
212212

213213
self.queries[key] = p
214214

215-
print(p)
215+
###print(p)
216216

217217
# error analysis only
218218
if self.debug_gold_df is not None:

0 commit comments

Comments
 (0)