Add the datasets and extractor helpers

Marcin Kardas · Marcin Kardas · commit 281c90771573 · 2020-05-05T23:37:16.000+01:00
diff --git a/axcell/helpers/datasets.py b/axcell/helpers/datasets.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+
+def read_arxiv_papers(path):
+    """Read the CSV file at ``path`` with pandas and return the result.
+
+    ``path`` is passed to ``pandas.read_csv`` unchanged, with default options.
+    """
+    papers = pd.read_csv(path)
+    return papers
+
+
+def read_tables(path):
+    """Read the JSON file at ``path`` with pandas and return the result.
+
+    ``path`` is passed to ``pandas.read_json`` unchanged, with default options.
+    """
+    tables = pd.read_json(path)
+    return tables
+
diff --git a/axcell/helpers/paper_extractor.py b/axcell/helpers/paper_extractor.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+from axcell.helpers import LatexConverter, Unpack
+from axcell.errors import UnpackError, LatexConversionError
+from axcell.data.elastic import Paper as PaperText
+import axcell.data.extract_tables as table_extraction
+
+import re
+import warnings
+
+# File names starting with a new-style arXiv identifier (four digits, a dot,
+# more digits, optional ``vN`` version), optionally followed by dot-separated
+# suffixes: "1903.11816v1.tar.gz" yields arxiv_id "1903.11816v1".
+arxiv_re = re.compile(r"^(?P<arxiv_id>\d{4}\.\d+(v\d+)?)(\..*)?$")
+
+
+class PaperExtractor:
+    """Unpack a paper source, convert it to HTML and extract text and tables.
+
+    All outputs are written below ``root``:
+
+    * ``unpacked_sources/<subpath>`` -- destination passed to ``Unpack``,
+    * ``htmls/<subpath>/index.html`` -- HTML returned by ``LatexConverter``,
+    * ``papers/<subpath>/text.json`` -- ``PaperText.from_html(...).to_json()``,
+    * ``papers/<subpath>`` -- directory passed to ``save_tables``.
+
+    ``<subpath>`` is the source's parent directory relative to
+    ``root/sources`` joined with the arXiv id.
+    """
+
+    def __init__(self, root):
+        """Remember the ``root`` directory and create the helper objects."""
+        self.root = Path(root)
+        self.unpack = Unpack()
+        self.latex = LatexConverter()
+
+    def __call__(self, source):
+        """Run the extraction for ``source`` and return a status string.
+
+        Args:
+            source: Path to the paper source; it must be located under
+                ``root/sources``, otherwise ``Path.relative_to`` raises
+                ``ValueError``.
+
+        Returns:
+            ``'withdrawn'`` when unpacking fails with a message starting
+            with "The paper has been withdrawn", ``'no-tex'`` for any other
+            ``UnpackError``, ``'processing-error'`` when ``to_html`` raises
+            ``LatexConversionError``, ``'success'`` otherwise.
+        """
+        source = Path(source)
+
+        m = arxiv_re.match(source.name)
+        if m:
+            arxiv_id = m.group('arxiv_id')
+        else:
+            # Fall back to the full file name so processing can continue.
+            warnings.warn(f'Unable to infer arxiv_id from "{source.name}" filename')
+            arxiv_id = source.name
+
+        subpath = source.relative_to(self.root / 'sources').parent / arxiv_id
+        unpack_path = self.root / 'unpacked_sources' / subpath
+        try:
+            self.unpack(source, unpack_path)
+        except UnpackError as e:
+            # Python 3 exceptions only have ``message`` when the class
+            # defines it, so fall back to ``str(e)`` instead of risking an
+            # AttributeError inside the handler.
+            message = str(getattr(e, 'message', e))
+            if message.startswith('The paper has been withdrawn'):
+                return 'withdrawn'
+            return 'no-tex'
+
+        try:
+            html = self.latex.to_html(unpack_path)
+        except LatexConversionError:
+            return 'processing-error'
+        html_path = self.root / 'htmls' / subpath / 'index.html'
+        html_path.parent.mkdir(parents=True, exist_ok=True)
+        html_path.write_text(html, 'utf-8')
+
+        text_path = self.root / 'papers' / subpath / 'text.json'
+        doc = PaperText.from_html(html, arxiv_id)
+        text_path.parent.mkdir(parents=True, exist_ok=True)
+        text_path.write_text(doc.to_json(), 'utf-8')
+
+        tables_path = self.root / 'papers' / subpath
+        tables = table_extraction.extract_tables(html)
+        table_extraction.save_tables(tables, tables_path)
+
+        return 'success'