Skip to content

Commit 917f1b3

Browse files
committed
Test multiprocessing
1 parent b40eefe commit 917f1b3

File tree

3 files changed

+27
-19
lines changed

3 files changed

+27
-19
lines changed

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ dependencies:
1515
- Unidecode=1.0.23
1616
- elasticsearch-dsl=7.0.0
1717
- ipython=7.5.0
18+
- tqdm=4.28.1
19+
- joblib=0.13.2

sota_extractor2/data/paper_collection.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from .json import load_gql_dump
44
from pathlib import Path
55
import re
6+
from tqdm import tqdm
7+
from joblib import Parallel, delayed
68

79
class Paper:
810
def __init__(self, text, tables, annotations):
@@ -15,11 +17,11 @@ def __init__(self, text, tables, annotations):
1517

1618

1719
arxiv_version_re = re.compile(r"v\d+$")
18-
def clean_arxiv_version(arxiv_id):
20+
def clear_arxiv_version(arxiv_id):
1921
return arxiv_version_re.sub("", arxiv_id)
2022

2123

22-
class PaperCollection:
24+
class PaperCollection(dict):
2325
def __init__(self, path, load_texts=True, load_tables=True):
2426
self.path = path
2527
self.load_texts = load_texts
@@ -50,27 +52,20 @@ def __iter__(self):
5052
return iter(self._papers)
5153

5254
def _load_texts(self):
53-
texts = {}
54-
55-
for f in (self.path / "texts").glob("**/*.json"):
56-
text = PaperText.from_file(f)
57-
texts[clean_arxiv_version(text.meta.id)] = text
58-
return texts
55+
files = list((self.path / "texts").glob("**/*.json"))
56+
texts = Parallel(n_jobs=-1, prefer="processes")(delayed(PaperText.from_file)(f) for f in files)
57+
return {clear_arxiv_version(text.meta.id): text for text in texts}
5958

6059

6160
def _load_tables(self, annotations):
62-
tables = {}
63-
64-
for f in (self.path / "tables").glob("**/metadata.json"):
65-
paper_dir = f.parent
66-
tbls = read_tables(paper_dir, annotations)
67-
tables[clean_arxiv_version(paper_dir.name)] = tbls
68-
return tables
61+
files = list((self.path / "tables").glob("**/metadata.json"))
62+
tables = Parallel(n_jobs=-1, prefer="processes")(delayed(read_tables)(f.parent, annotations) for f in files)
63+
return {clear_arxiv_version(f.parent.name): tbls for f, tbls in zip(files, tables)}
6964

7065
def _load_annotated_papers(self):
7166
dump = load_gql_dump(self.path / "structure-annotations.json.gz", compressed=True)["allPapers"]
7267
annotations = {}
7368
for a in dump:
74-
arxiv_id = clean_arxiv_version(a.arxiv_id)
69+
arxiv_id = clear_arxiv_version(a.arxiv_id)
7570
annotations[arxiv_id] = a
7671
return annotations

sota_extractor2/data/table.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,34 @@
11
import pandas as pd
22
import json
33
from pathlib import Path
4-
from dataclasses import dataclass
4+
import re
5+
from dataclasses import dataclass, field
56
from typing import List
67
from ..helpers.jupyter import display_table
78

89
@dataclass
910
class Cell:
1011
value: str
1112
gold_tags: str = ''
12-
refs: List[str] = None
13+
refs: List[str] = field(default_factory=list)
1314

1415

16+
reference_re = re.compile(r"\[(xxref-[^] ]*)\]")
17+
def extract_references(s):
18+
parts = reference_re.split(s)
19+
return ''.join(parts[::2]), parts[1::2]
20+
21+
22+
def str2cell(s):
23+
value, refs = extract_references(s)
24+
return Cell(value=value, refs=refs)
25+
1526
class Table:
1627
def __init__(self, df, caption=None, figure_id=None, annotations=None):
1728
self.df = df
1829
self.caption = caption
1930
self.figure_id = figure_id
20-
self.df = df.applymap(lambda x: Cell(value=x))
31+
self.df = df.applymap(str2cell)
2132
if annotations is not None:
2233
self.gold_tags = annotations.gold_tags.strip()
2334
tags = annotations.matrix_gold_tags

0 commit comments

Comments (0)