
Commit 6489719

Author: Marcin Kardas (committed)

Reference extraction

1 parent 3e76133 commit 6489719

File tree: 5 files changed, +58 −40 lines

init_references.py (3 additions & 2 deletions)

@@ -2,7 +2,8 @@
 import json
 from pathlib import Path
 from collections import Counter
-from sota_extractor2.data.elastic import Reference2, setup_default_connection
+from sota_extractor2.data.elastic import Reference2
+from elasticsearch_dsl import connections
 from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore
 from tqdm import tqdm
 from elasticsearch.helpers import bulk
@@ -13,7 +14,7 @@
 # required for bulk saving
 http.client._MAXHEADERS = 1000

-setup_default_connection()
+connections.create_connection(hosts=['elasticsearch'], timeout=20)

 papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")

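The change above replaces the project's setup_default_connection() helper with an explicit elasticsearch_dsl connection. A minimal sketch of what that call sets up, assuming an Elasticsearch node reachable under the host name 'elasticsearch' (taken from the diff); the ping() check is illustrative and not part of the commit:

import re  # not needed here; shown only if the script is run standalone
from elasticsearch_dsl import connections

# Register the default connection that elasticsearch_dsl Document classes
# (such as Reference2, presumably) use implicitly when saving or searching.
es = connections.create_connection(hosts=['elasticsearch'], timeout=20)
print(es.ping())  # True once the cluster is reachable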

parse_references.py (1 addition & 27 deletions)

@@ -17,33 +17,7 @@
 pc = PaperCollection.from_pickle("/mnt/efs/pwc/data/pc-small-noann.pkl")


-def get_refstrings(p):
-    paper = p.text if hasattr(p, 'text') else p
-    if not hasattr(paper, 'fragments'):
-        return
-    fragments = paper.fragments
-    ref_sec_started = False
-    for f in reversed(fragments):
-        if f.header.startswith('xxanchor-bib'):
-            ref_sec_started = True
-            yield f.text
-        elif ref_sec_started:
-            # todo: check if a paper can have multiple bibliography sections
-            # (f.e., one in the main paper and one in the appendix)
-            break  # the refsection is only at the end of paper
-
-
-_ref_re = regex.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
-def extract_refs(p):
-    for ref in get_refstrings(p):
-        m = _ref_re.match(ref)
-        if m:
-            ref_id, ref_str = m.groups()
-            yield {
-                "paper_arxiv_id": p.arxiv_no_version,
-                "ref_id": ref_id,
-                "ref_str": ref_str.strip(r'\s')
-            }
+

 class PaperCollectionReferenceParser:
     def __init__(self):
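The two helpers deleted here are re-added below in sota_extractor2/data/references.py, so this script would presumably import them from their new location, e.g.:

# Assumed import after the move; the new import line itself is not shown in this commit.
from sota_extractor2.data.references import get_refstrings, extract_refs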

sota_extractor2/data/references.py (30 additions & 0 deletions)

@@ -344,3 +344,33 @@ def sync(self):
                 p.save()
             except ConflictError:
                 pass
+
+
+def get_refstrings(p):
+    paper = p.text if hasattr(p, 'text') else p
+    if not hasattr(paper, 'fragments'):
+        return
+    fragments = paper.fragments
+    ref_sec_started = False
+    for f in reversed(fragments):
+        if f.header.startswith('xxanchor-bib'):
+            ref_sec_started = True
+            yield f.text
+        elif ref_sec_started:
+            # todo: check if a paper can have multiple bibliography sections
+            # (f.e., one in the main paper and one in the appendix)
+            break  # the refsection is only at the end of paper
+
+
+
+_ref_re = re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
+def extract_refs(p):
+    for ref in get_refstrings(p):
+        m = _ref_re.match(ref)
+        if m:
+            ref_id, ref_str = m.groups()
+            yield {
+                "paper_arxiv_id": p.arxiv_no_version,
+                "ref_id": ref_id,
+                "ref_str": ref_str.strip(r'\s')
+            }
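A quick sketch of what the anchor-matching regex extracts from a bibliography fragment and of the record shape extract_refs yields. The SimpleNamespace objects are illustrative stand-ins for the repository's paper and fragment classes, modelling only the attributes these helpers touch:

import re
from types import SimpleNamespace

# Hypothetical stand-ins for a paper and one of its bibliography fragments.
fragment = SimpleNamespace(
    header="xxanchor-bib xxanchor-smith2019",
    text="xxanchor-smith2019 J. Smith. A Paper About Papers. 2019.",
)
paper = SimpleNamespace(fragments=[fragment], arxiv_no_version="1234.56789")

_ref_re = re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
ref_id, ref_str = _ref_re.match(fragment.text).groups()
# This mirrors the dict that extract_refs yields for each matched reference.
record = {"paper_arxiv_id": paper.arxiv_no_version, "ref_id": ref_id, "ref_str": ref_str}
print(record)
# {'paper_arxiv_id': '1234.56789', 'ref_id': 'smith2019',
#  'ref_str': 'J. Smith. A Paper About Papers. 2019.'}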

sota_extractor2/data/table.py (1 addition & 1 deletion)

@@ -24,7 +24,7 @@ class Cell:

 def extract_references(s):
     parts = reference_re.split(s)
-    refs = parts[1::3]
+    refs = [r.replace('-', '') for r in parts[1::3]]
     text = []
     for i, x in enumerate(parts):
         if i % 3 == 0:
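For context, how the parts[1::3] slice and the new hyphen stripping behave. reference_re is not shown in this commit, so the two-capture-group pattern below is only an assumed stand-in:

import re

# Assumed stand-in for reference_re: two capturing groups, so re.split
# interleaves plain text with the two groups of every match.
reference_re = re.compile(r'xxref-([a-zA-Z0-9-]+)(\s?)')

parts = reference_re.split("see xxref-deep-residual-networks for details")
# parts == ['see ', 'deep-residual-networks', ' ', 'for details']
refs = [r.replace('-', '') for r in parts[1::3]]
print(refs)  # ['deepresidualnetworks']  (hyphens removed by the new line)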

sota_extractor2/helpers/cache.py (23 additions & 10 deletions)

@@ -1,6 +1,7 @@
 import pandas as pd
 import json
 from collections import defaultdict
+from pathlib import Path


 # these functions are used to cache various results
@@ -10,26 +11,38 @@
 # can be changed.


+def _load_json(path):
+    with Path(path).open('rt') as f:
+        return json.load(f)
+
+
+def _save_json(obj, path):
+    with Path(path).open('wt') as f:
+        json.dump(obj, f)
+
+
+def load_references(path):
+    return _load_json(path)
+
+
+def save_references(references, path):
+    _save_json(references, path)
+
+
 def load_tags(path):
-    with open(path, 'rt') as f:
-        tags = json.load(f)
-    return tags
+    return _load_json(path)


 def save_tags(tags, path):
-    with open(path, 'wt') as f:
-        json.dump(tags, f)
+    _save_json(tags, path)


 def load_structure(path):
-    with open(path, 'rt') as f:
-        structure = json.load(f)
-    return structure
+    return _load_json(path)


 def save_structure(structure, path):
-    with open(path, 'wt') as f:
-        json.dump(structure, f)
+    _save_json(structure, path)


 def load_proposals(path):
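A minimal round-trip sketch for the new reference cache helpers; the temporary path and the shape of the cached records are illustrative assumptions, not values from the commit:

from sota_extractor2.helpers.cache import save_references, load_references

refs = [{"paper_arxiv_id": "1234.56789",
         "ref_id": "smith2019",
         "ref_str": "J. Smith. A Paper About Papers. 2019."}]
save_references(refs, "/tmp/references.json")   # serialized via _save_json
assert load_references("/tmp/references.json") == refs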
