
Commit acdd321

Merge pull request #4 from paperswithcode/references
References
2 parents 9fc128a + 6b85869 commit acdd321

File tree

5 files changed: +498 −5 lines changed

parse_references.py

Lines changed: 90 additions & 0 deletions
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import regex
import diskcache

from pathlib import Path
from joblib import Parallel, delayed

from fastai.text import progress_bar

from sota_extractor2.data.references import *
from functools import lru_cache
from sota_extractor2.data.elastic import *

from sota_extractor2.data.paper_collection import PaperCollection

connections.create_connection(hosts=['10.0.1.145'], timeout=20)

pc = PaperCollection.from_pickle("/mnt/efs/pwc/data/pc-small-noann.pkl")


def get_refstrings(p):
    paper = p.text if hasattr(p, 'text') else p
    if not hasattr(paper, 'fragments'):
        return
    fragments = paper.fragments
    ref_sec_started = False
    for f in reversed(fragments):
        if f.header.startswith('xxanchor-bib'):
            ref_sec_started = True
            yield f.text
        elif ref_sec_started:
            break  # the reference section appears only at the end of a paper


_ref_re = regex.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')


def extract_refs(p):
    for ref in get_refstrings(p):
        m = _ref_re.match(ref)
        if m:
            ref_id, ref_str = m.groups()
            yield {
                "paper_arxiv_id": p.arxiv_no_version,
                "ref_id": ref_id,
                "ref_str": ref_str.strip(),
            }


class PaperCollectionReferenceParser:
    def __init__(self):
        self.refsdb = ReferenceStore()
        self.cache = diskcache.Cache(Path.home() / '.cache' / 'refs' / 'refs_ids.db')

    # context-manager support, so main() below can release the cache cleanly
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cache.close()

    def parse_refs(self, p):
        for d in extract_refs(p):
            if not d["ref_id"].startswith("pwc-"):
                key = d["paper_arxiv_id"] + d["ref_id"]
                if key not in self.cache:
                    new_id = self.refsdb.add_reference_string(d['ref_str'])
                    if new_id is not None:
                        new_id = "pwc-" + new_id
                    self.cache[key] = new_id
                # truncate overlong ids to keep them compatible with elastic
                if self.cache[key] and len(self.cache[key]) > ID_LIMIT:
                    self.cache[key] = self.cache[key][:ID_LIMIT]
                yield d["ref_id"], self.cache[key]
        self.refsdb.sync()

    def update_references(self, pc):
        def update_paper(p_idx):
            p = pc[p_idx]
            for old_ref_id, new_ref_id in self.parse_refs(p):
                if new_ref_id is not None:
                    for f in p.text.fragments:
                        f.text = f.text.replace(old_ref_id, new_ref_id)

        Parallel(n_jobs=8, require='sharedmem')(
            delayed(update_paper)(p_idx) for p_idx in progress_bar(range(len(pc))))

    def update_references_pickle(self, data_pkl_path="/mnt/efs/pwc/data/pc-small-noann.pkl"):
        print("Loading pickle", data_pkl_path)
        pc = PaperCollection.from_pickle(data_pkl_path)
        self.update_references(pc)
        print()
        print("Saving pickle", data_pkl_path)
        pc.to_pickle(data_pkl_path)
        return pc


def main(data_pkl_path="/home/ubuntu/data/pc2.pkl"):
    with PaperCollectionReferenceParser() as worker:
        worker.update_references_pickle(data_pkl_path)
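
The anchor format that `_ref_re` expects is easiest to see on a concrete string. A minimal, self-contained sketch follows; the sample lines are invented for illustration (real fragments carry the `xxanchor-` markers produced by the HTML conversion), and the stdlib `re` module suffices here since the pattern uses no `regex`-specific features:

import re

_ref_re = re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')

samples = [
    "xxanchor-bib xxanchor-vaswani2017 Vaswani et al. Attention is all you need. NIPS 2017.",
    "xxanchor-devlin2018 Devlin et al. BERT: Pre-training of deep bidirectional transformers.",
    "This line has no anchor and is skipped.",
]

for line in samples:
    m = _ref_re.match(line)
    if m:
        ref_id, ref_str = m.groups()
        print(f"{ref_id!r} -> {ref_str.strip()!r}")
# 'vaswani2017' -> 'Vaswani et al. Attention is all you need. NIPS 2017.'
# 'devlin2018' -> 'Devlin et al. BERT: Pre-training of deep bidirectional transformers.'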

sota_extractor2/config.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 elastic = dict(hosts=['localhost'], timeout=20)
-
+grobid = dict(host='10.0.1.145')

 arxiv = data/'arxiv'
 htmls_raw = arxiv/'htmls'
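
The new `grobid` entry only stores a host. A hedged sketch of how it might be consumed, assuming a standard GROBID service on its default port 8070; the `/api/processCitation` endpoint and `citations` form field are GROBID's public REST API, not code from this repo:

import requests

from sota_extractor2.config import grobid

def parse_citation(ref_str, timeout=30):
    """Send a raw reference string to GROBID and return its TEI XML parse."""
    url = f"http://{grobid['host']}:8070/api/processCitation"
    resp = requests.post(url, data={"citations": ref_str}, timeout=timeout)
    resp.raise_for_status()
    return resp.text  # a TEI <biblStruct> describing the parsed reference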

sota_extractor2/data/elastic.py

Lines changed: 64 additions & 2 deletions
@@ -1,9 +1,10 @@
 from bs4 import BeautifulSoup
 import pandas as pd
 import re
+from dataclasses import asdict

 from elasticsearch_dsl import Document, Boolean, Object, \
-    analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
+    analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter, Date
 from elasticsearch_dsl.serializer import serializer

 from IPython.display import display, Markdown
@@ -132,7 +133,7 @@ def __repr__(self):
 class Paper(Document):
     title = Text()
-    authors = Keyword()
+    authors = Keyword()  # TODO: change this to Text(), otherwise we can't search on this field
     abstract = Text(
         analyzer=html_strip
     )
@@ -295,6 +296,67 @@ class Index:
     def __repr__(self):
         return f"{self.title} / {self.authors}"

+ID_LIMIT = 480
+
+
+class Reference2(Document):
+    title = Text()
+    authors = Text()
+
+    idno = Keyword()
+    date = Date()
+    ptr = Keyword()
+
+    arxiv_id = Keyword()
+    orig_refs = Text()
+
+    class Index:
+        name = 'references2'
+
+    def add_ref(self, ref):
+        # if not hasattr(self, 'refs'):
+        #     self.refs = []
+        # self.refs.append(asdict(ref))
+        if ref.arxiv_id:
+            self.arxiv_id = ref.arxiv_id
+        if ref.idno:
+            if hasattr(ref.idno, 'values'):
+                # keep the last http(s) identifier, or None if there is none
+                self.idno = ([None] + [v for v in ref.idno.values() if v.startswith("http")]).pop()
+            elif isinstance(ref.idno, str):
+                self.idno = ref.idno
+        # if ref.date:
+        #     self.date = ref.date
+        self.date = None
+        if ref.ptr:
+            self.ptr = ref.ptr
+        self.orig_refs = self.orig_refs if self.orig_refs else []
+        self.orig_refs.append(ref.orig_ref)
+        self.orig_refs = list(set(self.orig_refs))
+
+        # TODO: update authors
+        # titles = Counter([norm_title] + [normalize_title(ref.title) for ref in merged])
+        # norm_title = titles.most_common(1)[0][0]
+
+    @property
+    def stable_id(self):
+        return self.meta.id
+
+    def unique_id(self):
+        return self.meta.id
+
+    @classmethod
+    def from_ref(cls, ref):
+        # title = ref.title
+        # first_author = ref.authors[0].short() if len(ref.authors) > 0 else "unknown"
+        # TODO: figure out what to do here so stable_id is recoverable and has no collisions
+        # stable_id = first_author + "-" + normalize_title(until_first_nonalphanumeric(title))[:50]
+        stable_id = ref.unique_id()[:ID_LIMIT]
+
+        self = cls(meta={"id": stable_id},
+                   title=ref.title,
+                   authors=[asdict(a) for a in ref.authors if a is not None])
+
+        return self
+
 #
 # arxiv = Path('data/arxiv')
 # html = arxiv/'html'
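
The `idno` selection in `add_ref` relies on a small idiom: prepending `None` before `.pop()` returns the last http(s) identifier without raising `IndexError` when none is found. A toy demonstration, with invented values:

def pick_idno(idno_dict):
    # last http(s) identifier in insertion order, or None if there is none
    return ([None] + [v for v in idno_dict.values() if v.startswith("http")]).pop()

assert pick_idno({"doi": "10.1000/x", "url": "https://example.org/paper"}) == "https://example.org/paper"
assert pick_idno({"issn": "1234-5678"}) is None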

sota_extractor2/data/paper_collection.py

Lines changed: 7 additions & 2 deletions
@@ -151,5 +151,10 @@ def to_pickle(self, path):
     @classmethod
     def from_pickle(cls, path):
-        with open(path, "rb") as f:
-            return pickle.load(f)
+        import gc
+        try:
+            # disabling the cyclic GC greatly speeds up unpickling of large collections
+            gc.disable()
+            with open(path, "rb") as f:
+                return pickle.load(f)
+        finally:
+            gc.enable()
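
Wrapping `pickle.load` in `gc.disable()`/`gc.enable()` is a common speed-up: unpickling a large collection allocates many container objects, and each allocation can trigger CPython's cyclic garbage collector. A minimal reusable sketch of the same pattern, not part of this repo:

import gc
import pickle
from contextlib import contextmanager

@contextmanager
def gc_disabled():
    # remember the prior state so nested use doesn't re-enable GC prematurely
    was_enabled = gc.isenabled()
    gc.disable()
    try:
        yield
    finally:
        if was_enabled:
            gc.enable()

def fast_unpickle(path):
    with gc_disabled(), open(path, "rb") as f:
        return pickle.load(f)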
