Skip to content

Commit 0b308c2

Browse files
author
Marcin Kardas
committed
Small adaptations
1 parent d8caebf commit 0b308c2

File tree

4 files changed

+28
-15
lines changed

4 files changed

+28
-15
lines changed

parse_references.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ def get_refstrings(p):
2828
ref_sec_started = True
2929
yield f.text
3030
elif ref_sec_started:
31+
# todo: check if a paper can have multiple bibliography sections
32+
# (e.g., one in the main paper and one in the appendix)
3133
break # the refsection is only at the end of paper
3234

3335

@@ -58,7 +60,7 @@ def parse_refs(self, p):
5860
if new_id is not None:
5961
new_id = "pwc-" + new_id
6062
self.cache[key] = new_id
61-
if self.cache[key] and len(self.cache[key]) > 500: # fix to self.cache to make the id compatible with elastic
63+
if self.cache[key] and len(self.cache[key]) > ID_LIMIT: # fix to self.cache to make the id compatible with elastic
6264
self.cache[key] = self.cache[key][:ID_LIMIT]
6365
yield d["ref_id"], self.cache[key]
6466
self.refsdb.sync()

sota_extractor2/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515

1616
elastic = dict(hosts=['localhost'], timeout=20)
17-
grobid = dict(host='10.0.1.145')
17+
grobid = dict(host='grobid')
1818

1919
arxiv = data/'arxiv'
2020
htmls_raw = arxiv/'htmls'

sota_extractor2/data/elastic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ class Reference2(Document):
307307
ptr = Keyword()
308308

309309
arxiv_id = Keyword()
310+
pwc_slug = Keyword()
310311
orig_refs = Text()
311312

312313
class Index:
@@ -318,14 +319,15 @@ def add_ref(self, ref):
318319
# self.refs.append(asdict(ref))
319320
if ref.arxiv_id:
320321
self.arxiv_id = ref.arxiv_id
322+
if ref.pwc_slug:
323+
self.pwc_slug = ref.pwc_slug
321324
if ref.idno:
322325
if hasattr(ref.idno, 'values'):
323326
self.idno = ([None]+[v for v in ref.idno.values() if v.startswith("http")]).pop()
324327
elif isinstance(ref.idno, str):
325328
self.idno = ref.idno
326329
# if ref.date:
327330
# self.date = ref.date
328-
self.date = None
329331
if ref.ptr:
330332
self.ptr = ref.ptr
331333
self.orig_refs = self.orig_refs if self.orig_refs else []

sota_extractor2/data/references.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ def close(self):
8282
self.cache.close()
8383
self.cache = None
8484

85-
grobidclient=GrobidClient(**config.grobid)
8685

8786
def pop_first(dictionary, *path):
8887
if dictionary is None:
@@ -124,6 +123,14 @@ def from_tei_dict(cls, d):
124123
warn(f"{err} - Unable to parse {d} as Author")
125124
print(d)
126125

126+
@classmethod
127+
def from_fullname(cls, fullname):
128+
names = fullname.split()
129+
return cls(
130+
forenames=tuple(names[:-1]),
131+
surname=names[-1]
132+
)
133+
127134
def __repr__(self):
128135
fnames = ', '.join(self.forenames)
129136
return f'"{self.surname}; {fnames}"'
@@ -156,11 +163,6 @@ def extract_arxivid(ref_str):
156163
ref_str = ref_str[:b] + " " +ref_str[e:]
157164
return ref_str, arxiv_id
158165

159-
with Path('/mnt/efs/pwc/data/ref-names.json').open() as f:
160-
preloaded_surnames_db = json.load(f)
161-
162-
def is_surname(word):
163-
return word in preloaded_surnames_db
164166

165167
def is_publication_venue(word):
166168
return word.lower() in conferences
@@ -203,7 +205,7 @@ def score_sent(part, idx):
203205

204206
title = max(scores)[1]
205207

206-
title = strip_conferences(title)
208+
# title = strip_conferences(title)
207209
title = title.rstrip(' .')
208210
return title
209211

@@ -221,6 +223,7 @@ class PReference:
221223
orig_ref: str = field(repr=False, default_factory=lambda:None)
222224

223225
arxiv_id: str = None
226+
pwc_slug: str = None
224227

225228
def unique_id(self):
226229
if not self.title:
@@ -257,12 +260,12 @@ def from_tei_dict(cls, citation, **kwargs):
257260
)
258261

259262
@classmethod
260-
def parse_ref_str(cls, ref_str, orig_key=None, is_surname=is_surname, is_publication_venue=is_publication_venue):
263+
def parse_ref_str(cls, ref_str, grobid_client, orig_key=None, is_surname=None, is_publication_venue=is_publication_venue):
261264
try:
262265
clean_ref_str = strip_latex_artefacts(ref_str)
263266
clean_ref_str = strip_anchor(clean_ref_str)
264267
clean_ref_str, arxiv_id = extract_arxivid(clean_ref_str)
265-
d = grobidclient.parse_ref_str_to_tei_dict(clean_ref_str)
268+
d = grobid_client.parse_ref_str_to_tei_dict(clean_ref_str)
266269
ref = cls.from_tei_dict(d, orig_ref=ref_str, arxiv_id=arxiv_id)
267270
ref.orig_key = orig_key
268271

@@ -278,13 +281,19 @@ def until_first_nonalphanumeric(string):
278281
return nonalphanumeric_re.split(string)[0]
279282

280283
class ReferenceStore:
281-
def __init__(self):
284+
def __init__(self, grobid_client, surnames_path='/mnt/efs/pwc/data/ref-names.json'):
285+
self.grobid_client = grobid_client
282286
self.refdb = {}
283287
self.tosync = []
284288
self.surnames_db = defaultdict(lambda: 0)
289+
self._load_surnames(surnames_path)
290+
291+
def _load_surnames(self, path):
292+
with Path(path).open() as f:
293+
self.preloaded_surnames_db = json.load(f)
285294

286295
def is_surname(self, word):
287-
return is_surname(word) or self.surnames_db[word] > 5
296+
return word in self.preloaded_surnames_db #or self.surnames_db[word] > 5
288297

289298
def get_reference(self, key):
290299
if key not in self.refdb:
@@ -305,7 +314,7 @@ def add_or_merge(self, ref):
305314
return self.refdb[curr_uid].stable_id
306315

307316
def add_reference_string(self, ref_str):
308-
ref = PReference.parse_ref_str(ref_str, self.is_surname)
317+
ref = PReference.parse_ref_str(ref_str, self.grobid_client, is_surname=self.is_surname)
309318
if ref is None or ref.unique_id() is None:
310319
for r in Reference2.search().query('match', orig_refs=ref_str)[:10]:
311320
if r.stable_id in normalize_title(ref_str):

0 commit comments

Comments
 (0)