
Commit acdd321

Merge pull request #4 from paperswithcode/references
References
2 parents 9fc128a + 6b85869 commit acdd321

File tree

5 files changed: +498 −5 lines changed

parse_references.py

Lines changed: 90 additions & 0 deletions
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import regex
import diskcache

from pathlib import Path
from joblib import Parallel, delayed

from fastai.text import progress_bar

from sota_extractor2.data.references import *
from functools import lru_cache
from sota_extractor2.data.elastic import *

from sota_extractor2.data.paper_collection import PaperCollection

connections.create_connection(hosts=['10.0.1.145'], timeout=20)

pc = PaperCollection.from_pickle("/mnt/efs/pwc/data/pc-small-noann.pkl")


def get_refstrings(p):
    paper = p.text if hasattr(p, 'text') else p
    if not hasattr(paper, 'fragments'):
        return
    fragments = paper.fragments
    ref_sec_started = False
    for f in reversed(fragments):
        if f.header.startswith('xxanchor-bib'):
            ref_sec_started = True
            yield f.text
        elif ref_sec_started:
            break  # the reference section appears only at the end of a paper


_ref_re = regex.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')


def extract_refs(p):
    for ref in get_refstrings(p):
        m = _ref_re.match(ref)
        if m:
            ref_id, ref_str = m.groups()
            yield {
                "paper_arxiv_id": p.arxiv_no_version,
                "ref_id": ref_id,
                "ref_str": ref_str.strip(),
            }


class PaperCollectionReferenceParser:
    def __init__(self):
        self.refsdb = ReferenceStore()
        self.cache = diskcache.Cache(Path.home() / '.cache' / 'refs' / 'refs_ids.db')

    # context-manager support, so main() below can release the cache cleanly
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cache.close()

    def parse_refs(self, p):
        for d in extract_refs(p):
            if not d["ref_id"].startswith("pwc-"):
                key = d["paper_arxiv_id"] + d["ref_id"]
                if key not in self.cache:
                    new_id = self.refsdb.add_reference_string(d['ref_str'])
                    if new_id is not None:
                        new_id = "pwc-" + new_id
                    self.cache[key] = new_id
                # truncate overlong ids to keep them compatible with elastic
                if self.cache[key] and len(self.cache[key]) > ID_LIMIT:
                    self.cache[key] = self.cache[key][:ID_LIMIT]
                yield d["ref_id"], self.cache[key]
        self.refsdb.sync()

    def update_references(self, pc):
        def update_paper(p_idx):
            p = pc[p_idx]
            for old_ref_id, new_ref_id in self.parse_refs(p):
                if new_ref_id is not None:
                    for f in p.text.fragments:
                        f.text = f.text.replace(old_ref_id, new_ref_id)

        Parallel(n_jobs=8, require='sharedmem')(
            delayed(update_paper)(p_idx) for p_idx in progress_bar(range(len(pc))))

    def update_references_pickle(self, data_pkl_path="/mnt/efs/pwc/data/pc-small-noann.pkl"):
        print("Loading pickle", data_pkl_path)
        pc = PaperCollection.from_pickle(data_pkl_path)
        self.update_references(pc)
        print()
        print("Saving pickle", data_pkl_path)
        pc.to_pickle(data_pkl_path)
        return pc


def main(data_pkl_path="/home/ubuntu/data/pc2.pkl"):
    with PaperCollectionReferenceParser() as worker:
        worker.update_references_pickle(data_pkl_path)
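
The anchor format that `_ref_re` expects is easiest to see on a concrete string. A minimal, self-contained sketch follows; the sample lines are invented for illustration (real fragments carry the `xxanchor-` markers produced by the HTML conversion), and the stdlib `re` module suffices here since the pattern uses no `regex`-specific features:

import re

_ref_re = re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')

samples = [
    "xxanchor-bib xxanchor-vaswani2017 Vaswani et al. Attention is all you need. NIPS 2017.",
    "xxanchor-devlin2018 Devlin et al. BERT: Pre-training of deep bidirectional transformers.",
    "This line has no anchor and is skipped.",
]

for line in samples:
    m = _ref_re.match(line)
    if m:
        ref_id, ref_str = m.groups()
        print(f"{ref_id!r} -> {ref_str.strip()!r}")
# 'vaswani2017' -> 'Vaswani et al. Attention is all you need. NIPS 2017.'
# 'devlin2018' -> 'Devlin et al. BERT: Pre-training of deep bidirectional transformers.'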

sota_extractor2/config.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 elastic = dict(hosts=['localhost'], timeout=20)
-
+grobid = dict(host='10.0.1.145')

 arxiv = data/'arxiv'
 htmls_raw = arxiv/'htmls'
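
The new `grobid` entry only stores a host. A hedged sketch of how it might be consumed, assuming a standard GROBID service on its default port 8070; the `/api/processCitation` endpoint and `citations` form field are GROBID's public REST API, not code from this repo:

import requests

from sota_extractor2.config import grobid

def parse_citation(ref_str, timeout=30):
    """Send a raw reference string to GROBID and return its TEI XML parse."""
    url = f"http://{grobid['host']}:8070/api/processCitation"
    resp = requests.post(url, data={"citations": ref_str}, timeout=timeout)
    resp.raise_for_status()
    return resp.text  # a TEI <biblStruct> describing the parsed reference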

sota_extractor2/data/elastic.py

Lines changed: 64 additions & 2 deletions
@@ -1,9 +1,10 @@
 from bs4 import BeautifulSoup
 import pandas as pd
 import re
+from dataclasses import asdict

 from elasticsearch_dsl import Document, Boolean, Object, \
-    analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
+    analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter, Date
 from elasticsearch_dsl.serializer import serializer

 from IPython.display import display, Markdown
@@ -132,7 +133,7 @@ def __repr__(self):
 class Paper(Document):
     title = Text()
-    authors = Keyword()
+    authors = Keyword()  # TODO: change this to Text(), otherwise we can't search on this field
     abstract = Text(
         analyzer=html_strip
     )
@@ -295,6 +296,67 @@ class Index:
     def __repr__(self):
         return f"{self.title} / {self.authors}"

+ID_LIMIT = 480
+
+
+class Reference2(Document):
+    title = Text()
+    authors = Text()
+
+    idno = Keyword()
+    date = Date()
+    ptr = Keyword()
+
+    arxiv_id = Keyword()
+    orig_refs = Text()
+
+    class Index:
+        name = 'references2'
+
+    def add_ref(self, ref):
+        # if not hasattr(self, 'refs'):
+        #     self.refs = []
+        # self.refs.append(asdict(ref))
+        if ref.arxiv_id:
+            self.arxiv_id = ref.arxiv_id
+        if ref.idno:
+            if hasattr(ref.idno, 'values'):
+                # keep the last http(s) identifier, or None if there is none
+                self.idno = ([None] + [v for v in ref.idno.values() if v.startswith("http")]).pop()
+            elif isinstance(ref.idno, str):
+                self.idno = ref.idno
+        # if ref.date:
+        #     self.date = ref.date
+        self.date = None
+        if ref.ptr:
+            self.ptr = ref.ptr
+        self.orig_refs = self.orig_refs if self.orig_refs else []
+        self.orig_refs.append(ref.orig_ref)
+        self.orig_refs = list(set(self.orig_refs))
+
+        # TODO: update authors
+        # titles = Counter([norm_title] + [normalize_title(ref.title) for ref in merged])
+        # norm_title = titles.most_common(1)[0][0]
+
+    @property
+    def stable_id(self):
+        return self.meta.id
+
+    def unique_id(self):
+        return self.meta.id
+
+    @classmethod
+    def from_ref(cls, ref):
+        # title = ref.title
+        # first_author = ref.authors[0].short() if len(ref.authors) > 0 else "unknown"
+        # TODO: figure out what to do here so stable_id is recoverable and has no collisions
+        # stable_id = first_author + "-" + normalize_title(until_first_nonalphanumeric(title))[:50]
+        stable_id = ref.unique_id()[:ID_LIMIT]
+
+        self = cls(meta={"id": stable_id},
+                   title=ref.title,
+                   authors=[asdict(a) for a in ref.authors if a is not None])
+
+        return self
+
 #
 # arxiv = Path('data/arxiv')
 # html = arxiv/'html'
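
The `idno` selection in `add_ref` relies on a small idiom: prepending `None` before `.pop()` returns the last http(s) identifier without raising `IndexError` when none is found. A toy demonstration, with invented values:

def pick_idno(idno_dict):
    # last http(s) identifier in insertion order, or None if there is none
    return ([None] + [v for v in idno_dict.values() if v.startswith("http")]).pop()

assert pick_idno({"doi": "10.1000/x", "url": "https://example.org/paper"}) == "https://example.org/paper"
assert pick_idno({"issn": "1234-5678"}) is None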

sota_extractor2/data/paper_collection.py

Lines changed: 7 additions & 2 deletions
@@ -151,5 +151,10 @@ def to_pickle(self, path):
     @classmethod
     def from_pickle(cls, path):
-        with open(path, "rb") as f:
-            return pickle.load(f)
+        import gc
+        try:
+            # disabling the cyclic GC greatly speeds up unpickling of large collections
+            gc.disable()
+            with open(path, "rb") as f:
+                return pickle.load(f)
+        finally:
+            gc.enable()
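
Wrapping `pickle.load` in `gc.disable()`/`gc.enable()` is a common speed-up: unpickling a large collection allocates many container objects, and each allocation can trigger CPython's cyclic garbage collector. A minimal reusable sketch of the same pattern, not part of this repo:

import gc
import pickle
from contextlib import contextmanager

@contextmanager
def gc_disabled():
    # remember the prior state so nested use doesn't re-enable GC prematurely
    was_enabled = gc.isenabled()
    gc.disable()
    try:
        yield
    finally:
        if was_enabled:
            gc.enable()

def fast_unpickle(path):
    with gc_disabled(), open(path, "rb") as f:
        return pickle.load(f)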
