Skip to content

Commit 0b308c2

Browse files
author
Marcin Kardas
committed
Small adaptations
1 parent d8caebf commit 0b308c2

File tree

4 files changed

+28
-15
lines changed

4 files changed

+28
-15
lines changed

parse_references.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ def get_refstrings(p):
2828
ref_sec_started = True
2929
yield f.text
3030
elif ref_sec_started:
31+
# todo: check if a paper can have multiple bibliography sections
32+
# (e.g., one in the main paper and one in the appendix)
3133
break # the refsection is only at the end of paper
3234

3335

@@ -58,7 +60,7 @@ def parse_refs(self, p):
5860
if new_id is not None:
5961
new_id = "pwc-" + new_id
6062
self.cache[key] = new_id
61-
if self.cache[key] and len(self.cache[key]) > 500: # fix to self.cache to make the id compatible with elastic
63+
if self.cache[key] and len(self.cache[key]) > ID_LIMIT: # fix to self.cache to make the id compatible with elastic
6264
self.cache[key] = self.cache[key][:ID_LIMIT]
6365
yield d["ref_id"], self.cache[key]
6466
self.refsdb.sync()

sota_extractor2/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515

1616
elastic = dict(hosts=['localhost'], timeout=20)
17-
grobid = dict(host='10.0.1.145')
17+
grobid = dict(host='grobid')
1818

1919
arxiv = data/'arxiv'
2020
htmls_raw = arxiv/'htmls'

sota_extractor2/data/elastic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ class Reference2(Document):
307307
ptr = Keyword()
308308

309309
arxiv_id = Keyword()
310+
pwc_slug = Keyword()
310311
orig_refs = Text()
311312

312313
class Index:
@@ -318,14 +319,15 @@ def add_ref(self, ref):
318319
# self.refs.append(asdict(ref))
319320
if ref.arxiv_id:
320321
self.arxiv_id = ref.arxiv_id
322+
if ref.pwc_slug:
323+
self.pwc_slug = ref.pwc_slug
321324
if ref.idno:
322325
if hasattr(ref.idno, 'values'):
323326
self.idno = ([None]+[v for v in ref.idno.values() if v.startswith("http")]).pop()
324327
elif isinstance(ref.idno, str):
325328
self.idno = ref.idno
326329
# if ref.date:
327330
# self.date = ref.date
328-
self.date = None
329331
if ref.ptr:
330332
self.ptr = ref.ptr
331333
self.orig_refs = self.orig_refs if self.orig_refs else []

sota_extractor2/data/references.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ def close(self):
8282
self.cache.close()
8383
self.cache = None
8484

85-
grobidclient=GrobidClient(**config.grobid)
8685

8786
def pop_first(dictionary, *path):
8887
if dictionary is None:
@@ -124,6 +123,14 @@ def from_tei_dict(cls, d):
124123
warn(f"{err} - Unable to parse {d} as Author")
125124
print(d)
126125

126+
@classmethod
127+
def from_fullname(cls, fullname):
128+
names = fullname.split()
129+
return cls(
130+
forenames=tuple(names[:-1]),
131+
surname=names[-1]
132+
)
133+
127134
def __repr__(self):
128135
fnames = ', '.join(self.forenames)
129136
return f'"{self.surname}; {fnames}"'
@@ -156,11 +163,6 @@ def extract_arxivid(ref_str):
156163
ref_str = ref_str[:b] + " " +ref_str[e:]
157164
return ref_str, arxiv_id
158165

159-
with Path('/mnt/efs/pwc/data/ref-names.json').open() as f:
160-
preloaded_surnames_db = json.load(f)
161-
162-
def is_surname(word):
163-
return word in preloaded_surnames_db
164166

165167
def is_publication_venue(word):
166168
return word.lower() in conferences
@@ -203,7 +205,7 @@ def score_sent(part, idx):
203205

204206
title = max(scores)[1]
205207

206-
title = strip_conferences(title)
208+
# title = strip_conferences(title)
207209
title = title.rstrip(' .')
208210
return title
209211

@@ -221,6 +223,7 @@ class PReference:
221223
orig_ref: str = field(repr=False, default_factory=lambda:None)
222224

223225
arxiv_id: str = None
226+
pwc_slug: str = None
224227

225228
def unique_id(self):
226229
if not self.title:
@@ -257,12 +260,12 @@ def from_tei_dict(cls, citation, **kwargs):
257260
)
258261

259262
@classmethod
260-
def parse_ref_str(cls, ref_str, orig_key=None, is_surname=is_surname, is_publication_venue=is_publication_venue):
263+
def parse_ref_str(cls, ref_str, grobid_client, orig_key=None, is_surname=None, is_publication_venue=is_publication_venue):
261264
try:
262265
clean_ref_str = strip_latex_artefacts(ref_str)
263266
clean_ref_str = strip_anchor(clean_ref_str)
264267
clean_ref_str, arxiv_id = extract_arxivid(clean_ref_str)
265-
d = grobidclient.parse_ref_str_to_tei_dict(clean_ref_str)
268+
d = grobid_client.parse_ref_str_to_tei_dict(clean_ref_str)
266269
ref = cls.from_tei_dict(d, orig_ref=ref_str, arxiv_id=arxiv_id)
267270
ref.orig_key = orig_key
268271

@@ -278,13 +281,19 @@ def until_first_nonalphanumeric(string):
278281
return nonalphanumeric_re.split(string)[0]
279282

280283
class ReferenceStore:
281-
def __init__(self):
284+
def __init__(self, grobid_client, surnames_path='/mnt/efs/pwc/data/ref-names.json'):
285+
self.grobid_client = grobid_client
282286
self.refdb = {}
283287
self.tosync = []
284288
self.surnames_db = defaultdict(lambda: 0)
289+
self._load_surnames(surnames_path)
290+
291+
def _load_surnames(self, path):
292+
with Path(path).open() as f:
293+
self.preloaded_surnames_db = json.load(f)
285294

286295
def is_surname(self, word):
287-
return is_surname(word) or self.surnames_db[word] > 5
296+
return word in self.preloaded_surnames_db #or self.surnames_db[word] > 5
288297

289298
def get_reference(self, key):
290299
if key not in self.refdb:
@@ -305,7 +314,7 @@ def add_or_merge(self, ref):
305314
return self.refdb[curr_uid].stable_id
306315

307316
def add_reference_string(self, ref_str):
308-
ref = PReference.parse_ref_str(ref_str, self.is_surname)
317+
ref = PReference.parse_ref_str(ref_str, self.grobid_client, is_surname=self.is_surname)
309318
if ref is None or ref.unique_id() is None:
310319
for r in Reference2.search().query('match', orig_refs=ref_str)[:10]:
311320
if r.stable_id in normalize_title(ref_str):

0 commit comments

Comments
 (0)