@@ -82,7 +82,6 @@ def close(self):
82
82
self .cache .close ()
83
83
self .cache = None
84
84
85
- grobidclient = GrobidClient (** config .grobid )
86
85
87
86
def pop_first (dictionary , * path ):
88
87
if dictionary is None :
@@ -124,6 +123,14 @@ def from_tei_dict(cls, d):
124
123
warn (f"{ err } - Unable to parse { d } as Author" )
125
124
print (d )
126
125
126
@classmethod
def from_fullname(cls, fullname):
    """Build an instance from a whitespace-separated full name.

    The last whitespace-delimited token becomes ``surname``; every token
    before it, in order, becomes the ``forenames`` tuple (an empty tuple
    when the name consists of a single token).

    Args:
        fullname: the name as one string, e.g. ``"Ada King Lovelace"``.

    Returns:
        ``cls(forenames=<tuple of str>, surname=<str>)``.

    Raises:
        ValueError: if ``fullname`` contains no non-whitespace characters.
    """
    names = fullname.split()
    if not names:
        # Without this guard the surname lookup fails with a bare
        # "list index out of range", which hides the real problem.
        raise ValueError(f"cannot build a name from an empty string: {fullname!r}")
    *forenames, surname = names
    return cls(forenames=tuple(forenames), surname=surname)
133
+
127
134
# Debug representation: the surname, a ';' separator, then all forenames
# joined with ', ', the whole thing wrapped in double quotes.
# NOTE(review): the bare numbers between the statements are line-number
# residue from the diff view, and the spaces around ';' and before the closing
# quote inside the f-string may be extraction artefacts as well - confirm the
# exact literal against the real file before relying on the output format.
def __repr__ (self ):
128
135
fnames = ', ' .join (self .forenames )
129
136
return f'"{ self .surname } ; { fnames } "'
@@ -156,11 +163,6 @@ def extract_arxivid(ref_str):
156
163
ref_str = ref_str [:b ] + " " + ref_str [e :]
157
164
return ref_str , arxiv_id
158
165
159
- with Path ('/mnt/efs/pwc/data/ref-names.json' ).open () as f :
160
- preloaded_surnames_db = json .load (f )
161
-
162
- def is_surname (word ):
163
- return word in preloaded_surnames_db
164
166
165
167
def is_publication_venue(word):
    """Tell whether ``word`` names a known publication venue.

    The word is lower-cased before the lookup, so matching is
    case-insensitive as long as ``conferences`` (defined elsewhere in this
    module) holds lower-case entries.
    """
    lowered = word.lower()
    return lowered in conferences
@@ -203,7 +205,7 @@ def score_sent(part, idx):
203
205
204
206
title = max (scores )[1 ]
205
207
206
- title = strip_conferences (title )
208
+ # title = strip_conferences(title)
207
209
title = title .rstrip (' .' )
208
210
return title
209
211
@@ -221,6 +223,7 @@ class PReference:
221
223
orig_ref : str = field (repr = False , default_factory = lambda :None )
222
224
223
225
arxiv_id : str = None
226
+ pwc_slug : str = None
224
227
225
228
def unique_id (self ):
226
229
if not self .title :
@@ -257,12 +260,12 @@ def from_tei_dict(cls, citation, **kwargs):
257
260
)
258
261
259
262
@classmethod
260
- def parse_ref_str (cls , ref_str , orig_key = None , is_surname = is_surname , is_publication_venue = is_publication_venue ):
263
+ def parse_ref_str (cls , ref_str , grobid_client , orig_key = None , is_surname = None , is_publication_venue = is_publication_venue ):
261
264
try :
262
265
clean_ref_str = strip_latex_artefacts (ref_str )
263
266
clean_ref_str = strip_anchor (clean_ref_str )
264
267
clean_ref_str , arxiv_id = extract_arxivid (clean_ref_str )
265
- d = grobidclient .parse_ref_str_to_tei_dict (clean_ref_str )
268
+ d = grobid_client .parse_ref_str_to_tei_dict (clean_ref_str )
266
269
ref = cls .from_tei_dict (d , orig_ref = ref_str , arxiv_id = arxiv_id )
267
270
ref .orig_key = orig_key
268
271
@@ -278,13 +281,19 @@ def until_first_nonalphanumeric(string):
278
281
return nonalphanumeric_re .split (string )[0 ]
279
282
280
283
class ReferenceStore :
281
def __init__(self, grobid_client, surnames_path='/mnt/efs/pwc/data/ref-names.json'):
    """Create an empty reference store.

    Args:
        grobid_client: client object kept on ``self.grobid_client`` and
            handed to the reference parser when strings are added.
        surnames_path: location of the JSON file with known surnames,
            loaded immediately through ``_load_surnames``.

    Raises:
        Whatever ``_load_surnames`` raises when the file cannot be read
        or decoded.
    """
    self.grobid_client = grobid_client
    self.refdb = {}
    self.tosync = []
    # ``int`` yields the same 0 default as ``lambda: 0`` but, unlike a
    # lambda, keeps the counter picklable.
    self.surnames_db = defaultdict(int)
    self._load_surnames(surnames_path)
290
+
291
def _load_surnames(self, path):
    """Read the known-surnames JSON file into ``self.preloaded_surnames_db``.

    The decoded JSON value is stored as-is; ``is_surname`` later tests
    membership against it.

    Args:
        path: filesystem path (``str`` or path-like) of the JSON file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Surnames routinely contain non-ASCII letters; read the file as UTF-8
    # explicitly instead of depending on the platform's locale encoding.
    with Path(path).open(encoding='utf-8') as f:
        self.preloaded_surnames_db = json.load(f)
285
294
286
295
def is_surname(self, word):
    """Return True when ``word`` is present in the preloaded surname data.

    Only ``self.preloaded_surnames_db`` is consulted; the fallback on the
    learned counter (``self.surnames_db[word] > 5``) is currently disabled.
    """
    known_surnames = self.preloaded_surnames_db
    return word in known_surnames
288
297
289
298
def get_reference (self , key ):
290
299
if key not in self .refdb :
@@ -305,7 +314,7 @@ def add_or_merge(self, ref):
305
314
return self .refdb [curr_uid ].stable_id
306
315
307
316
def add_reference_string (self , ref_str ):
308
- ref = PReference .parse_ref_str (ref_str , self .is_surname )
317
+ ref = PReference .parse_ref_str (ref_str , self .grobid_client , is_surname = self . is_surname )
309
318
if ref is None or ref .unique_id () is None :
310
319
for r in Reference2 .search ().query ('match' , orig_refs = ref_str )[:10 ]:
311
320
if r .stable_id in normalize_title (ref_str ):
0 commit comments