@@ -94,7 +94,7 @@ def __init__(
94
94
self .use_tags = use_tags
95
95
self .full_paragraph = full_paragraph
96
96
self .re_tokenizer = re .compile (r"[\w']+|[^\w ]" )
97
- self .not_found_str = "not in wiki "
97
+ self .not_found_str = "not_in_wiki "
98
98
99
99
self .load ()
100
100
@@ -277,27 +277,31 @@ def process_cand_ent(self, cand_ent_init, entities_and_ids, entity_substr_split,
277
277
cand_ent_init [cand_entity_id ].add ((substr_score , cand_entity_rels ))
278
278
return cand_ent_init
279
279
280
def find_title(self, entity_substr):
    """Return the rows of ``inverted_index`` whose ``title`` matches *entity_substr*.

    Args:
        entity_substr: text used as the right-hand operand of the ``MATCH``
            operator on the ``title`` column.

    Returns:
        The list produced by ``fetchall()`` for the query, or an empty list
        when the query raises ``sqlite3.OperationalError``.
    """
    entities_and_ids = []
    try:
        # Bind the text as a parameter rather than formatting it into the SQL:
        # an apostrophe in the substring would otherwise terminate the string
        # literal early and corrupt (or inject into) the statement.
        res = self.cur.execute("SELECT * FROM inverted_index WHERE title MATCH ?;", (entity_substr,))
        entities_and_ids = res.fetchall()
    except sqlite3.OperationalError as e:
        # Best-effort lookup: a text the MATCH operator rejects simply yields no candidates.
        log.debug(f"error in searching an entity {e}")
    return entities_and_ids
288
+
280
289
def find_exact_match (self , entity_substr , tag ):
281
290
entity_substr_split = entity_substr .split ()
282
291
cand_ent_init = defaultdict (set )
283
- res = self .cur .execute ("SELECT * FROM inverted_index WHERE title MATCH '{}';" .format (entity_substr ))
284
- entities_and_ids = res .fetchall ()
292
+ entities_and_ids = self .find_title (entity_substr )
285
293
if entities_and_ids :
286
294
cand_ent_init = self .process_cand_ent (cand_ent_init , entities_and_ids , entity_substr_split , tag )
287
295
if entity_substr .startswith ("the " ):
288
296
entity_substr = entity_substr .split ("the " )[1 ]
289
297
entity_substr_split = entity_substr_split [1 :]
290
- res = self .cur .execute ("SELECT * FROM inverted_index WHERE title MATCH '{}';" .format (entity_substr ))
291
- entities_and_ids = res .fetchall ()
298
+ entities_and_ids = self .find_title (entity_substr )
292
299
cand_ent_init = self .process_cand_ent (cand_ent_init , entities_and_ids , entity_substr_split , tag )
293
300
if self .lang == "@ru" :
294
301
entity_substr_split_lemm = [self .morph .parse (tok )[0 ].normal_form for tok in entity_substr_split ]
295
302
entity_substr_lemm = " " .join (entity_substr_split_lemm )
296
303
if entity_substr_lemm != entity_substr :
297
- res = self .cur .execute (
298
- "SELECT * FROM inverted_index WHERE title MATCH '{}';" .format (entity_substr_lemm )
299
- )
300
- entities_and_ids = res .fetchall ()
304
+ entities_and_ids = self .find_title (entity_substr_lemm )
301
305
if entities_and_ids :
302
306
cand_ent_init = self .process_cand_ent (
303
307
cand_ent_init , entities_and_ids , entity_substr_split_lemm , tag
@@ -311,14 +315,12 @@ def find_fuzzy_match(self, entity_substr_split, tag):
311
315
entity_substr_split_lemm = entity_substr_split
312
316
cand_ent_init = defaultdict (set )
313
317
for word in entity_substr_split :
314
- res = self .cur .execute ("SELECT * FROM inverted_index WHERE title MATCH '{}';" .format (word ))
315
- part_entities_and_ids = res .fetchall ()
318
+ part_entities_and_ids = self .find_title (word )
316
319
cand_ent_init = self .process_cand_ent (cand_ent_init , part_entities_and_ids , entity_substr_split , tag )
317
320
if self .lang == "@ru" :
318
321
word_lemm = self .morph .parse (word )[0 ].normal_form
319
322
if word != word_lemm :
320
- res = self .cur .execute ("SELECT * FROM inverted_index WHERE title MATCH '{}';" .format (word_lemm ))
321
- part_entities_and_ids = res .fetchall ()
323
+ part_entities_and_ids = self .find_title (word_lemm )
322
324
cand_ent_init = self .process_cand_ent (
323
325
cand_ent_init ,
324
326
part_entities_and_ids ,
0 commit comments