 
 import re
 import sqlite3
+from collections import defaultdict
 from logging import getLogger
 from typing import List, Dict, Tuple, Union, Any
-from collections import defaultdict
 
-import pymorphy2
+import spacy
 from hdt import HDTDocument
 from nltk.corpus import stopwords
 from rapidfuzz import fuzz
 
+from deeppavlov.core.commands.utils import expand_path
 from deeppavlov.core.common.registry import register
 from deeppavlov.core.models.component import Component
 from deeppavlov.core.models.serializable import Serializable
-from deeppavlov.core.commands.utils import expand_path
 
 log = getLogger(__name__)
 
@@ -75,7 +75,6 @@ def __init__(
             **kwargs:
         """
         super().__init__(save_path=None, load_path=load_path)
-        self.morph = pymorphy2.MorphAnalyzer()
         self.lemmatize = lemmatize
         self.entities_database_filename = entities_database_filename
         self.num_entities_for_bert_ranking = num_entities_for_bert_ranking
@@ -86,8 +85,10 @@ def __init__(
         self.lang = f"@{lang}"
         if self.lang == "@en":
             self.stopwords = set(stopwords.words("english"))
+            self.nlp = spacy.load("en_core_web_sm")
         elif self.lang == "@ru":
             self.stopwords = set(stopwords.words("russian"))
+            self.nlp = spacy.load("ru_core_news_sm")
         self.use_descriptions = use_descriptions
         self.use_connections = use_connections
         self.max_paragraph_len = max_paragraph_len
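
Note (not part of the diff): the two spacy.load calls added above assume the en_core_web_sm and ru_core_news_sm pipelines are already installed in the environment; spacy.load raises OSError otherwise. A minimal pre-flight sketch, reusing the model names from the diff:

    # install once per environment, e.g.:
    #   python -m spacy download en_core_web_sm
    #   python -m spacy download ru_core_news_sm
    import spacy

    nlp = spacy.load("en_core_web_sm")  # raises OSError if the model is absent
    print(nlp("presidents")[0].lemma_)  # expected: president
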
@@ -198,7 +199,7 @@ def link_entities(
         ):
             cand_ent_scores = []
             if len(entity_substr) > 1:
-                entity_substr_split_lemm = [self.morph.parse(tok)[0].normal_form for tok in entity_substr_split]
+                entity_substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in entity_substr_split]
                 cand_ent_init = self.find_exact_match(entity_substr, tag)
                 if not cand_ent_init or entity_substr_split != entity_substr_split_lemm:
                     cand_ent_init = self.find_fuzzy_match(entity_substr_split, tag)
@@ -297,28 +298,23 @@ def find_exact_match(self, entity_substr, tag):
             entity_substr_split = entity_substr_split[1:]
         entities_and_ids = self.find_title(entity_substr)
         cand_ent_init = self.process_cand_ent(cand_ent_init, entities_and_ids, entity_substr_split, tag)
-        if self.lang == "@ru":
-            entity_substr_split_lemm = [self.morph.parse(tok)[0].normal_form for tok in entity_substr_split]
-            entity_substr_lemm = " ".join(entity_substr_split_lemm)
-            if entity_substr_lemm != entity_substr:
-                entities_and_ids = self.find_title(entity_substr_lemm)
-                if entities_and_ids:
-                    cand_ent_init = self.process_cand_ent(
-                        cand_ent_init, entities_and_ids, entity_substr_split_lemm, tag
-                    )
+
+        entity_substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in entity_substr_split]
+        entity_substr_lemm = " ".join(entity_substr_split_lemm)
+        if entity_substr_lemm != entity_substr:
+            entities_and_ids = self.find_title(entity_substr_lemm)
+            if entities_and_ids:
+                cand_ent_init = self.process_cand_ent(cand_ent_init, entities_and_ids, entity_substr_split_lemm, tag)
         return cand_ent_init
 
     def find_fuzzy_match(self, entity_substr_split, tag):
-        if self.lang == "@ru":
-            entity_substr_split_lemm = [self.morph.parse(tok)[0].normal_form for tok in entity_substr_split]
-        else:
-            entity_substr_split_lemm = entity_substr_split
+        entity_substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in entity_substr_split]
         cand_ent_init = defaultdict(set)
         for word in entity_substr_split:
             part_entities_and_ids = self.find_title(word)
             cand_ent_init = self.process_cand_ent(cand_ent_init, part_entities_and_ids, entity_substr_split, tag)
             if self.lang == "@ru":
-                word_lemm = self.morph.parse(word)[0].normal_form
+                word_lemm = self.nlp(word)[0].lemma_
                 if word != word_lemm:
                     part_entities_and_ids = self.find_title(word_lemm)
                     cand_ent_init = self.process_cand_ent(
@@ -329,11 +325,6 @@ def find_fuzzy_match(self, entity_substr_split, tag):
                     )
         return cand_ent_init
 
-    def morph_parse(self, word):
-        morph_parse_tok = self.morph.parse(word)[0]
-        normal_form = morph_parse_tok.normal_form
-        return normal_form
-
     def calc_substr_score(self, cand_entity_title, entity_substr_split):
         label_tokens = cand_entity_title.split()
         cnt = 0.0
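
Note: with the "@ru" guards removed in find_exact_match and find_fuzzy_match, the self.nlp(tok)[0].lemma_ pattern now lemmatizes for both languages, where pymorphy2 previously covered Russian only. A minimal standalone sketch of that per-token pattern (model name as loaded in __init__ above):

    import spacy

    nlp = spacy.load("en_core_web_sm")

    entity_substr_split = ["presidents", "of", "France"]
    # each token goes through the full pipeline; the lemma of the first
    # (and only) resulting token is kept, mirroring the diff
    entity_substr_split_lemm = [nlp(tok)[0].lemma_ for tok in entity_substr_split]
    print(entity_substr_split_lemm)  # e.g. ['president', 'of', 'France']

Running the whole pipeline once per token is the simplest drop-in for morph.parse; if it ever shows up in profiles, nlp.pipe(tokens) or disabling components unused for lemmatization (e.g. parser and ner) would cut the overhead.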