# Module-level logger, named after this module so log records are attributable
logger = logging.getLogger(__name__)

def is_negator(tok):
    """
    Check if a token is a negator using Universal Dependencies guidelines
    Note: some languages may have different negation markers. That's why we use UD guidelines.

    :param tok: spaCy token
    """
    # A token counts as a negator when any of the UD markers applies:
    # - dependency relation "neg" (e.g., Spanish "no", "nunca", etc.)
    # - Polarity=Neg morphology (e.g., Spanish "no", "sin", etc.)
    # - PronType=Neg morphology (e.g., Spanish "nunca", "nadie", etc.)
    return (
        tok.dep_ == "neg"
        or "Neg" in tok.morph.get("Polarity")
        or "Neg" in tok.morph.get("PronType")
    )

def preprocess_with_ud_negation(text, nlp):
    """
    Preprocess text using Universal Dependencies negation handling.
    It tags negated words with "NEG_" prefix and replaces negators with "NEGATOR" token.
    Stop words are removed.

    :param text: input text
    :param nlp: spaCy language model
    """
    doc = nlp(text)
    # Positions of every negator token in the parse
    negator_positions = {tok.i for tok in doc if is_negator(tok)}
    # Positions of the syntactic heads governed by a negator (skipping
    # non-alphabetic heads and stop words), i.e. the words being negated
    marked_heads = {
        doc[pos].head.i
        for pos in negator_positions
        if doc[pos].head.is_alpha and not doc[pos].head.is_stop
    }

    pieces = []
    for tok in doc:
        if not tok.is_alpha:
            continue
        # Negators themselves become a sentinel token
        if is_negator(tok):
            pieces.append("NEGATOR")
            continue
        if tok.is_stop:
            continue

        lemma = tok.lemma_.lower()
        # Prefix negated words so they are distinguishable from their plain form
        pieces.append("NEG_" + lemma if tok.i in marked_heads else lemma)
    return " ".join(pieces)

4397
def get_text_similarity_with_spacy(text, expected_text, model_name=None):
    """
    Compute the semantic similarity between two texts using a spaCy model.

    Both texts are preprocessed with Universal Dependencies negation handling
    (see :func:`preprocess_with_ud_negation`) before comparison, so negations
    influence the similarity score.

    :param text: input text
    :param expected_text: text to compare the input text against
    :param model_name: name of the spaCy model to use; defaults to the
                       'AI.spacy_model' config property or 'es_core_news_md'
    :returns: similarity score between the two texts
    :raises ImportError: if spaCy is not installed
    """
    # NOTE: spaCy similarity performance can be enhanced using some strategies like:
    # - Normalizing texts (lowercase, extra points, etc.)
    # - Using only models that include word vectors (e.g., 'es_core_news_md' or 'es_core_news_lg')
    # - Preprocessing texts. In this approach, we only preprocess negations.
    if spacy is None:
        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
    config = DriverWrappersPool.get_default_wrapper().config
    model_name = model_name or config.get_optional('AI', 'spacy_model', 'es_core_news_md')
    # TODO: Cache loaded models to improve performance using @lru_cache(maxsize=N) as decorator
    model = spacy.load(model_name)
    # Run the pipeline exactly once per text: preprocess, then build the Doc used
    # for similarity. (Previously the resulting Doc was fed back through model(),
    # running the whole pipeline a second time per text for no benefit.)
    doc = model(preprocess_with_ud_negation(text, model))
    expected_doc = model(preprocess_with_ud_negation(expected_text, model))
    similarity = doc.similarity(expected_doc)
    # Lazy %-style args avoid formatting when INFO logging is disabled
    logger.info("spaCy similarity: %s between '%s' and '%s'", similarity, doc, expected_doc)
    return similarity