Skip to content

Commit ecc5f3d

Browse files
improve spacy similarity
1 parent 7bc3e90 commit ecc5f3d

File tree

1 file changed

+62
-1
lines changed

1 file changed

+62
-1
lines changed

toolium/utils/ai_utils.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,60 @@
4040
# Configure logger
4141
logger = logging.getLogger(__name__)
4242

43+
def is_negator(tok):
    """
    Tell whether a spaCy token marks negation, following Universal Dependencies guidelines.

    UD conventions are used because negation markers differ across languages.

    :param tok: spaCy token
    :returns: True if the token is a negator, False otherwise
    """
    # A token counts as a negator when any UD negation signal is present:
    # - the "neg" dependency relation (e.g., Spanish "no", "nunca")
    # - Polarity=Neg morphology (e.g., Spanish "no", "sin")
    # - PronType=Neg morphology (e.g., Spanish "nunca", "nadie")
    negation_signals = (
        tok.dep_ == "neg",
        "Neg" in tok.morph.get("Polarity"),
        "Neg" in tok.morph.get("PronType"),
    )
    return any(negation_signals)
60+
61+
def preprocess_with_ud_negation(text, nlp):
    """
    Preprocess text using Universal Dependencies negation handling.

    Negators are replaced by the "NEGATOR" marker token and the words they modify
    are tagged with a "NEG_" prefix. Non-alphabetic tokens and stop words are removed.

    :param text: input text
    :param nlp: spaCy language model
    :returns: preprocessed text as a space-separated string of tokens
    """
    doc = nlp(text)
    # Collect the indexes of the heads modified by a negator
    # (only alphabetic, non-stop heads are eligible for the NEG_ tag)
    negated_heads = set()
    for token in doc:
        if is_negator(token) and token.head.is_alpha and not token.head.is_stop:
            negated_heads.add(token.head.i)

    pieces = []
    for token in doc:
        # Drop punctuation, numbers and any other non-alphabetic token
        if not token.is_alpha:
            continue
        if is_negator(token):
            # Negators are kept, normalized to a single marker token
            # (checked before the stop-word filter so stop-word negators survive)
            pieces.append("NEGATOR")
        elif not token.is_stop:
            lemma = token.lemma_.lower()
            pieces.append("NEG_" + lemma if token.i in negated_heads else lemma)
    return " ".join(pieces)
4397

4498
def get_text_similarity_with_spacy(text, expected_text, model_name=None):
    """
    Get the semantic similarity between two texts using a spaCy model's word vectors.

    Both texts are preprocessed with UD negation handling before comparison, so that
    negated phrases do not score as similar to their affirmative counterparts.

    :param text: first text to compare
    :param expected_text: second text to compare
    :param model_name: name of the spaCy model to use; defaults to the 'spacy_model'
                       option in the [AI] config section, or 'es_core_news_md'
    :returns: similarity score between the two texts
    :raises ImportError: if spaCy is not installed
    """
    # NOTE: spaCy similarity performance can be enhanced using some strategies like:
    # - Normalizing texts (lowercase, extra points, etc.)
    # - Use only models that include word vectors (e.g., 'en_core_news_md' or 'en_core_news_lg')
    # - Preprocessing texts. In this approach, we only preprocess negations.
    if spacy is None:
        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
    config = DriverWrappersPool.get_default_wrapper().config
    model_name = model_name or config.get_optional('AI', 'spacy_model', 'es_core_news_md')
    # TODO: Cache loaded models to improve performance using @lru_cache(maxsize=N) as decorator
    model = spacy.load(model_name)
    # Run each preprocessed text through the pipeline exactly once and compare the Docs
    # directly. Do not rebind the input parameters: the originals are needed for the log
    # message, and re-processing an already-built Doc would run the pipeline twice.
    doc = model(preprocess_with_ud_negation(text, model))
    expected_doc = model(preprocess_with_ud_negation(expected_text, model))
    similarity = doc.similarity(expected_doc)
    logger.info(f"spaCy similarity: {similarity} between '{text}' and '{expected_text}'")
    return similarity

0 commit comments

Comments
 (0)