Skip to content

Commit 1bcea5e

Browse files
feat(ai): improve spacy similarity (#434)
* improve spacy similarity
* move to file
* fix lint
* model update
* en_core_web_md
* split in files
* fix typo
* move toolium import
* improve tests
* no spacy import at text_similarity file
1 parent f93d205 commit 1bcea5e

File tree

4 files changed

+117
-13
lines changed

4 files changed

+117
-13
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
python -m pip install --upgrade pip
2222
pip install -r requirements.txt
2323
pip install -r requirements_dev.txt
24-
python -m spacy download en_core_web_sm
24+
python -m spacy download en_core_web_md
2525
- name: Lint with flake8
2626
run: |
2727
flake8 . --count --max-complexity=10 --max-line-length=121 --show-source --statistics

toolium/test/utils/ai_utils/test_text_similarity.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ def configure_default_openai_model():
3939

4040
get_similarity_examples = (
4141
('Today it will be sunny', 'Today it will be sunny', 0.9, 1),
42-
('Today is sunny', 'Today it will be sunny', 0.6, 0.9),
43-
('It is sunny', 'Today it will be sunny', 0.5, 0.7),
42+
('Today is sunny', 'Today it will be sunny', 0.7, 1),
43+
('It is sunny', 'Today it will be sunny', 0.7, 1),
4444
('Nothing related', 'Today it will be sunny', 0, 0.6),
4545
)
4646

toolium/utils/ai_utils/spacy.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Copyright 2025 Telefónica Innovación Digital, S.L.
4+
This file is part of Toolium.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
"""
18+
19+
import logging
20+
from functools import lru_cache
21+
22+
# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
23+
try:
24+
import spacy
25+
except ImportError:
26+
spacy = None
27+
28+
29+
# Configure logger
30+
logger = logging.getLogger(__name__)
31+
32+
33+
@lru_cache(maxsize=8)
def get_spacy_model(model_name):
    """
    Load a spaCy model by name.
    Results are memoized with an LRU cache (up to 8 entries), so each model name is loaded only once
    while it stays in the cache.

    :param model_name: spaCy model name
    :return: spaCy model, or None when the spaCy package could not be imported
    """
    # `spacy` is set to None at module level when the optional dependency is missing
    return spacy.load(model_name) if spacy is not None else None
45+
46+
47+
def is_negator(tok):
    """
    Tell whether a token is a negation marker, following Universal Dependencies annotations.
    Negation is encoded differently across languages, so both the dependency label and the
    morphological features of the token are inspected.

    :param tok: spaCy token
    :return: True if the token is a negator, False otherwise
    """
    # Dependency label used for negation modifiers
    if tok.dep_ == "neg":
        return True
    # Morphological features that may carry the "Neg" value:
    # - Polarity=Neg for negation words (e.g., Spanish "no", "sin", etc.)
    # - PronType=Neg for negative pronouns/adverbs (e.g., Spanish "nunca", "nadie", etc.)
    morph = tok.morph
    return any("Neg" in morph.get(feature) for feature in ("Polarity", "PronType"))
64+
65+
66+
def preprocess_with_ud_negation(text, nlp):
    """
    Preprocess text using Universal Dependencies negation handling.
    It tags negated words with "NEG_" prefix and replaces negators with "NEGATOR" token.
    Stop words and non-alphabetic tokens are removed; remaining words are replaced by their lowercase lemma.

    :param text: input text
    :param nlp: spaCy language model
    :return: preprocessed text, with the resulting tokens joined by single spaces
    """
    doc = nlp(text)
    # 1) Negators indexes (computed once and reused when building the output)
    neg_idxs = {t.i for t in doc if is_negator(t)}
    # 2) Negated heads indexes: only alphabetic, non-stop heads are tagged as negated
    negated_heads = set()
    for i in neg_idxs:
        head = doc[i].head
        if head.is_alpha and not head.is_stop:
            negated_heads.add(head.i)

    toks = []
    for t in doc:
        # Replace negators before any filtering: contracted negators such as "n't" are not alphabetic,
        # so filtering first would silently drop the negation from the output
        if t.i in neg_idxs:
            toks.append("NEGATOR")
            continue
        if not t.is_alpha or t.is_stop:
            continue

        lemma = t.lemma_.lower()
        toks.append("NEG_" + lemma if t.i in negated_heads else lemma)
    return " ".join(toks)

toolium/utils/ai_utils/text_similarity.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,14 @@
1919
import json
2020
import logging
2121

22-
# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
23-
try:
24-
import spacy
25-
except ImportError:
26-
spacy = None
2722
try:
2823
from sentence_transformers import SentenceTransformer
2924
except ImportError:
3025
SentenceTransformer = None
3126

3227
from toolium.driver_wrappers_pool import DriverWrappersPool
3328
from toolium.utils.ai_utils.openai import openai_request
29+
from toolium.utils.ai_utils.spacy import get_spacy_model, preprocess_with_ud_negation
3430

3531

3632
# Configure logger
@@ -39,18 +35,25 @@
3935

4036
def get_text_similarity_with_spacy(text, expected_text, model_name=None):
    """
    Return similarity between two texts using spaCy.
    This method normalizes both texts before comparing them.

    :param text: string to compare
    :param expected_text: string with the expected text
    :param model_name: name of the spaCy model to use. If not given, the 'spacy_model' option of the
                       'AI' config section is used, falling back to 'en_core_web_md'
    :returns: similarity score between the two texts
    :raises ImportError: if spaCy is not installed
    """
    # NOTE: spaCy similarity performance can be enhanced using some strategies like:
    # - Normalizing texts (lowercase, extra points, etc.)
    # - Use only models that include word vectors (e.g., 'en_core_web_md' or 'en_core_web_lg')
    # - Preprocessing texts. Now we only preprocess negations.
    config = DriverWrappersPool.get_default_wrapper().config
    model_name = model_name or config.get_optional('AI', 'spacy_model', 'en_core_web_md')
    model = get_spacy_model(model_name)
    if model is None:
        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
    # Build each Doc once from the normalized text; the original strings are kept for logging
    doc = model(preprocess_with_ud_negation(text, model))
    expected_doc = model(preprocess_with_ud_negation(expected_text, model))
    similarity = doc.similarity(expected_doc)
    logger.info(f"spaCy similarity: {similarity} between '{text}' and '{expected_text}'")
    return similarity

0 commit comments

Comments
 (0)