Telefonica · manuelporrasojeda · Oct 29, 2025 · Oct 23, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
         python -m pip install --upgrade pip
         pip install -r requirements.txt
         pip install -r requirements_dev.txt
-        python -m spacy download en_core_web_sm
+        python -m spacy download en_core_web_md
     - name: Lint with flake8
       run: |
         flake8 . --count --max-complexity=10 --max-line-length=121 --show-source --statistics

diff --git a/toolium/utils/ai_utils/spacy.py b/toolium/utils/ai_utils/spacy.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright 2025 Telefónica Innovación Digital, S.L.
+This file is part of Toolium.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+from functools import lru_cache
+
+# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
+try:
+    import spacy
+except ImportError:
+    spacy = None
+
+
+# Configure logger
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=8)
+def get_spacy_model(model_name):
+    """
+    Return the loaded spaCy model (LRU-cached, up to 8 models); raise ImportError if spaCy is not installed.
+
+    :param model_name: spaCy model name
+    """
+    if spacy is None:
+        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
+    return spacy.load(model_name)
+
+
+def is_negator(tok):
+    """
+    Tell whether a token marks negation according to its Universal Dependencies annotations.
+    Note: UD guidelines are used because negation markers differ from one language to another.
+
+    :param tok: spaCy token
+    :returns: True if the token is a negator, False otherwise
+    """
+    # Dependency relation "neg" (e.g., Spanish "no", "nunca", etc.)
+    if tok.dep_ == "neg":
+        return True
+    # Morphological features: some languages use Polarity=Neg (e.g., Spanish "no", "sin", etc.)
+    # or PronType=Neg (e.g., Spanish "nunca", "nadie", etc.) for negation words
+    for feature in ("Polarity", "PronType"):
+        if "Neg" in tok.morph.get(feature):
+            return True
+    return False
+
+
+def preprocess_with_ud_negation(text, nlp):
+    """
+    Preprocess text using Universal Dependencies negation handling.
+    Negators are replaced with a "NEGATOR" token, negated heads are tagged with a "NEG_" prefix and
+    the remaining alphabetic words are lemmatized in lowercase. Stop words are removed.
+
+    :param text: input text
+    :param nlp: spaCy language model
+    :returns: preprocessed text, with the kept tokens joined by single spaces
+    """
+    doc = nlp(text)
+    # 1) Negators indexes (computed once and reused in the loop below)
+    neg_idxs = {t.i for t in doc if is_negator(t)}
+    # 2) Negated heads indexes (only alphabetic, non-stop heads can be tagged)
+    negated_heads = set()
+    for i in neg_idxs:
+        head = doc[i].head
+        if head.is_alpha and not head.is_stop:
+            negated_heads.add(head.i)
+
+    toks = []
+    for t in doc:
+        # Negators go first: contractions such as English "n't" are not alphabetic and would
+        # otherwise be dropped by the filter below instead of being replaced with "NEGATOR"
+        if t.i in neg_idxs:
+            toks.append("NEGATOR")
+            continue
+        # Drop non-alphabetic tokens and stop words
+        if not t.is_alpha or t.is_stop:
+            continue
+        lemma = t.lemma_.lower()
+        if t.i in negated_heads:
+            toks.append("NEG_" + lemma)
+        else:
+            toks.append(lemma)
+    return " ".join(toks)
diff --git a/toolium/utils/ai_utils/text_similarity.py b/toolium/utils/ai_utils/text_similarity.py
@@ -31,6 +31,7 @@
 
 from toolium.driver_wrappers_pool import DriverWrappersPool
 from toolium.utils.ai_utils.openai import openai_request
+from toolium.utils.ai_utils.spacy import get_spacy_model, preprocess_with_ud_negation
 
 
 # Configure logger
@@ -39,18 +40,25 @@
 
 def get_text_similarity_with_spacy(text, expected_text, model_name=None):
     """
-    Return similarity between two texts using spaCy
+    Return similarity between two texts using spaCy.
+    Negations in both texts are preprocessed before comparing them.
 
     :param text: string to compare
     :param expected_text: string with the expected text
     :param model_name: name of the spaCy model to use
     :returns: similarity score between the two texts
     """
+    # NOTE: spaCy similarity performance can be enhanced using some strategies like:
+    # - Normalizing texts (lowercase, extra punctuation, etc.)
+    # - Using only models that include word vectors (e.g., 'en_core_web_md' or 'en_core_web_lg')
+    # - Preprocessing texts. For now, only negations are preprocessed.
     if spacy is None:
         raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
     config = DriverWrappersPool.get_default_wrapper().config
-    model_name = model_name or config.get_optional('AI', 'spacy_model', 'en_core_web_sm')
-    model = spacy.load(model_name)
-    similarity = model(text).similarity(model(expected_text))
+    model_name = model_name or config.get_optional('AI', 'spacy_model', 'en_core_web_md')
+    model = get_spacy_model(model_name)
+    doc = model(preprocess_with_ud_negation(text, model))
+    expected_doc = model(preprocess_with_ud_negation(expected_text, model))
+    similarity = doc.similarity(expected_doc)  # both docs are already parsed: no second pipeline pass
     logger.info(f"spaCy similarity: {similarity} between '{text}' and '{expected_text}'")
     return similarity