feat(ai): improve spacy similarity (#434)

manuelporrasojeda · web-flow · commit 1bcea5ea8bdc · 2025-10-29T10:24:28.000+01:00
* improve spacy similarity

* move to file

* fix lint

* model update

* en_core_web_md

* split in files

* fix typo

* move toolium import

* improve tests

* no spacy import at text_similarity file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
         python -m pip install --upgrade pip
         pip install -r requirements.txt
         pip install -r requirements_dev.txt
-        python -m spacy download en_core_web_sm
+        python -m spacy download en_core_web_md
     - name: Lint with flake8
       run: |
         flake8 . --count --max-complexity=10 --max-line-length=121 --show-source --statistics
diff --git a/toolium/test/utils/ai_utils/test_text_similarity.py b/toolium/test/utils/ai_utils/test_text_similarity.py
@@ -39,8 +39,8 @@ def configure_default_openai_model():
 
 get_similarity_examples = (
     ('Today it will be sunny', 'Today it will be sunny', 0.9, 1),
-    ('Today is sunny', 'Today it will be sunny', 0.6, 0.9),
-    ('It is sunny', 'Today it will be sunny', 0.5, 0.7),
+    ('Today is sunny', 'Today it will be sunny', 0.7, 1),
+    ('It is sunny', 'Today it will be sunny', 0.7, 1),
     ('Nothing related', 'Today it will be sunny', 0, 0.6),
 )
 
diff --git a/toolium/utils/ai_utils/spacy.py b/toolium/utils/ai_utils/spacy.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright 2025 Telefónica Innovación Digital, S.L.
+This file is part of Toolium.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+from functools import lru_cache
+
+# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
+try:
+    import spacy
+except ImportError:
+    spacy = None
+
+
+# Configure logger
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=8)
+def get_spacy_model(model_name):
+    """
+    Get a spaCy model by name.
+    Loaded models are kept in an LRU cache so repeated calls with the same name do not reload them.
+
+    :param model_name: spaCy model name
+    :return: spaCy model, or None if spaCy is not installed
+    """
+    if spacy is None:
+        return None
+    return spacy.load(model_name)
+
+
+def is_negator(tok):
+    """
+    Check if a token is a negator using Universal Dependencies guidelines
+    Note: some languages may have different negation markers. That's why we use UD guidelines.
+
+    :param tok: spaCy token
+    """
+    # Universal Dependencies negation detection (e.g., Spanish "no", "nunca", etc.)
+    if tok.dep_ == "neg":
+        return True
+    # Some languages use Polarity=Neg for negation words (e.g., Spanish "no", "sin", etc.)
+    if "Neg" in tok.morph.get("Polarity"):
+        return True
+    # Some languages use PronType=Neg for negation words (e.g., Spanish "nunca", "nadie", etc.)
+    if "Neg" in tok.morph.get("PronType"):
+        return True
+    return False
+
+
+def preprocess_with_ud_negation(text, nlp):
+    """
+    Preprocess text using Universal Dependencies negation handling.
+    It tags negated words with "NEG_" prefix and replaces negators with "NEGATOR" token.
+    Non-alphabetic tokens and stop words (other than negators) are removed; kept words become lowercased lemmas.
+
+    :param text: input text
+    :param nlp: spaCy language model
+    """
+    doc = nlp(text)
+    # 1) Negators indexes
+    neg_idxs = {t.i for t in doc if is_negator(t)}
+    # 2) Negated heads indexes
+    negated_heads = set()
+    for i in neg_idxs:
+        head = doc[i].head
+        if head.is_alpha and not head.is_stop:
+            negated_heads.add(head.i)
+
+    toks = []
+    for t in doc:
+        if not t.is_alpha:
+            continue
+        # Replace negators with a fixed placeholder, even when they are stop words
+        if is_negator(t):
+            toks.append("NEGATOR")
+            continue
+        if t.is_stop:
+            continue
+
+        lemma = t.lemma_.lower()
+        if t.i in negated_heads:
+            toks.append("NEG_" + lemma)
+        else:
+            toks.append(lemma)
+    return " ".join(toks)
diff --git a/toolium/utils/ai_utils/text_similarity.py b/toolium/utils/ai_utils/text_similarity.py
@@ -19,18 +19,14 @@
 import json
 import logging
 
-# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
-try:
-    import spacy
-except ImportError:
-    spacy = None
 try:
     from sentence_transformers import SentenceTransformer
 except ImportError:
     SentenceTransformer = None
 
 from toolium.driver_wrappers_pool import DriverWrappersPool
 from toolium.utils.ai_utils.openai import openai_request
+from toolium.utils.ai_utils.spacy import get_spacy_model, preprocess_with_ud_negation
 
 
 # Configure logger
@@ -39,18 +35,25 @@
 
 def get_text_similarity_with_spacy(text, expected_text, model_name=None):
     """
-    Return similarity between two texts using spaCy
+    Return similarity between two texts using spaCy.
+    This method normalizes both texts before comparing them.
 
     :param text: string to compare
     :param expected_text: string with the expected text
     :param model_name: name of the spaCy model to use
     :returns: similarity score between the two texts
     """
-    if spacy is None:
-        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
+    # NOTE: spaCy similarity performance can be enhanced using some strategies like:
+    # - Normalizing texts (lowercase, extra points, etc.)
+    # - Use only models that include word vectors (e.g., 'en_core_web_md' or 'en_core_web_lg')
+    # - Preprocessing texts. Now we only preprocess negations.
     config = DriverWrappersPool.get_default_wrapper().config
-    model_name = model_name or config.get_optional('AI', 'spacy_model', 'en_core_web_sm')
-    model = spacy.load(model_name)
+    model_name = model_name or config.get_optional('AI', 'spacy_model', 'en_core_web_md')
+    model = get_spacy_model(model_name)
+    if model is None:
+        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
+    text = model(preprocess_with_ud_negation(text, model))
+    expected_text = model(preprocess_with_ud_negation(expected_text, model))
     similarity = model(text).similarity(model(expected_text))
     logger.info(f"spaCy similarity: {similarity} between '{text}' and '{expected_text}'")
     return similarity

Original file line number	Diff line number	Diff line change
`@@ -39,8 +39,8 @@ def configure_default_openai_model():`
`39`	`39`
`40`	`40`	`get_similarity_examples = (`
`41`	`41`	`('Today it will be sunny', 'Today it will be sunny', 0.9, 1),`
`42`		`- ('Today is sunny', 'Today it will be sunny', 0.6, 0.9),`
`43`		`- ('It is sunny', 'Today it will be sunny', 0.5, 0.7),`
	`42`	`+ ('Today is sunny', 'Today it will be sunny', 0.7, 1),`
	`43`	`+ ('It is sunny', 'Today it will be sunny', 0.7, 1),`
`44`	`44`	`('Nothing related', 'Today it will be sunny', 0, 0.6),`
`45`	`45`	`)`
`46`	`46`