Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements_dev.txt
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_md
- name: Lint with flake8
run: |
flake8 . --count --max-complexity=10 --max-line-length=121 --show-source --statistics
Expand Down
99 changes: 99 additions & 0 deletions toolium/utils/ai_utils/spacy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
"""
Copyright 2025 Telefónica Innovación Digital, S.L.
This file is part of Toolium.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
from functools import lru_cache

# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
try:
import spacy
except ImportError:
spacy = None


# Configure logger
logger = logging.getLogger(__name__)


@lru_cache(maxsize=8)
def get_spacy_model(model_name):
    """
    Get a spaCy model by name.
    Loaded models are kept in an LRU cache (up to 8 entries, keyed by model name) so repeated
    calls with the same name reuse the already loaded model instead of loading it again.

    :param model_name: spaCy model name
    :return: spaCy model
    :raises ImportError: if spaCy is not installed
    """
    # spaCy is an optional dependency: the module-level import leaves it as None when missing,
    # so fail with an actionable message instead of an AttributeError on None
    if spacy is None:
        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
    return spacy.load(model_name)


def is_negator(tok):
    """
    Tell whether a token acts as a negation marker according to Universal Dependencies annotations.
    Languages mark negation in different ways, so both the dependency label and the
    morphological features are inspected.

    :param tok: spaCy token
    :return: True if the token is a negator, False otherwise
    """
    # Dependency relation explicitly labelled as negation
    if tok.dep_ == "neg":
        return True
    # Otherwise rely on morphology: Polarity=Neg (e.g., Spanish "no", "sin")
    # or PronType=Neg (e.g., Spanish "nunca", "nadie")
    morphology = tok.morph
    return any("Neg" in morphology.get(feature) for feature in ("Polarity", "PronType"))


def preprocess_with_ud_negation(text, nlp):
    """
    Preprocess text using Universal Dependencies negation handling.
    Negators are replaced with a "NEGATOR" token and the lemma of each negated head
    is tagged with a "NEG_" prefix.
    Stop words and non-alphabetic tokens are removed; the remaining tokens are lowercased lemmas.

    :param text: input text
    :param nlp: spaCy language model
    :return: preprocessed text, with the resulting tokens joined by single spaces
    """
    doc = nlp(text)
    # 1) Negators indexes (computed once and reused when emitting tokens)
    neg_idxs = {t.i for t in doc if is_negator(t)}
    # 2) Negated heads indexes: heads of negators that are alphabetic and not stop words
    negated_heads = set()
    for i in neg_idxs:
        head = doc[i].head
        if head.is_alpha and not head.is_stop:
            negated_heads.add(head.i)

    toks = []
    for t in doc:
        # Replace negators with a common marker. This check goes before the alphabetic filter
        # because some negators are not alphabetic (e.g., English "n't") and would otherwise
        # be dropped while their head is still tagged as negated.
        if t.i in neg_idxs:
            toks.append("NEGATOR")
            continue
        if not t.is_alpha or t.is_stop:
            continue

        lemma = t.lemma_.lower()
        toks.append("NEG_" + lemma if t.i in negated_heads else lemma)
    return " ".join(toks)
14 changes: 11 additions & 3 deletions toolium/utils/ai_utils/text_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

from toolium.driver_wrappers_pool import DriverWrappersPool
from toolium.utils.ai_utils.openai import openai_request
from toolium.utils.ai_utils.spacy import get_spacy_model, preprocess_with_ud_negation


# Configure logger
Expand All @@ -39,18 +40,25 @@

def get_text_similarity_with_spacy(text, expected_text, model_name=None):
    """
    Return similarity between two texts using spaCy.
    This method normalizes both texts (negation-aware preprocessing) before comparing them.

    :param text: string to compare
    :param expected_text: string with the expected text
    :param model_name: name of the spaCy model to use. If not set, the 'spacy_model' property of
                       the [AI] config section is used, falling back to 'en_core_web_md'
    :returns: similarity score between the two texts
    :raises ImportError: if spaCy is not installed
    """
    # NOTE: spaCy similarity performance can be enhanced using some strategies like:
    # - Normalizing texts (lowercase, extra points, etc.)
    # - Using only models that include word vectors (e.g., 'en_core_web_md' or 'en_core_web_lg')
    # - Preprocessing texts. Now we only preprocess negations.
    if spacy is None:
        raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
    config = DriverWrappersPool.get_default_wrapper().config
    model_name = model_name or config.get_optional('AI', 'spacy_model', 'en_core_web_md')
    # Cached loader: avoids reloading the model on every comparison
    model = get_spacy_model(model_name)
    # Normalize both texts and parse each normalized string exactly once
    normalized_text = preprocess_with_ud_negation(text, model)
    normalized_expected_text = preprocess_with_ud_negation(expected_text, model)
    similarity = model(normalized_text).similarity(model(normalized_expected_text))
    logger.info(f"spaCy similarity: {similarity} between '{normalized_text}' and '{normalized_expected_text}'")
    return similarity
Expand Down
Loading