Skip to content

Commit c41cdf4

Browse files
move to file
1 parent 7fe2a7d commit c41cdf4

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

toolium/utils/ai_utils/text_similarity.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import json
2020
import logging
21+
from functools import lru_cache
2122

2223
# AI library imports must be optional to allow installing Toolium without `ai` extra dependency
2324
try:
@@ -36,6 +37,17 @@
3637
# Configure logger
3738
logger = logging.getLogger(__name__)
3839

40+
@lru_cache(maxsize=8)
41+
def get_nlp(model_name):
42+
"""
43+
Get the spaCy model.
44+
This method uses an LRU cache so that each spaCy model is loaded only once, which improves performance.
45+
46+
:param model_name: spaCy model name
47+
:return: spaCy model
48+
"""
49+
return spacy.load(model_name)
50+
3951
def is_negator(tok):
4052
"""
4153
Check if a token is a negator using Universal Dependencies guidelines
@@ -93,7 +105,8 @@ def preprocess_with_ud_negation(text, nlp):
93105

94106
def get_text_similarity_with_spacy(text, expected_text, model_name=None):
95107
"""
96-
Return similarity between two texts using spaCy
108+
Return similarity between two texts using spaCy.
109+
This method normalizes both texts before comparing them.
97110
98111
:param text: string to compare
99112
:param expected_text: string with the expected text
@@ -103,13 +116,12 @@ def get_text_similarity_with_spacy(text, expected_text, model_name=None):
103116
# NOTE: spaCy similarity performance can be enhanced using some strategies like:
104117
# - Normalizing texts (lowercase, extra points, etc.)
105118
# - Use only models that include word vectors (e.g., 'en_core_web_md' or 'en_core_web_lg')
106-
# - Preprocessing texts. In this approach, we only preprocess negations.
119+
# - Preprocessing texts. Currently, only negations are preprocessed.
107120
if spacy is None:
108121
raise ImportError("spaCy is not installed. Please run 'pip install toolium[ai]' to use spaCy features")
109122
config = DriverWrappersPool.get_default_wrapper().config
110123
model_name = model_name or config.get_optional('AI', 'spacy_model', 'es_core_news_md')
111-
# TODO: Cache loaded models to improve performance using @lru_cache(maxsize=N) as decorator
112-
model = spacy.load(model_name)
124+
model = get_nlp(model_name)
113125
text = model(preprocess_with_ud_negation(text, model))
114126
expected_text = model(preprocess_with_ud_negation(expected_text, model))
115127
similarity = model(text).similarity(model(expected_text))

0 commit comments

Comments
 (0)