Skip to content

Commit 409f879

Browse files
text analysis
1 parent bd8696a commit 409f879

File tree

3 files changed

+166
-4
lines changed

3 files changed

+166
-4
lines changed

CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ v3.7.1
66

77
*Release date: In development*
88

9+
- Add text analysis tool to get an overall match of a text against a list of expected characteristics
10+
using AI libraries that come with the `ai` extra dependency
11+
912
v3.7.0
1013
------
1114

toolium/utils/ai_utils/openai.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,16 @@ def openai_request(system_message, user_message, model_name=None, azure=False, *
4949
model_name = model_name or config.get_optional('AI', 'openai_model', 'gpt-4o-mini')
5050
logger.info(f"Calling to OpenAI API with model {model_name}")
5151
client = AzureOpenAI(**kwargs) if azure else OpenAI(**kwargs)
52+
msg = []
53+
if isinstance(system_message, list):
54+
for prompt in system_message:
55+
msg.append({"role": "system", "content": prompt})
56+
else:
57+
msg.append({"role": "system", "content": system_message})
58+
msg.append({"role": "user", "content": user_message})
5259
completion = client.chat.completions.create(
5360
model=model_name,
54-
messages=[
55-
{"role": "system", "content": system_message},
56-
{"role": "user", "content": user_message},
57-
],
61+
messages=msg,
5862
)
5963
response = completion.choices[0].message.content
6064
logger.debug(f"OpenAI response: {response}")
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Copyright 2025 Telefónica Innovación Digital, S.L.
4+
This file is part of Toolium.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
"""
18+
import logging
19+
20+
try:
21+
from sentence_transformers import SentenceTransformer, util
22+
except ImportError:
23+
SentenceTransformer = None
24+
25+
from toolium.utils.ai_utils.openai import openai_request
26+
from toolium.driver_wrappers_pool import DriverWrappersPool
27+
28+
# Configure logger
29+
logger = logging.getLogger(__name__)
30+
31+
def build_system_message(characteristics):
    """
    Compose the system prompt used for the text criteria analysis.

    Every characteristic is rendered as a "- <characteristic>" bullet line and inserted into a
    fixed prompt template. The prompt asks the model to answer with a single JSON object that
    holds "overall_match", "features" and "data".

    :param characteristics: list of target characteristics to evaluate
    :returns: prompt text without leading or trailing whitespace
    """
    bullets = "\n".join(["- {}".format(item) for item in characteristics])
    # Doubled braces are literal braces of the JSON example; only {bullets} is substituted
    template = """
You are an assistant that scores how well a given text matches a set of target characteristics and returns a JSON object.

You will receive a user message that contains ONLY the text to analyze.

Target characteristics:
{bullets}

Tasks:
1) For EACH characteristic, decide how well the text satisfies it on a scale from 0.0 (does not satisfy it at all) to 1.0 (perfectly satisfies it). Consider style, tone and content when relevant.
2) Only for each low scored characteristic (<=0.2), output:
- "name": the exact characteristic name as listed above.
- "score": a float between 0.0 and 0.2.
3) Compute an overall score "overall_match" between 0.0 and 1.0 that summarizes how well the text matches the whole set. It does not have to be a simple arithmetic mean, but must still be in [0.0, 1.0].
4) Produce a "data" object that can contain extra structured analysis sections:
- "data" MUST always be present.
- "data" MUST be a JSON object.
- Each key in "data" is the title/name of a section (e.g. "genres", "entities", "style_breakdown").
- Each value is a JSON array (the structure of its objects will be defined by additional system instructions).

Output format (IMPORTANT):
Return ONLY a single valid JSON object with this exact top-level structure and property names:

{{
"overall_match": float,
"features": [
{{
"name": string,
"score": float
}}
],
"data": {{
"<section_title>": [
{{}}
]
}}
}}

Constraints:
- The "data" field must ALWAYS be present. If there are no extra sections, it MUST be: "data": {{}}.
- Use a dot as decimal separator (e.g. 0.75, not 0,75).
- Use at most 2 decimal places for all scores.
- Do NOT include any text outside the JSON (no Markdown, no comments, no explanations).
- If a characteristic is not applicable to the text, give it a low score (<= 0.2).
"""
    rendered = template.format(bullets=bullets)
    return rendered.strip()
84+
85+
86+
def get_text_criteria_analysis_openai(text_input, target_features, extra_tasks=None, model_name=None, azure=False, **kwargs):
    """
    Get text criteria analysis using OpenAI or Azure OpenAI. To analyze how well a given text
    matches a set of target characteristics.
    The model is asked to answer with a structured JSON object with overall match score,
    individual feature scores, and additional data sections.

    :param text_input: text to analyze
    :param target_features: list of target characteristics to evaluate
    :param extra_tasks: additional system message, or list of system messages, for extra analysis
                        sections (optional)
    :param model_name: name of the OpenAI model to use
    :param azure: whether to use Azure OpenAI or standard OpenAI
    :param kwargs: additional parameters to be used by the OpenAI client
    :returns: value returned by openai_request for the built messages
    """
    # The base prompt always goes first; extra tasks are sent as additional system messages
    system_messages = [build_system_message(target_features)]
    if extra_tasks:
        if isinstance(extra_tasks, list):
            system_messages.extend(extra_tasks)
        else:
            system_messages.append(extra_tasks)
    # Send the whole list: passing only the base prompt would silently drop the extra tasks
    return openai_request(system_messages, text_input, model_name, azure, **kwargs)
111+
112+
113+
def get_text_criteria_analysis_sentence_transformers(text_input, target_features, extra_tasks=None,
                                                     model_name=None, azure=True, **kwargs):
    """
    Get text criteria analysis using Sentence Transformers. To analyze how well a given text
    matches a set of target characteristics.

    :param text_input: text to analyze
    :param target_features: list of target characteristics to evaluate
    :param extra_tasks: additional system messages for extra analysis sections (not used here, for compatibility)
    :param model_name: name of the Sentence Transformers model to use
    :param azure: whether to use Azure OpenAI or standard OpenAI (not used here, for compatibility)
    :param kwargs: additional parameters to be used by Sentence Transformers client
    :returns: dict with "overall_match" (average of the feature scores), "features" (list of
              dicts with "name" and "score") and an always empty "data" section
    :raises ImportError: if Sentence Transformers is not installed
    :raises ValueError: if target_features is empty
    """
    if SentenceTransformer is None:
        raise ImportError("Sentence Transformers is not installed. Please run 'pip install toolium[ai]'"
                          " to use Sentence Transformers features")
    # Materialize once: the features are iterated twice (encoding and pairing with similarities)
    features = list(target_features)
    if not features:
        # Fail early with a clear message instead of dividing by zero after loading the model
        raise ValueError("target_features must contain at least one characteristic")
    config = DriverWrappersPool.get_default_wrapper().config
    model_name = model_name or config.get_optional('AI', 'sentence_transformers_model', 'all-mpnet-base-v2')
    model = SentenceTransformer(model_name, **kwargs)
    # Pre-compute feature embeddings
    feature_embs = model.encode(features, normalize_embeddings=True)
    # text_input embedding
    text_emb = model.encode(text_input, normalize_embeddings=True)
    # Computes cosine-similarities between the text and features tensors (range [-1, 1])
    sims = util.cos_sim(text_emb, feature_embs)[0].tolist()
    # Normalize each similarity from [-1, 1] to [0, 1] and pair it with its feature name
    results = [
        {"name": feature, "score": round((sim + 1.0) / 2.0, 2)}
        for feature, sim in zip(features, sims)
    ]

    # overall score as average of feature scores
    overall = sum(r["score"] for r in results) / len(results)

    return {
        "overall_match": round(overall, 2),
        "features": results,
        "data": {}
    }

0 commit comments

Comments
 (0)