|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Copyright 2025 Telefónica Innovación Digital, S.L. |
| 4 | +This file is part of Toolium. |
| 5 | +
|
| 6 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | +you may not use this file except in compliance with the License. |
| 8 | +You may obtain a copy of the License at |
| 9 | +
|
| 10 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +
|
| 12 | +Unless required by applicable law or agreed to in writing, software |
| 13 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +See the License for the specific language governing permissions and |
| 16 | +limitations under the License. |
| 17 | +""" |
| 18 | +import logging |
| 19 | + |
| 20 | +try: |
| 21 | + from sentence_transformers import SentenceTransformer, util |
| 22 | +except ImportError: |
| 23 | + SentenceTransformer = None |
| 24 | + |
| 25 | +from toolium.utils.ai_utils.openai import openai_request |
| 26 | +from toolium.driver_wrappers_pool import DriverWrappersPool |
| 27 | + |
| 28 | +# Configure logger |
| 29 | +logger = logging.getLogger(__name__) |
| 30 | + |
def build_system_message(characteristics):
    """
    Compose the system prompt used to request a text criteria analysis.

    Each characteristic is rendered as a "- <characteristic>" bullet and embedded in a fixed
    prompt that asks the model to score the text and answer with a single JSON object
    containing "overall_match", "features" and "data".

    :param characteristics: list of target characteristics to evaluate
    :returns: prompt text with leading and trailing whitespace removed
    """
    bullet_lines = []
    for characteristic in characteristics:
        bullet_lines.append("- {}".format(characteristic))
    feature_list = "\n".join(bullet_lines)
    # Doubled braces are literal braces in the rendered prompt; only {feature_list} is interpolated
    return f"""
    You are an assistant that scores how well a given text matches a set of target characteristics and returns a JSON object.

    You will receive a user message that contains ONLY the text to analyze.

    Target characteristics:
    {feature_list}

    Tasks:
    1) For EACH characteristic, decide how well the text satisfies it on a scale from 0.0 (does not satisfy it at all) to 1.0 (perfectly satisfies it). Consider style, tone and content when relevant.
    2) Only for each low scored characteristic (<=0.2), output:
    - "name": the exact characteristic name as listed above.
    - "score": a float between 0.0 and 0.2.
    3) Compute an overall score "overall_match" between 0.0 and 1.0 that summarizes how well the text matches the whole set. It does not have to be a simple arithmetic mean, but must still be in [0.0, 1.0].
    4) Produce a "data" object that can contain extra structured analysis sections:
    - "data" MUST always be present.
    - "data" MUST be a JSON object.
    - Each key in "data" is the title/name of a section (e.g. "genres", "entities", "style_breakdown").
    - Each value is a JSON array (the structure of its objects will be defined by additional system instructions).

    Output format (IMPORTANT):
    Return ONLY a single valid JSON object with this exact top-level structure and property names:

    {{
    "overall_match": float,
    "features": [
    {{
    "name": string,
    "score": float
    }}
    ],
    "data": {{
    "<section_title>": [
    {{}}
    ]
    }}
    }}

    Constraints:
    - The "data" field must ALWAYS be present. If there are no extra sections, it MUST be: "data": {{}}.
    - Use a dot as decimal separator (e.g. 0.75, not 0,75).
    - Use at most 2 decimal places for all scores.
    - Do NOT include any text outside the JSON (no Markdown, no comments, no explanations).
    - If a characteristic is not applicable to the text, give it a low score (<= 0.2).
    """.strip()
| 84 | + |
| 85 | + |
def get_text_criteria_analysis_openai(text_input, target_features, extra_tasks=None, model_name=None, azure=False,
                                      **kwargs):
    """
    Get text criteria analysis using OpenAI (standard or Azure). To analyze how well a given text
    matches a set of target characteristics.
    The model is asked to answer with a structured JSON object with overall match score,
    individual feature scores, and additional data sections.

    :param text_input: text to analyze
    :param target_features: list of target characteristics to evaluate
    :param extra_tasks: additional system instructions for extra analysis sections, either a single
                        string or a list of strings (optional)
    :param model_name: name of the OpenAI model to use
    :param azure: whether to use Azure OpenAI or standard OpenAI
    :param kwargs: additional parameters forwarded to openai_request
    :returns: response returned by openai_request
    """
    # Build prompt using base prompt and target features
    system_parts = [build_system_message(target_features)]
    if extra_tasks:
        if isinstance(extra_tasks, (list, tuple)):
            system_parts.extend(extra_tasks)
        else:
            system_parts.append(extra_tasks)
    # The extra tasks were previously collected but never sent. They are folded into a single
    # system message string so the request keeps the same argument type it always received.
    system_message = "\n\n".join(str(part) for part in system_parts)
    return openai_request(system_message, text_input, model_name, azure, **kwargs)
| 111 | + |
| 112 | + |
def get_text_criteria_analysis_sentence_transformers(text_input, target_features, extra_tasks=None,
                                                     model_name=None, azure=True, **kwargs):
    """
    Get text criteria analysis using Sentence Transformers. To analyze how well a given text
    matches a set of target characteristics.

    Each characteristic is scored with the cosine similarity between its embedding and the text
    embedding, rescaled from [-1, 1] to [0, 1]. The overall score is the mean of the rounded
    feature scores.

    :param text_input: text to analyze
    :param target_features: iterable of target characteristics to evaluate (must not be empty)
    :param extra_tasks: additional system messages for extra analysis sections (not used here, for compatibility)
    :param model_name: name of the Sentence Transformers model to use; when not given, it is read from the
                       'sentence_transformers_model' option of the [AI] config section ('all-mpnet-base-v2' by default)
    :param azure: whether to use Azure OpenAI or standard OpenAI (not used here, for compatibility)
    :param kwargs: additional parameters passed to the SentenceTransformer constructor
    :returns: dict with "overall_match" (float), "features" (list of {"name", "score"} dicts)
              and "data" (always an empty dict)
    :raises ValueError: if target_features is empty or None
    :raises ImportError: if sentence_transformers is not installed
    """
    # Materialize once so one-shot iterables (e.g. generators) can be both encoded and zipped below
    features = list(target_features) if target_features is not None else []
    if not features:
        # Fail early with a clear message instead of a ZeroDivisionError when averaging
        raise ValueError("target_features must contain at least one characteristic to evaluate")
    if SentenceTransformer is None:
        raise ImportError("Sentence Transformers is not installed. Please run 'pip install toolium[ai]'"
                          " to use Sentence Transformers features")
    config = DriverWrappersPool.get_default_wrapper().config
    model_name = model_name or config.get_optional('AI', 'sentence_transformers_model', 'all-mpnet-base-v2')
    model = SentenceTransformer(model_name, **kwargs)
    # Pre-compute feature embeddings
    feature_embs = model.encode(features, normalize_embeddings=True)
    # text_input embedding
    text_emb = model.encode(text_input, normalize_embeddings=True)
    # Computes cosine-similarities between the text and features tensors (range [-1, 1])
    sims = util.cos_sim(text_emb, feature_embs)[0].tolist()
    # Normalize each similarity from [-1, 1] to [0, 1] and build the contracted results
    results = [
        {"name": feature, "score": round((sim + 1.0) / 2.0, 2)}
        for feature, sim in zip(features, sims)
    ]

    # overall score as average of feature scores
    overall = sum(result["score"] for result in results) / len(results)

    return {
        "overall_match": round(overall, 2),
        "features": results,
        "data": {}
    }
0 commit comments