
Commit 1234486

[evaluation] task adherence prompty update (#43709)
* update prompty for task adherence evaluator
* fix formatting
* updates
1 parent 50d2424 commit 1234486

3 files changed: +170 -453 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

Lines changed: 86 additions & 57 deletions
@@ -13,7 +13,6 @@
 from ..._common.utils import (
     reformat_conversation_history,
     reformat_agent_response,
-    reformat_tool_definitions,
 )
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental
@@ -23,20 +22,18 @@
 
 @experimental
 class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on:
+    """The Task Adherence evaluator assesses whether an AI assistant's actions fully align with the user's intent
+    and fully achieve the intended goal across three dimensions:
 
-    - Alignment with instructions and definitions
-    - Accuracy and clarity of the response
-    - Proper use of provided tool definitions
+    - Goal adherence: Did the assistant achieve the user's objective within scope and constraints?
+    - Rule adherence: Did the assistant respect safety, privacy, authorization, and presentation contracts?
+    - Procedural adherence: Did the assistant follow required workflows, tool use, sequencing, and verification?
 
-    Scoring is based on five levels:
-    1. Fully Inadherent - Response completely ignores instructions.
-    2. Barely Adherent - Partial alignment with critical gaps.
-    3. Moderately Adherent - Meets core requirements but lacks precision.
-    4. Mostly Adherent - Clear and accurate with minor issues.
-    5. Fully Adherent - Flawless adherence to instructions.
+    The evaluator returns a boolean flag indicating whether there was any material failure in any dimension.
+    A material failure is an issue that makes the output unusable, creates verifiable risk, violates an explicit
+    constraint, or is a critical issue as defined in the evaluation dimensions.
 
-    The evaluation includes a step-by-step reasoning process, a brief explanation, and a final integer score.
+    The evaluation includes step-by-step reasoning and a flagged boolean result.
 
 
     :param model_config: Configuration for the Azure OpenAI model.
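
The docstring above replaces the old 1-5 score with a boolean "flagged" contract. As a minimal sketch of what that means for callers, the snippet below mirrors the flagged-to-score conversion added to _do_eval later in this diff; the helper name flagged_to_result is purely illustrative and not part of the commit.

# Illustrative sketch (not from the commit): how the boolean "flagged" value
# is folded into the numeric score and pass/fail result, mirroring _do_eval below.
def flagged_to_result(flagged: bool) -> dict:
    # 1.0 = no material failure (pass), 0.0 = material failure (fail)
    score = 0.0 if flagged else 1.0
    return {
        "task_adherence": score,
        "task_adherence_result": "fail" if flagged else "pass",
    }

assert flagged_to_result(False) == {"task_adherence": 1.0, "task_adherence_result": "pass"}
assert flagged_to_result(True) == {"task_adherence": 0.0, "task_adherence_result": "fail"}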
@@ -66,9 +63,9 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 
     _PROMPTY_FILE = "task_adherence.prompty"
     _RESULT_KEY = "task_adherence"
-    _OPTIONAL_PARAMS = ["tool_definitions"]
+    _OPTIONAL_PARAMS = []
 
-    _DEFAULT_TASK_ADHERENCE_SCORE = 3
+    _DEFAULT_TASK_ADHERENCE_SCORE = 0
 
     id = "azureai://built-in/evaluators/task_adherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
@@ -82,7 +79,6 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, cre
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
-            threshold=threshold,
             credential=credential,
             _higher_is_better=True,
             **kwargs,
@@ -96,33 +92,23 @@ def __call__(
         response: Union[str, List[dict]],
         tool_definitions: Optional[Union[dict, List[dict]]] = None,
     ) -> Dict[str, Union[str, float]]:
-        """Evaluate task adherence for a given query, response, and optional tool defintions.
-        The query and response can be either a string or a list of messages.
+        """Evaluate task adherence for a given query and response.
+        The query and response must be lists of messages in conversation format.
 
 
-        Example with string inputs and no tools:
-            evaluator = TaskAdherenceEvaluator(model_config)
-            query = "What is the weather today?"
-            response = "The weather is sunny."
-
-            result = evaluator(query=query, response=response)
-
         Example with list of messages:
             evaluator = TaskAdherenceEvaluator(model_config)
             query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
-            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
 
-            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+            result = evaluator(query=query, response=response)
 
-        :keyword query: The query being evaluated, either a string or a list of messages.
+        :keyword query: The query being evaluated, must be a list of messages including system and user messages.
         :paramtype query: Union[str, List[dict]]
-        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls)
+        :keyword response: The response being evaluated, must be a list of messages (full agent response including tool calls and results)
         :paramtype response: Union[str, List[dict]]
-        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
-        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
-        :return: A dictionary with the task adherence evaluation results.
-        :rtype: Dict[str, Union[str, float]]
+        :return: A dictionary with the task adherence evaluation results including flagged (bool) and reasoning (str).
+        :rtype: Dict[str, Union[str, float, bool]]
         """
 
     @override
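
For reference, a hedged end-to-end usage sketch of the updated __call__ contract. It assumes the public exports of azure.ai.evaluation (TaskAdherenceEvaluator, AzureOpenAIModelConfiguration); the endpoint, key, and deployment values are placeholders, the messages are a trimmed-down version of the docstring example, and the result keys shown are the ones assembled in _do_eval further down in this diff.

# Illustrative usage sketch; endpoint/deployment values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, TaskAdherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
    api_key="<api-key>",                                          # placeholder
    azure_deployment="<deployment-name>",                         # placeholder
)
evaluator = TaskAdherenceEvaluator(model_config)

query = [
    {"role": "system", "content": "You are a friendly and helpful customer service agent."},
    {"role": "user", "content": [{"type": "text", "text": "What is the status of the orders on my account #888?"}]},
]
response = [
    {"role": "assistant", "content": [{"type": "text", "text": "Order 123 has shipped; order 124 is delayed until March 20, 2025."}]},
]

result = evaluator(query=query, response=response)
print(result["task_adherence"])          # 1.0 (no material failure) or 0.0 (material failure)
print(result["task_adherence_result"])   # "pass" or "fail"
print(result["task_adherence_reason"])   # the model's reasoning string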
@@ -139,7 +125,7 @@ def __call__( # pylint: disable=docstring-missing-param
         return super().__call__(*args, **kwargs)
 
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]:  # type: ignore[override]
         """Do Task Adherence evaluation.
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
         :type eval_input: Dict
@@ -148,35 +134,83 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         """
         # we override the _do_eval method as we want the output to be a dictionary,
         # which is a different schema than _base_prompty_eval.py
-        if "query" not in eval_input and "response" not in eval_input:
+        if "query" not in eval_input or "response" not in eval_input:
             raise EvaluationException(
                 message=f"Both query and response must be provided as input to the Task Adherence evaluator.",
                 internal_message=f"Both query and response must be provided as input to the Task Adherence evaluator.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
             )
-        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
-        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
-        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
-            eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
 
-        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        # Reformat conversation history and extract system message
+        query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+        system_message = ""
+        user_query = ""
+
+        # Parse query messages to extract system message and user query
+        if isinstance(query_messages, list):
+            for msg in query_messages:
+                if isinstance(msg, dict) and msg.get("role") == "system":
+                    system_message = msg.get("content", "")
+                elif isinstance(msg, dict) and msg.get("role") == "user":
+                    user_query = msg.get("content", "")
+        elif isinstance(query_messages, str):
+            user_query = query_messages
+
+        # Reformat response and separate assistant messages from tool calls
+        response_messages = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        assistant_response = ""
+        tool_calls = ""
+
+        # Parse response messages to extract assistant response and tool calls
+        if isinstance(response_messages, list):
+            assistant_parts = []
+            tool_parts = []
+            for msg in response_messages:
+                if isinstance(msg, dict):
+                    role = msg.get("role", "")
+                    if role == "assistant":
+                        content = msg.get("content", "")
+                        if isinstance(content, list):
+                            for item in content:
+                                if isinstance(item, dict):
+                                    if item.get("type") == "text":
+                                        assistant_parts.append(item.get("text", ""))
+                                    elif item.get("type") == "tool_call":
+                                        tool_parts.append(str(item.get("tool_call", "")))
+                        else:
+                            assistant_parts.append(str(content))
+                    elif role == "tool":
+                        tool_parts.append(str(msg))
+            assistant_response = "\n".join(assistant_parts)
+            tool_calls = "\n".join(tool_parts)
+        elif isinstance(response_messages, str):
+            assistant_response = response_messages
+
+        # Prepare inputs for prompty
+        prompty_input = {
+            "system_message": system_message,
+            "query": user_query,
+            "response": assistant_response,
+            "tool_calls": tool_calls,
+        }
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
         llm_output = prompty_output_dict["llm_output"]
 
-        score = math.nan
         if isinstance(llm_output, dict):
-            score = float(llm_output.get("score", math.nan))
-            score_result = "pass" if score >= self._threshold else "fail"
-            reason = llm_output.get("explanation", "")
+            flagged = llm_output.get("flagged", False)
+            reasoning = llm_output.get("reasoning", "")
+            # Convert flagged to numeric score for backward compatibility (1 = pass, 0 = fail)
+            score = 0.0 if flagged else 1.0
+            score_result = "fail" if flagged else "pass"
+
             return {
                 f"{self._result_key}": score,
-                f"gpt_{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                # Uncomment the following line in the next iteration after UI contracts are validated.
-                # f"{self._result_key}_additional_details": llm_output
+                f"{self._result_key}_reason": reasoning,
+                f"{self._result_key}_details": llm_output.get("details", ""),
                 f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
                 f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
                 f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
@@ -185,13 +219,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
+
         if logger:
-            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
-        binary_result = self._get_binary_result(score)
-        return {
-            self._result_key: float(score),
-            f"gpt_{self._result_key}": float(score),
-            f"{self._result_key}_result": binary_result,
-            f"{self._result_key}_threshold": self._threshold,
-        }
+            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+
+        return {self._result_key: 0}
