
Commit 757f51e

Authored by Salma Elshafey (salma-elshafey) and co-authors

Add Task Success Evaluator V0 (#42456)

* Add Task Success Evaluator V0
* Add samples for task success evaluator
* Run black
* Modify output format
* Modify output format in the examples
* Make Task Success a private preview evaluator
* Minor TaskSuccessEvaluator prompt update
* Fix path for importing Task Success Evaluator in samples
* Modify path for TaskSuccessEvaluator in eval mapping
* Remove sample notebook
* To retrigger build pipelines
* Add credential to TaskSuccessEvaluator
* Run Black
* To retrigger build pipeline
* Minor prompt modification
* Change tool_definitions type in TaskSuccess prompt
* Mark model grader tests as skip
* Remove task success evaluator from the samples notebook

Co-authored-by: Salma Elshafey <[email protected]>

1 parent 627a2ad · commit 757f51e

File tree

10 files changed: +574 additions, -9 deletions


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,7 @@
 
 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,
@@ -67,6 +68,7 @@
     SexualEvaluator: "sexual",
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
+    TaskSuccessEvaluator: "task_success",
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
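The two added lines above register the new evaluator in the evaluator-to-metric-name mapping under the key "task_success". As a rough sketch of what that registration pattern enables (the dictionary and helper names below are illustrative only, not the SDK's internal identifiers), a class-to-name map lets calling code resolve a metric label from an evaluator class:

    # Illustrative sketch of the class-to-name registration pattern shown in the diff above.
    # _EVAL_CLASS_MAP and resolve_metric_name are hypothetical names, not SDK internals.
    from typing import Dict

    class TaskAdherenceEvaluator: ...  # stand-ins for the real evaluator classes
    class TaskSuccessEvaluator: ...

    _EVAL_CLASS_MAP: Dict[type, str] = {
        TaskAdherenceEvaluator: "task_adherence",
        TaskSuccessEvaluator: "task_success",  # the entry added by this commit
    }

    def resolve_metric_name(evaluator_cls: type) -> str:
        # Look up the metric label for a registered evaluator class.
        return _EVAL_CLASS_MAP[evaluator_cls]

    print(resolve_metric_name(TaskSuccessEvaluator))  # -> task_success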
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._task_success import TaskSuccessEvaluator

__all__ = ["TaskSuccessEvaluator"]
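Because this new package `__init__.py` re-exports `TaskSuccessEvaluator`, the class can be imported from `azure.ai.evaluation._evaluators._task_success`, which is exactly the path the updated `_eval_mapping.py` uses. A minimal usage sketch based on the docstring example in the new module further down (the Azure OpenAI endpoint, deployment, and key are placeholders to replace with your own values; the evaluator is marked experimental, so the path and behavior may change):

    # Sketch only: configuration values are placeholders, and the import path reflects
    # this commit's private-preview location of the evaluator.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration
    from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
        azure_deployment="<your-deployment>",                       # placeholder
        api_key="<your-api-key>",                                   # placeholder
    )

    evaluator = TaskSuccessEvaluator(model_config)
    result = evaluator(
        query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
        response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
    )
    print(result)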
Lines changed: 168 additions & 0 deletions

@@ -0,0 +1,168 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
import math
import logging
from typing import Dict, Union, List, Optional

from typing_extensions import overload, override

from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
from azure.ai.evaluation._model_configurations import Message
from azure.ai.evaluation._common._experimental import experimental

logger = logging.getLogger(__name__)


@experimental
class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
    """The Task Success evaluator determines whether an AI agent successfully completed the requested task based on:

    - Final outcome and deliverable of the task
    - Completeness of task requirements

    This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.

    Scoring is binary:
    - TRUE: Task fully completed with usable deliverable that meets all user requirements
    - FALSE: Task incomplete, partially completed, or deliverable does not meet requirements

    The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.


    :param model_config: Configuration for the Azure OpenAI model.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]

    .. admonition:: Example:
        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
            :start-after: [START task_success_evaluator]
            :end-before: [END task_success_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call a TaskSuccessEvaluator with a query and response.

    .. admonition:: Example using Azure AI Project URL:

        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START task_success_evaluator]
            :end-before: [END task_success_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call TaskSuccessEvaluator using Azure AI Project URL in the following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    """

    _PROMPTY_FILE = "task_success.prompty"
    _RESULT_KEY = "task_success"
    _OPTIONAL_PARAMS = ["tool_definitions"]

    id = "azureai://built-in/evaluators/task_success"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(self, model_config, *, credential=None, **kwargs):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        super().__init__(
            model_config=model_config,
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
            credential=credential,
            **kwargs,
        )

    @overload
    def __call__(
        self,
        *,
        query: Union[str, List[dict]],
        response: Union[str, List[dict]],
        tool_definitions: Optional[Union[dict, List[dict]]] = None,
    ) -> Dict[str, Union[str, bool]]:
        """Evaluate task success for a given query, response, and optionally tool definitions.
        The query and response can be either a string or a list of messages.


        Example with string inputs and no tools:
            evaluator = TaskSuccessEvaluator(model_config)
            query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
            response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."

            result = evaluator(query=query, response=response)

        Example with list of messages:
            evaluator = TaskSuccessEvaluator(model_config)
            query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
            tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]

            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)

        :keyword query: The query being evaluated, either a string or a list of messages.
        :paramtype query: Union[str, List[dict]]
        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls)
        :paramtype response: Union[str, List[dict]]
        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
        :return: A dictionary with the task success evaluation results.
        :rtype: Dict[str, Union[str, bool]]
        """

    @override
    def __call__(  # pylint: disable=docstring-missing-param
        self,
        *args,
        **kwargs,
    ):
        """
        Invokes the instance using the overloaded __call__ signature.

        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
        """
        return super().__call__(*args, **kwargs)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]:  # type: ignore[override]
        """Do Task Success evaluation.
        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
        :type eval_input: Dict
        :return: The evaluation result.
        :rtype: Dict
        """
        # we override the _do_eval method as we want the output to be a dictionary,
        # which is a different schema than _base_prompty_eval.py
        if "query" not in eval_input and "response" not in eval_input:
            raise EvaluationException(
                message=f"Both query and response must be provided as input to the Task Success evaluator.",
                internal_message=f"Both query and response must be provided as input to the Task Success evaluator.",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.MISSING_FIELD,
                target=ErrorTarget.TASK_SUCCESS_EVALUATOR,
            )
        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
            eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)

        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
        if isinstance(llm_output, dict):
            success = llm_output.get("success", False)
            if isinstance(success, str):
                success = success.upper() == "TRUE"

            success_result = "pass" if success == True else "fail"
            reason = llm_output.get("explanation", "")
            return {
                f"{self._result_key}": success,
                f"{self._result_key}_result": success_result,
                f"{self._result_key}_reason": reason,
                f"{self._result_key}_details": llm_output.get("details", ""),
            }
        if logger:
            logger.warning("LLM output is not a dictionary, returning False for the success.")
        return {self._result_key: False}
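For reference, when the LLM output parses as a dictionary, the `_do_eval` override above returns four keys derived from `_RESULT_KEY`. A sketch of the shape callers can expect (the reason text below is invented purely to illustrate the structure):

    # Illustrative result shape based on the return statement in _do_eval above;
    # the values are examples, not output from a real evaluation run.
    result = {
        "task_success": True,            # bool parsed from the LLM's "success" field
        "task_success_result": "pass",   # "pass" when success is True, otherwise "fail"
        "task_success_reason": "The agent delivered a complete 3-day itinerary.",
        "task_success_details": "",      # any breakdown returned under "details", else ""
    }

    if result["task_success_result"] == "pass":
        print("Task completed:", result["task_success_reason"])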
