
Commit 05f1eae

Authored by Salma Elshafey (salma-elshafey) and a co-author
Rename Task Success Evaluator to Task Completion Evaluator (#43190)
* Add Task Success Evaluator V0
* Add samples for task success evaluator
* Run black
* Modify output format
* Modify output format in the examples
* Make Task Success a private preview evaluator
* Minor TaskSuccessEvaluator prompt update
* Fix path for importing Task Success Evaluator in samples
* Modify path for TaskSuccessEvaluator in eval mapping
* Remove sample notebook
* To retrigger build pipelines
* Add credential to TaskSuccessEvaluator
* Run Black
* To retrigger build pipeline
* Minor prompt modification
* Change tool_definitions type in TaskSuccess prompt
* Mark model grader tests as skip
* Remove task success evaluator from the samples notebook
* Rename Task Success to Task Completion
* Minor definition modification
* Minor rename
* remove task_success
* Fix merge issue

Co-authored-by: Salma Elshafey <[email protected]>
1 parent 07395dd commit 05f1eae

File tree: 8 files changed, +39 −39 lines

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 2 additions & 2 deletions

@@ -11,7 +11,7 @@

 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
-from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
+from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,
@@ -68,7 +68,7 @@
     SexualEvaluator: "sexual",
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
-    TaskSuccessEvaluator: "task_success",
+    TaskCompletionEvaluator: "task_completion",
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
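
The second hunk keeps the class-to-name mapping aligned with the rename, so results are reported under "task_completion" instead of "task_success". As a rough illustration of how such a mapping can be consumed, here is a minimal sketch that uses a stand-in dictionary built only from entries visible in this hunk; the real mapping in _eval_mapping.py is larger and its variable name is not shown in this diff.

# Minimal sketch (stand-in dictionary, not the real contents of _eval_mapping.py):
# map evaluator classes to the short names used when reporting results.
from azure.ai.evaluation import TaskAdherenceEvaluator
from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

EVALUATOR_NAMES = {
    TaskAdherenceEvaluator: "task_adherence",
    TaskCompletionEvaluator: "task_completion",
}

def evaluator_name(evaluator: object) -> str:
    # Fall back to the class name for evaluators missing from the mapping.
    return EVALUATOR_NAMES.get(type(evaluator), type(evaluator).__name__)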
Lines changed: 2 additions & 2 deletions

@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from ._task_success import TaskSuccessEvaluator
+from ._task_completion import TaskCompletionEvaluator

-__all__ = ["TaskSuccessEvaluator"]
+__all__ = ["TaskCompletionEvaluator"]
Lines changed: 20 additions & 20 deletions

@@ -18,8 +18,8 @@


 @experimental
-class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
-    """The Task Success evaluator determines whether an AI agent successfully completed the requested task based on:
+class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
+    """The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:

     - Final outcome and deliverable of the task
     - Completeness of task requirements
@@ -39,29 +39,29 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):

     .. admonition:: Example:
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START task_success_evaluator]
-            :end-before: [END task_success_evaluator]
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a TaskSuccessEvaluator with a query and response.
+            :caption: Initialize and call a TaskCompletionEvaluator with a query and response.

     .. admonition:: Example using Azure AI Project URL:

         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START task_success_evaluator]
-            :end-before: [END task_success_evaluator]
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call TaskSuccessEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     """

-    _PROMPTY_FILE = "task_success.prompty"
-    _RESULT_KEY = "task_success"
+    _PROMPTY_FILE = "task_completion.prompty"
+    _RESULT_KEY = "task_completion"
     _OPTIONAL_PARAMS = ["tool_definitions"]

-    id = "azureai://built-in/evaluators/task_success"
+    id = "azureai://built-in/evaluators/task_completion"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
@@ -84,19 +84,19 @@ def __call__(
         response: Union[str, List[dict]],
         tool_definitions: Optional[Union[dict, List[dict]]] = None,
     ) -> Dict[str, Union[str, bool]]:
-        """Evaluate task success for a given query, response, and optionally tool definitions.
+        """Evaluate task completion for a given query, response, and optionally tool definitions.
         The query and response can be either a string or a list of messages.


         Example with string inputs and no tools:
-            evaluator = TaskSuccessEvaluator(model_config)
+            evaluator = TaskCompletionEvaluator(model_config)
             query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
             response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."

             result = evaluator(query=query, response=response)

         Example with list of messages:
-            evaluator = TaskSuccessEvaluator(model_config)
+            evaluator = TaskCompletionEvaluator(model_config)
             query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
             tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]
@@ -109,7 +109,7 @@ def __call__(
         :paramtype response: Union[str, List[dict]]
         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
-        :return: A dictionary with the task success evaluation results.
+        :return: A dictionary with the task completion evaluation results.
         :rtype: Dict[str, Union[str, bool]]
         """

@@ -128,7 +128,7 @@ def __call__( # pylint: disable=docstring-missing-param

     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]:  # type: ignore[override]
-        """Do Task Success evaluation.
+        """Do Task Completion evaluation.
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
         :type eval_input: Dict
         :return: The evaluation result.
@@ -138,11 +138,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         # which is a different schema than _base_prompty_eval.py
         if "query" not in eval_input and "response" not in eval_input:
             raise EvaluationException(
-                message=f"Both query and response must be provided as input to the Task Success evaluator.",
-                internal_message=f"Both query and response must be provided as input to the Task Success evaluator.",
+                message=f"Both query and response must be provided as input to the Task Completion evaluator.",
+                internal_message=f"Both query and response must be provided as input to the Task Completion evaluator.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TASK_SUCCESS_EVALUATOR,
+                target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
             )
         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
@@ -155,7 +155,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         if isinstance(success, str):
             success = success.upper() == "TRUE"

-        success_result = "pass" if success == True else "fail"
+        success_result = "pass" if success else "fail"
         reason = llm_output.get("explanation", "")
         return {
             f"{self._result_key}": success,
Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 ---
-name: Task Success
+name: Task Completion
 description: Evaluates whether a task was successfully completed
 model:
   api: chat
@@ -27,7 +27,7 @@ You are an expert evaluator who determines if an agent has successfully complete
 user:
 ROLE
 ====
-You are a judge on Task Success who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**
+You are a judge on Task Completion who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**

 You are NOT evaluating:
 - How well the agent followed instructions
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ class ErrorTarget(Enum):
     FLUENCY_EVALUATOR = "FluencyEvaluator"
     RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
     TASK_ADHERENCE_EVALUATOR = "TaskAdherenceEvaluator"
-    TASK_SUCCESS_EVALUATOR = "TaskSuccessEvaluator"
+    TASK_COMPLETION_EVALUATOR = "TaskCompletionEvaluator"
     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
     INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
     ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
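
The renamed enum member is what the evaluator passes as the target when it rejects missing inputs (see the _do_eval hunk above). A minimal sketch of that error path follows; it assumes EvaluationException, ErrorBlame, and ErrorCategory are importable from the same _exceptions module as ErrorTarget, which only ErrorTarget's definition confirms here.

from azure.ai.evaluation._exceptions import (  # import path assumed for all but ErrorTarget
    ErrorBlame,
    ErrorCategory,
    ErrorTarget,
    EvaluationException,
)

def require_query_and_response(eval_input: dict) -> None:
    # Mirrors the validation in TaskCompletionEvaluator._do_eval shown above.
    if "query" not in eval_input and "response" not in eval_input:
        raise EvaluationException(
            message="Both query and response must be provided as input to the Task Completion evaluator.",
            internal_message="Both query and response must be provided as input to the Task Completion evaluator.",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.MISSING_FIELD,
            target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
        )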

sdk/evaluation/azure-ai-evaluation/cspell.json

Lines changed: 2 additions & 2 deletions

@@ -33,8 +33,8 @@
     "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty",
     "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_data_sources/grounding.json",
     "sdk/evaluation/azure-ai-evaluation/samples/data/evaluate_test_data.jsonl",
-    "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/task_success.prompty",
-    "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_success/_task_success.py"
+    "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty",
+    "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py"
   ],
   "words": [
     "Aoai",

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py

Lines changed: 5 additions & 5 deletions

@@ -422,17 +422,17 @@ def evaluation_evaluate_classes_methods(self):
         task_adherence_evaluator(query=query, response=response, tool_definitions=tool_definitions)
         # [END task_adherence_evaluator]

-        # [START task_success_evaluator]
+        # [START task_completion_evaluator]
         import os
-        from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
+        from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        task_success_evaluator = TaskSuccessEvaluator(model_config=model_config)
+        task_completion_evaluator = TaskCompletionEvaluator(model_config=model_config)

         query = [
             {"role": "system", "content": "You are a travel booking assistant. Help users find and book flights."},
@@ -499,8 +499,8 @@ def evaluation_evaluate_classes_methods(self):
             }
         ]

-        task_success_evaluator(query=query, response=response, tool_definitions=tool_definitions)
-        # [END task_success_evaluator]
+        task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)
+        # [END task_completion_evaluator]

         # [START indirect_attack_evaluator]
         import os
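
The sample calls the evaluator but discards its return value. As a hedged follow-up to the sample above (reusing its query, response, and tool_definitions), the verdict can be checked like this; only the boolean under the renamed result key "task_completion" is visible in this commit, so any companion keys such as a pass/fail label or explanation are assumptions.

# Continues the sample above; query, response, and tool_definitions are already defined.
result = task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)

if result.get("task_completion"):  # boolean keyed by the renamed _RESULT_KEY
    print("Task completed")
else:
    print("Task not completed:", result)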

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py

Lines changed: 5 additions & 5 deletions

@@ -432,17 +432,17 @@ def evaluation_evaluate_classes_methods(self):
         task_adherence_evaluator(query=query, response=response, tool_definitions=tool_definitions)
         # [END task_adherence_evaluator]

-        # [START task_success_evaluator]
+        # [START task_completion_evaluator]
         import os
-        from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
+        from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),  # https://<account_name>.services.ai.azure.com
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        task_success_evaluator = TaskSuccessEvaluator(model_config=model_config)
+        task_completion_evaluator = TaskCompletionEvaluator(model_config=model_config)

         query = [
             {"role": "system", "content": "You are a travel booking assistant. Help users find and book flights."},
@@ -509,8 +509,8 @@ def evaluation_evaluate_classes_methods(self):
             }
         ]

-        task_success_evaluator(query=query, response=response, tool_definitions=tool_definitions)
-        # [END task_success_evaluator]
+        task_completion_evaluator(query=query, response=response, tool_definitions=tool_definitions)
+        # [END task_completion_evaluator]

         # [START indirect_attack_evaluator]
         import os
