
Commit 772ee5a

salma-elshafey (Salma Elshafey) authored
Rename Tool Call Accuracy Evaluator to Tool Call Quality (#43246)
* Rename Tool Call Accuracy Evaluator to Tool Call Quality
* To retrigger build pipeline

Co-authored-by: Salma Elshafey <[email protected]>

1 parent 05f1eae · commit 772ee5a

File tree: 11 files changed, +124 -119 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -30,7 +30,7 @@
 from ._evaluators._xpia import IndirectAttackEvaluator
 from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
 from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
-from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
+from ._evaluators._tool_call_quality import ToolCallQualityEvaluator
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._model_configurations import (
     AzureAIProject,
@@ -130,7 +130,8 @@ def lazy_import():
     "EvaluationResult",
     "CodeVulnerabilityEvaluator",
     "UngroundedAttributesEvaluator",
-    "ToolCallAccuracyEvaluator",
+    "ToolCallQualityEvaluator",
+    "ToolCallAccuracyEvaluator",  # Backward compatibility alias
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",
@@ -141,6 +142,9 @@ def lazy_import():

 __all__.extend([p for p in _patch_all if p not in __all__])

+# Backward compatibility alias
+ToolCallAccuracyEvaluator = ToolCallQualityEvaluator
+

 def __getattr__(name):
     """Handle lazy imports for optional dependencies."""

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 4 additions & 2 deletions
@@ -37,7 +37,8 @@
     SexualEvaluator,
     SimilarityEvaluator,
     TaskAdherenceEvaluator,
-    ToolCallAccuracyEvaluator,
+    ToolCallAccuracyEvaluator,  # Backward compatibility alias
+    ToolCallQualityEvaluator,
     UngroundedAttributesEvaluator,
     ViolenceEvaluator,
 )
@@ -69,7 +70,8 @@
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
     TaskCompletionEvaluator: "task_completion",
-    ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    ToolCallAccuracyEvaluator: "tool_call_quality",  # Backward compatibility
+    ToolCallQualityEvaluator: "tool_call_quality",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
 }
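One subtlety worth noting: because the alias makes `ToolCallAccuracyEvaluator` the same class object as `ToolCallQualityEvaluator`, the two dictionary entries above collapse into a single key at runtime. That is harmless here, since both entries map to the same string, and it means runs that still construct the evaluator through the legacy name report their metric as "tool_call_quality". A standalone illustration of the collapsing behavior (plain Python, no SDK required):

class ToolCallQualityEvaluator:  # stand-in for the real evaluator class
    pass

ToolCallAccuracyEvaluator = ToolCallQualityEvaluator  # backward-compatibility alias

metric_mapping = {
    ToolCallAccuracyEvaluator: "tool_call_quality",  # same key as the entry below
    ToolCallQualityEvaluator: "tool_call_quality",
}
print(len(metric_mapping))  # 1 -- aliased keys collapse into one dict entry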
Lines changed: 2 additions & 2 deletions
@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from ._tool_call_accuracy import ToolCallAccuracyEvaluator
+from ._tool_call_quality import ToolCallQualityEvaluator

 __all__ = [
-    "ToolCallAccuracyEvaluator",
+    "ToolCallQualityEvaluator",
 ]
Lines changed: 26 additions & 26 deletions
@@ -58,14 +58,14 @@ def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:


 @experimental
-class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
+class ToolCallQualityEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:
     - Relevance to the conversation.
     - Parameter correctness according to tool definitions.
     - Parameter value extraction from the conversation.

     The evaluator uses a scoring rubric of 1 to 5:
-        - Score 1: The tool calls are irrelevant
+        - Score 1: The tool calls are irrelevant.
         - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
         - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
         - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded.
@@ -82,20 +82,20 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     .. admonition:: Example:

     .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-        :start-after: [START tool_call_accuracy_evaluator]
-        :end-before: [END tool_call_accuracy_evaluator]
+        :start-after: [START tool_call_quality_evaluator]
+        :end-before: [END tool_call_quality_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call a ToolCallAccuracyEvaluator.
+        :caption: Initialize and call a ToolCallQualityEvaluator.

     .. admonition:: Example using Azure AI Project URL:

     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-        :start-after: [START tool_call_accuracy_evaluator]
-        :end-before: [END tool_call_accuracy_evaluator]
+        :start-after: [START tool_call_quality_evaluator]
+        :end-before: [END tool_call_quality_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+        :caption: Initialize and call ToolCallQualityEvaluator using Azure AI Project URL in the following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. note::
@@ -105,25 +105,25 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-    _PROMPTY_FILE = "tool_call_accuracy.prompty"
-    _RESULT_KEY = "tool_call_accuracy"
+    _PROMPTY_FILE = "tool_call_quality.prompty"
+    _RESULT_KEY = "tool_call_quality"

-    _MAX_TOOL_CALL_ACCURACY_SCORE = 5
-    _MIN_TOOL_CALL_ACCURACY_SCORE = 1
-    _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
+    _MAX_TOOL_CALL_QUALITY_SCORE = 5
+    _MIN_TOOL_CALL_QUALITY_SCORE = 1
+    _DEFAULT_TOOL_CALL_QUALITY_SCORE = 3

     _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
     _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
-    _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
+    _INVALID_SCORE_MESSAGE = "Tool call quality score must be between 1 and 5."

     _LLM_SCORE_KEY = "tool_calls_success_level"

-    id = "azureai://built-in/evaluators/tool_call_accuracy"
+    id = "azureai://built-in/evaluators/tool_call_quality"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_QUALITY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
@@ -241,11 +241,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
                 score,
-                ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
-                ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+                ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE,
+                ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE,
             ):
                 raise EvaluationException(
-                    message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                    message=f"Invalid score value: {score}. Expected a number in range [{ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE}, {ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE}].",
                     internal_message="Invalid score value.",
                     category=ErrorCategory.FAILED_EXECUTION,
                     blame=ErrorBlame.SYSTEM_ERROR,
@@ -266,10 +266,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

         else:
             raise EvaluationException(
-                message="Tool call accuracy evaluator returned invalid output.",
+                message="Tool call quality evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
                 category=ErrorCategory.FAILED_EXECUTION,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
             )

     async def _real_call(self, **kwargs):
@@ -346,30 +346,30 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
                         message=f"Tool definition for {tool_name} not found",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
-                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
                     )
                 else:
                     raise EvaluationException(
                         message=f"Tool call missing name: {tool_call}",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
-                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
                     )
             else:
                 # Unsupported tool format - only converter format is supported
                 raise EvaluationException(
                     message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
-                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
                 )
         else:
             # Tool call is not a dictionary
             raise EvaluationException(
                 message=f"Tool call is not a dictionary: {tool_call}",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
             )

         return needed_tool_definitions
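Two renamed details in this file matter to callers: `_RESULT_KEY` changes the name under which scores are reported, and the constructor's keyword-only `threshold` now defaults to `_DEFAULT_TOOL_CALL_QUALITY_SCORE` (3). A hedged construction sketch follows; how the threshold feeds into pass/fail reporting is not visible in this diff:

import os
from azure.ai.evaluation import ToolCallQualityEvaluator

# model_config shape taken from the samples further down this page.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

# threshold is keyword-only and defaults to 3 (_DEFAULT_TOOL_CALL_QUALITY_SCORE),
# so passing 4 tightens the cut-off; credential remains optional.
evaluator = ToolCallQualityEvaluator(model_config=model_config, threshold=4)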
Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ---
-name: Tool Call Accuracy
-description: Evaluates Tool Call Accuracy for tool used by agent
+name: Tool Call Quality
+description: Evaluates Tool Call Quality for tool used by agent
 model:
   api: chat
   parameters:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ class ErrorTarget(Enum):
     UNKNOWN = "Unknown"
     CONVERSATION = "Conversation"
     TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
+    TOOL_CALL_QUALITY_EVALUATOR = "ToolCallQualityEvaluator"
     RED_TEAM = "RedTeam"
     AOAI_GRADER = "AoaiGrader"
     CONVERSATION_HISTORY_PARSING = "_get_conversation_history"
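The old enum member is kept alongside the new one, so downstream code that matches on `ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR` still imports cleanly, even though the evaluator itself now raises with the new target (see the evaluator diff above). A quick check against the private module (`_exceptions` is internal and its path may change):

from azure.ai.evaluation._exceptions import ErrorTarget

print(ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR.value)  # "ToolCallAccuracyEvaluator"
print(ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR.value)   # "ToolCallQualityEvaluator"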

sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb renamed to sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_quality.ipynb

Lines changed: 11 additions & 11 deletions
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Tool Call Accuracy Evaluator"
+    "# Tool Call Quality Evaluator"
    ]
   },
   {
@@ -13,7 +13,7 @@
    "source": [
     "### Getting Started\n",
     "\n",
-    "This sample demonstrates how to use Tool Call Accuracy Evaluator\n",
+    "This sample demonstrates how to use Tool Call Quality Evaluator\n",
     "Before running the sample:\n",
     "```bash\n",
     "pip install azure-ai-projects azure-identity azure-ai-evaluation\n",
@@ -33,7 +33,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:\n",
+    "The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:\n",
     "- Relevance to the conversation\n",
     "- Parameter correctness according to tool definitions\n",
     "- Parameter value extraction from the conversation\n",
@@ -53,18 +53,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Tool Call Accuracy requires following input:\n",
+    "Tool Call Quality requires following input:\n",
     "- Query - This can be a single query or a list of messages(conversation history with agent). Latter helps to determine if Agent used the information in history to make right tool calls.\n",
     "- Tool Calls - Tool Call(s) made by Agent to answer the query. Optional - if response has tool calls, if not provided evaluator will look for tool calls in response.\n",
-    "- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Accuracy Evaluator will look at response for tool calls.\n",
+    "- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Quality Evaluator will look at response for tool calls.\n",
     "- Tool Definitions - Tool(s) definition used by Agent to answer the query. \n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Initialize Tool Call Accuracy Evaluator\n"
+    "### Initialize Tool Call Quality Evaluator\n"
    ]
   },
   {
@@ -74,7 +74,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "from azure.ai.evaluation import ToolCallAccuracyEvaluator , AzureOpenAIModelConfiguration\n",
+    "from azure.ai.evaluation import ToolCallQualityEvaluator , AzureOpenAIModelConfiguration\n",
     "from pprint import pprint\n",
     "\n",
     "model_config = AzureOpenAIModelConfiguration(\n",
@@ -85,7 +85,7 @@
     ")\n",
     "\n",
     "\n",
-    "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)"
+    "tool_call_quality = ToolCallQualityEvaluator(model_config=model_config)"
    ]
   },
   {
@@ -140,7 +140,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "response = tool_call_accuracy(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n",
+    "response = tool_call_quality(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n",
     "pprint(response)"
    ]
   },
@@ -197,7 +197,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "response = tool_call_accuracy(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n",
+    "response = tool_call_quality(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n",
    "pprint(response)"
    ]
   },
@@ -206,7 +206,7 @@
    "metadata": {},
    "source": [
     "#### Tool Calls passed as part of `Response` (common for agent case)\n",
-    "- Tool Call Accuracy Evaluator extracts tool calls from response"
+    "- Tool Call Quality Evaluator extracts tool calls from response"
    ]
   },
   {
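The notebook's input cells (`query`, `tool_call`, `tool_definition`) are collapsed in this view. The sketch below shows plausible shapes for them; every field name here is an assumption modeled on the converter format the evaluator expects, not copied from the hidden cells:

# Hypothetical inputs -- shapes assumed, not taken from the collapsed cells.
query = "How is the weather in New York?"

tool_call = {
    "type": "tool_call",
    "tool_call_id": "call_001",
    "name": "fetch_weather",
    "arguments": {"location": "New York"},
}

tool_definition = {
    "name": "fetch_weather",
    "description": "Fetches the weather information for the specified location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {"type": "string", "description": "The location to fetch weather for."}
        },
    },
}

# tool_call_quality is the evaluator instance created in the notebook cell above.
response = tool_call_quality(query=query, tool_calls=tool_call, tool_definitions=tool_definition)
# The score surfaces under the renamed result key (see _RESULT_KEY in the evaluator diff).
print(response["tool_call_quality"])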

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py

Lines changed: 5 additions & 5 deletions
@@ -541,18 +541,18 @@ def evaluation_evaluate_classes_methods(self):
         )
         # [END groundedness_pro_evaluator]

-        # [START tool_call_accuracy_evaluator]
+        # [START tool_call_quality_evaluator]
         import os
-        from azure.ai.evaluation import ToolCallAccuracyEvaluator
+        from azure.ai.evaluation import ToolCallQualityEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
-        tool_call_accuracy_evaluator(
+        tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)
+        tool_call_quality_evaluator(
             query="How is the weather in New York?",
             response="The weather in New York is sunny.",
             tool_calls={
@@ -573,7 +573,7 @@ def evaluation_evaluate_classes_methods(self):
                 },
             },
         )
-        # [END tool_call_accuracy_evaluator]
+        # [END tool_call_quality_evaluator]

         # [START path_efficiency_evaluator]
         from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py

Lines changed: 5 additions & 5 deletions
@@ -547,18 +547,18 @@ def evaluation_evaluate_classes_methods(self):
         )
         # [END groundedness_pro_evaluator]

-        # [START tool_call_accuracy_evaluator]
+        # [START tool_call_quality_evaluator]
         import os
-        from azure.ai.evaluation import ToolCallAccuracyEvaluator
+        from azure.ai.evaluation import ToolCallQualityEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),  # https://<account_name>.services.ai.azure.com
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
-        tool_call_accuracy_evaluator(
+        tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)
+        tool_call_quality_evaluator(
             query="How is the weather in New York?",
             response="The weather in New York is sunny.",
             tool_calls={
@@ -579,7 +579,7 @@ def evaluation_evaluate_classes_methods(self):
                 },
             },
         )
-        # [END tool_call_accuracy_evaluator]
+        # [END tool_call_quality_evaluator]

         # [START path_efficiency_evaluator]
         from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator
