
Commit 772ee5a

salma-elshafey (Salma Elshafey) authored
Rename Tool Call Accuracy Evaluator to Tool Call Quality (#43246)
* Rename Tool Call Accuracy Evaluator to Tool Call Quality
* To retrigger build pipeline

Co-authored-by: Salma Elshafey <[email protected]>

1 parent 05f1eae · commit 772ee5a

File tree: 11 files changed, +124 -119 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -30,7 +30,7 @@
 from ._evaluators._xpia import IndirectAttackEvaluator
 from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
 from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
-from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
+from ._evaluators._tool_call_quality import ToolCallQualityEvaluator
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._model_configurations import (
     AzureAIProject,
@@ -130,7 +130,8 @@ def lazy_import():
     "EvaluationResult",
     "CodeVulnerabilityEvaluator",
     "UngroundedAttributesEvaluator",
-    "ToolCallAccuracyEvaluator",
+    "ToolCallQualityEvaluator",
+    "ToolCallAccuracyEvaluator",  # Backward compatibility alias
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",
@@ -141,6 +142,9 @@ def lazy_import():

 __all__.extend([p for p in _patch_all if p not in __all__])

+# Backward compatibility alias
+ToolCallAccuracyEvaluator = ToolCallQualityEvaluator
+

 def __getattr__(name):
     """Handle lazy imports for optional dependencies."""

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 4 additions & 2 deletions
@@ -37,7 +37,8 @@
     SexualEvaluator,
     SimilarityEvaluator,
     TaskAdherenceEvaluator,
-    ToolCallAccuracyEvaluator,
+    ToolCallAccuracyEvaluator,  # Backward compatibility alias
+    ToolCallQualityEvaluator,
     UngroundedAttributesEvaluator,
     ViolenceEvaluator,
 )
@@ -69,7 +70,8 @@
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
     TaskCompletionEvaluator: "task_completion",
-    ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    ToolCallAccuracyEvaluator: "tool_call_quality",  # Backward compatibility
+    ToolCallQualityEvaluator: "tool_call_quality",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
 }
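One subtlety worth noting: because the alias makes `ToolCallAccuracyEvaluator` the same class object as `ToolCallQualityEvaluator`, the two dictionary entries above collapse into a single key at runtime. That is harmless here, since both entries map to the same string, and it means runs that still construct the evaluator through the legacy name report their metric as "tool_call_quality". A standalone illustration of the collapsing behavior (plain Python, no SDK required):

class ToolCallQualityEvaluator:  # stand-in for the real evaluator class
    pass

ToolCallAccuracyEvaluator = ToolCallQualityEvaluator  # backward-compatibility alias

metric_mapping = {
    ToolCallAccuracyEvaluator: "tool_call_quality",  # same key as the entry below
    ToolCallQualityEvaluator: "tool_call_quality",
}
print(len(metric_mapping))  # 1 -- aliased keys collapse into one dict entry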
Lines changed: 2 additions & 2 deletions
@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from ._tool_call_accuracy import ToolCallAccuracyEvaluator
+from ._tool_call_quality import ToolCallQualityEvaluator

 __all__ = [
-    "ToolCallAccuracyEvaluator",
+    "ToolCallQualityEvaluator",
 ]
Lines changed: 26 additions & 26 deletions
@@ -58,14 +58,14 @@ def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:


 @experimental
-class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
+class ToolCallQualityEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:
     - Relevance to the conversation.
     - Parameter correctness according to tool definitions.
     - Parameter value extraction from the conversation.

     The evaluator uses a scoring rubric of 1 to 5:
-        - Score 1: The tool calls are irrelevant
+        - Score 1: The tool calls are irrelevant.
         - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
         - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
         - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded.
@@ -82,20 +82,20 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     .. admonition:: Example:

     .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-        :start-after: [START tool_call_accuracy_evaluator]
-        :end-before: [END tool_call_accuracy_evaluator]
+        :start-after: [START tool_call_quality_evaluator]
+        :end-before: [END tool_call_quality_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call a ToolCallAccuracyEvaluator.
+        :caption: Initialize and call a ToolCallQualityEvaluator.

     .. admonition:: Example using Azure AI Project URL:

     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-        :start-after: [START tool_call_accuracy_evaluator]
-        :end-before: [END tool_call_accuracy_evaluator]
+        :start-after: [START tool_call_quality_evaluator]
+        :end-before: [END tool_call_quality_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+        :caption: Initialize and call ToolCallQualityEvaluator using Azure AI Project URL in the following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. note::
@@ -105,25 +105,25 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-    _PROMPTY_FILE = "tool_call_accuracy.prompty"
-    _RESULT_KEY = "tool_call_accuracy"
+    _PROMPTY_FILE = "tool_call_quality.prompty"
+    _RESULT_KEY = "tool_call_quality"

-    _MAX_TOOL_CALL_ACCURACY_SCORE = 5
-    _MIN_TOOL_CALL_ACCURACY_SCORE = 1
-    _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
+    _MAX_TOOL_CALL_QUALITY_SCORE = 5
+    _MIN_TOOL_CALL_QUALITY_SCORE = 1
+    _DEFAULT_TOOL_CALL_QUALITY_SCORE = 3

     _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
     _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
-    _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
+    _INVALID_SCORE_MESSAGE = "Tool call quality score must be between 1 and 5."

     _LLM_SCORE_KEY = "tool_calls_success_level"

-    id = "azureai://built-in/evaluators/tool_call_accuracy"
+    id = "azureai://built-in/evaluators/tool_call_quality"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_QUALITY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
@@ -241,11 +241,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
                 score,
-                ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
-                ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+                ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE,
+                ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE,
             ):
                 raise EvaluationException(
-                    message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                    message=f"Invalid score value: {score}. Expected a number in range [{ToolCallQualityEvaluator._MIN_TOOL_CALL_QUALITY_SCORE}, {ToolCallQualityEvaluator._MAX_TOOL_CALL_QUALITY_SCORE}].",
                     internal_message="Invalid score value.",
                     category=ErrorCategory.FAILED_EXECUTION,
                     blame=ErrorBlame.SYSTEM_ERROR,
@@ -266,10 +266,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

         else:
             raise EvaluationException(
-                message="Tool call accuracy evaluator returned invalid output.",
+                message="Tool call quality evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
                 category=ErrorCategory.FAILED_EXECUTION,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
             )

     async def _real_call(self, **kwargs):
@@ -346,30 +346,30 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
                         message=f"Tool definition for {tool_name} not found",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
-                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
                     )
                 else:
                     raise EvaluationException(
                         message=f"Tool call missing name: {tool_call}",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
-                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
                     )
             else:
                 # Unsupported tool format - only converter format is supported
                 raise EvaluationException(
                     message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
-                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
                 )
         else:
             # Tool call is not a dictionary
             raise EvaluationException(
                 message=f"Tool call is not a dictionary: {tool_call}",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                target=ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR,
             )

         return needed_tool_definitions
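Two renamed details in this file matter to callers: `_RESULT_KEY` changes the name under which scores are reported, and the constructor's keyword-only `threshold` now defaults to `_DEFAULT_TOOL_CALL_QUALITY_SCORE` (3). A hedged construction sketch follows; how the threshold feeds into pass/fail reporting is not visible in this diff:

import os
from azure.ai.evaluation import ToolCallQualityEvaluator

# model_config shape taken from the samples further down this page.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

# threshold is keyword-only and defaults to 3 (_DEFAULT_TOOL_CALL_QUALITY_SCORE),
# so passing 4 tightens the cut-off; credential remains optional.
evaluator = ToolCallQualityEvaluator(model_config=model_config, threshold=4)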
Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ---
-name: Tool Call Accuracy
-description: Evaluates Tool Call Accuracy for tool used by agent
+name: Tool Call Quality
+description: Evaluates Tool Call Quality for tool used by agent
 model:
   api: chat
   parameters:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ class ErrorTarget(Enum):
     UNKNOWN = "Unknown"
     CONVERSATION = "Conversation"
     TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
+    TOOL_CALL_QUALITY_EVALUATOR = "ToolCallQualityEvaluator"
     RED_TEAM = "RedTeam"
     AOAI_GRADER = "AoaiGrader"
     CONVERSATION_HISTORY_PARSING = "_get_conversation_history"
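The old enum member is kept alongside the new one, so downstream code that matches on `ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR` still imports cleanly, even though the evaluator itself now raises with the new target (see the evaluator diff above). A quick check against the private module (`_exceptions` is internal and its path may change):

from azure.ai.evaluation._exceptions import ErrorTarget

print(ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR.value)  # "ToolCallAccuracyEvaluator"
print(ErrorTarget.TOOL_CALL_QUALITY_EVALUATOR.value)   # "ToolCallQualityEvaluator"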

sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_accuracy.ipynb renamed to sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/tool_call_quality.ipynb

Lines changed: 11 additions & 11 deletions
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Tool Call Accuracy Evaluator"
+    "# Tool Call Quality Evaluator"
    ]
   },
   {
@@ -13,7 +13,7 @@
    "source": [
     "### Getting Started\n",
     "\n",
-    "This sample demonstrates how to use Tool Call Accuracy Evaluator\n",
+    "This sample demonstrates how to use Tool Call Quality Evaluator\n",
     "Before running the sample:\n",
     "```bash\n",
     "pip install azure-ai-projects azure-identity azure-ai-evaluation\n",
@@ -33,7 +33,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:\n",
+    "The Tool Call Quality evaluator assesses how accurately an AI uses tools by examining:\n",
     "- Relevance to the conversation\n",
     "- Parameter correctness according to tool definitions\n",
     "- Parameter value extraction from the conversation\n",
@@ -53,18 +53,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Tool Call Accuracy requires following input:\n",
+    "Tool Call Quality requires following input:\n",
     "- Query - This can be a single query or a list of messages(conversation history with agent). Latter helps to determine if Agent used the information in history to make right tool calls.\n",
     "- Tool Calls - Tool Call(s) made by Agent to answer the query. Optional - if response has tool calls, if not provided evaluator will look for tool calls in response.\n",
-    "- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Accuracy Evaluator will look at response for tool calls.\n",
+    "- Response - (Optional)Response from Agent (or any GenAI App). This can be a single text response or a list or messages generated as part of Agent Response. If tool calls are not provide Tool Call Quality Evaluator will look at response for tool calls.\n",
     "- Tool Definitions - Tool(s) definition used by Agent to answer the query. \n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Initialize Tool Call Accuracy Evaluator\n"
+    "### Initialize Tool Call Quality Evaluator\n"
    ]
   },
   {
@@ -74,7 +74,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "from azure.ai.evaluation import ToolCallAccuracyEvaluator , AzureOpenAIModelConfiguration\n",
+    "from azure.ai.evaluation import ToolCallQualityEvaluator , AzureOpenAIModelConfiguration\n",
     "from pprint import pprint\n",
     "\n",
     "model_config = AzureOpenAIModelConfiguration(\n",
@@ -85,7 +85,7 @@
     ")\n",
     "\n",
     "\n",
-    "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)"
+    "tool_call_quality = ToolCallQualityEvaluator(model_config=model_config)"
    ]
   },
   {
@@ -140,7 +140,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "response = tool_call_accuracy(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n",
+    "response = tool_call_quality(query=query, tool_calls=tool_call, tool_definitions=tool_definition)\n",
     "pprint(response)"
    ]
   },
@@ -197,7 +197,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "response = tool_call_accuracy(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n",
+    "response = tool_call_quality(query=query, tool_calls=tool_calls, tool_definitions=tool_definition)\n",
    "pprint(response)"
    ]
   },
@@ -206,7 +206,7 @@
    "metadata": {},
    "source": [
     "#### Tool Calls passed as part of `Response` (common for agent case)\n",
-    "- Tool Call Accuracy Evaluator extracts tool calls from response"
+    "- Tool Call Quality Evaluator extracts tool calls from response"
    ]
   },
   {
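The notebook's input cells (`query`, `tool_call`, `tool_definition`) are collapsed in this view. The sketch below shows plausible shapes for them; every field name here is an assumption modeled on the converter format the evaluator expects, not copied from the hidden cells:

# Hypothetical inputs -- shapes assumed, not taken from the collapsed cells.
query = "How is the weather in New York?"

tool_call = {
    "type": "tool_call",
    "tool_call_id": "call_001",
    "name": "fetch_weather",
    "arguments": {"location": "New York"},
}

tool_definition = {
    "name": "fetch_weather",
    "description": "Fetches the weather information for the specified location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {"type": "string", "description": "The location to fetch weather for."}
        },
    },
}

# tool_call_quality is the evaluator instance created in the notebook cell above.
response = tool_call_quality(query=query, tool_calls=tool_call, tool_definitions=tool_definition)
# The score surfaces under the renamed result key (see _RESULT_KEY in the evaluator diff).
print(response["tool_call_quality"])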

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py

Lines changed: 5 additions & 5 deletions
@@ -541,18 +541,18 @@ def evaluation_evaluate_classes_methods(self):
         )
         # [END groundedness_pro_evaluator]

-        # [START tool_call_accuracy_evaluator]
+        # [START tool_call_quality_evaluator]
         import os
-        from azure.ai.evaluation import ToolCallAccuracyEvaluator
+        from azure.ai.evaluation import ToolCallQualityEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
-        tool_call_accuracy_evaluator(
+        tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)
+        tool_call_quality_evaluator(
             query="How is the weather in New York?",
             response="The weather in New York is sunny.",
             tool_calls={
@@ -573,7 +573,7 @@ def evaluation_evaluate_classes_methods(self):
                 },
             },
         )
-        # [END tool_call_accuracy_evaluator]
+        # [END tool_call_quality_evaluator]

         # [START path_efficiency_evaluator]
         from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py

Lines changed: 5 additions & 5 deletions
@@ -547,18 +547,18 @@ def evaluation_evaluate_classes_methods(self):
         )
         # [END groundedness_pro_evaluator]

-        # [START tool_call_accuracy_evaluator]
+        # [START tool_call_quality_evaluator]
         import os
-        from azure.ai.evaluation import ToolCallAccuracyEvaluator
+        from azure.ai.evaluation import ToolCallQualityEvaluator

         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),  # https://<account_name>.services.ai.azure.com
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

-        tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
-        tool_call_accuracy_evaluator(
+        tool_call_quality_evaluator = ToolCallQualityEvaluator(model_config=model_config)
+        tool_call_quality_evaluator(
             query="How is the weather in New York?",
             response="The weather in New York is sunny.",
             tool_calls={
@@ -579,7 +579,7 @@ def evaluation_evaluate_classes_methods(self):
                 },
             },
         )
-        # [END tool_call_accuracy_evaluator]
+        # [END tool_call_quality_evaluator]

         # [START path_efficiency_evaluator]
         from azure.ai.evaluation._evaluators._path_efficiency import PathEfficiencyEvaluator
