@@ -214,12 +214,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         score = math.nan
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-            return {
-                self._result_key: bool(float(score)),
-                f"{self._result_key}_reason": reason,
-                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
-            }
-        return {self._result_key: float(score)}
+            if score >= 0 and score <= 1:
+                return {
+                    self._result_key: bool(float(score)),
+                    f"{self._result_key}_reason": reason,
+                    "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+                }
+        raise EvaluationException(
+            message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+        )
 
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -231,13 +237,55 @@ async def _real_call(self, **kwargs):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        if len(eval_input_list) == 0:
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "No tool calls were made.",
+                    "per_tool_call_details": []
+                    }
+
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-            per_turn_results.append(await self._do_eval(eval_input))
+            if self._is_applicable_tool(eval_input):
+                per_turn_results.append(await self._do_eval(eval_input))
+            else:
+                per_turn_results.append(self._not_applicable_result(eval_input))
 
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    def _is_applicable_tool(self, eval_input):
+        """Determine if a given tool should be evaluated, since we only evaluate tools that
+        have sufficient context available.
+
+        :type eval_input: Dict
+        :return: True if the tool call should be evaluated
+        :rtype: bool
+        """
+        tool_definition = eval_input.get("tool_definition")
+        if tool_definition is None or len(tool_definition) != 1:
+            return False
+        tool_type = tool_definition[0].get("type")
+        if tool_type is None or tool_type != "function":
+            return False
+        return True
+
+    def _not_applicable_result(self, eval_input):
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return {
+            f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_reason": "Tool call not supported for evaluation",
+            "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+        }
+
     def _aggregate_results(self, per_turn_results):
         """Aggregate the evaluation results of each conversation turn into a single result.
 
@@ -260,11 +308,23 @@ def _aggregate_results(self, per_turn_results):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
+        num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+                             if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+        if num_evaluated == 0:
+            # None of the invoked tools were applicable, return not applicable result
+            # (If a tool fails evaluation, we'll throw an exception)
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                    "per_tool_call_details": []
+                    }
+        # ignore not_applicable results, where the _result_key will be "not applicable"
+        score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
         aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated
 
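For reference, a minimal standalone sketch of the behavior this change introduces: tool calls whose definitions are missing or not of type "function" are treated as not applicable, and the aggregate score is computed only over the evaluated calls. The literal strings "not applicable", "pass", and "fail", the "tool_call_accuracy" key names, and the 0.8 threshold below are assumptions standing in for the class constants (_NOT_APPLICABLE_RESULT, _PASS_RESULT, _FAIL_RESULT, _AGGREGATE_RESULT_KEY, self.threshold), whose definitions are outside this diff.

```python
# Illustrative sketch only -- mirrors the logic added in this commit, with
# class constants replaced by assumed literal values.
NOT_APPLICABLE = "not applicable"  # assumed value of _NOT_APPLICABLE_RESULT
THRESHOLD = 0.8                    # assumed default threshold


def is_applicable_tool(tool_definition):
    # Only a single matching definition of type "function" is evaluable.
    if tool_definition is None or len(tool_definition) != 1:
        return False
    return tool_definition[0].get("type") == "function"


def aggregate(per_tool_call_results, threshold=THRESHOLD):
    # Not-applicable results are excluded from the denominator.
    evaluated = [r for r in per_tool_call_results if r != NOT_APPLICABLE]
    if not evaluated:
        return {
            "tool_call_accuracy": NOT_APPLICABLE,
            "tool_call_accuracy_result": NOT_APPLICABLE,
        }
    score = sum(r is True for r in evaluated) / len(evaluated)
    return {
        "tool_call_accuracy": score,
        "tool_call_accuracy_result": "pass" if score >= threshold else "fail",
    }


# Example: one accurate call, one inaccurate call, and one built-in tool that
# is skipped as not applicable -> score 0.5 over the two evaluated calls.
print(aggregate([True, False, NOT_APPLICABLE]))
```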