Tool Call Accuracy OpenAPI Tools (#42494)

singankit · web-flow · commit 44e41b9970cd · 2025-08-13T13:21:06.000-07:00
* Tool Call Accuracy OpenAPI Tools

* Linting issues
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py
@@ -11,7 +11,18 @@
 from packaging.version import Version
 
 # Constants.
-from ._models import _USER, _AGENT, _TOOL, _TOOL_CALL, _TOOL_CALLS, _FUNCTION, _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+from ._models import (
+    _USER,
+    _AGENT,
+    _TOOL,
+    _TOOL_CALL,
+    _TOOL_CALLS,
+    _FUNCTION,
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+    _OPENAPI,
+    OpenAPIToolDefinition,
+)
 
 # Message instances.
 from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall
@@ -93,7 +104,7 @@ def _list_tool_calls_chronological(self, thread_id: str, run_id: str) -> List[To
         return tool_calls_chronological
 
     @staticmethod
-    def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinition]:
+    def _extract_function_tool_definitions(thread_run: object) -> List[Union[ToolDefinition, OpenAPIToolDefinition]]:
         """
         Extracts tool definitions from a thread run.
 
@@ -121,6 +132,26 @@ def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinitio
                         parameters=parameters,
                     )
                 )
+            elif tool.type == _OPENAPI:
+                openapi_tool = tool.openapi
+                tool_definition = OpenAPIToolDefinition(
+                    name=openapi_tool.name,
+                    description=openapi_tool.description,
+                    type=_OPENAPI,
+                    spec=openapi_tool.spec,
+                    auth=openapi_tool.auth.as_dict(),
+                    default_params=openapi_tool.default_params.as_dict() if openapi_tool.default_params else None,
+                    functions=[
+                        ToolDefinition(
+                            name=func.get("name"),
+                            description=func.get("description"),
+                            parameters=func.get("parameters"),
+                            type="function",
+                        )
+                        for func in openapi_tool.get("functions")
+                    ],
+                )
+                final_tools.append(tool_definition)
             else:
                 # Add limited support for built-in tools. Descriptions and parameters
                 # are not published, but we'll include placeholders.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py
@@ -52,7 +52,6 @@
     _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
     _SHAREPOINT_GROUNDING: "Allows agents to access and retrieve relevant content from Microsoft SharePoint document libraries, grounding responses in organizational knowledge.",
     _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
-    _OPENAPI: "Connects agents to external RESTful APIs using OpenAPI 3.0 specifications, enabling seamless access to third-party services.",
 }
 
 # Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
@@ -101,13 +100,6 @@
         "type": "object",
         "properties": {"input": {"type": "string", "description": "Search terms to use."}},
     },
-    _OPENAPI: {
-        "type": "object",
-        "properties": {
-            "name": {"type": "string", "description": "The name of the function to call."},
-            "arguments": {"type": "string", "description": "JSON string of the arguments to pass to the function."},
-        },
-    },
 }
 
 
@@ -245,6 +237,27 @@ class ToolDefinition(BaseModel):
     parameters: dict
 
 
+class OpenAPIToolDefinition(BaseModel):
+    """Represents OpenAPI tool definition that will be used in the agent.
+    :param name: The name of the tool.
+    :type name: str
+    :param type: The type of the tool.
+    :type type: str
+    :param description: A description of the tool.
+    :type description: str
+    :param functions: The function definitions exposed by the OpenAPI tool.
+    :type functions: list[ToolDefinition]
+    """
+
+    name: str
+    type: str
+    description: Optional[str] = None
+    spec: object
+    auth: object
+    default_params: Optional[list[str]] = None
+    functions: list[ToolDefinition]
+
+
 class ToolCall:
     """Represents a tool call, used as an intermediate step in the conversion process.
 
@@ -275,7 +288,7 @@ class EvaluatorData(BaseModel):
 
     query: List[Message]
     response: List[Message]
-    tool_definitions: List[ToolDefinition]
+    tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]
 
     def to_json(self):
         """Converts the result to a JSON string.
@@ -305,14 +318,16 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # all in most of the cases, and bing would only show the API URL, without arguments or results.
     # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
     # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
-    if hasattr(tool_call.details, _FUNCTION):
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
         # This is the internals of the content object that will be included with the tool call.
         tool_call_id = tool_call.details.id
         content_tool_call = {
             "type": _TOOL_CALL,
             "tool_call_id": tool_call_id,
-            "name": tool_call.details.function.name,
-            "arguments": safe_loads(tool_call.details.function.arguments),
+            "name": tool_call.details.get(_FUNCTION).get("name") if tool_call.details.get(_FUNCTION) else None,
+            "arguments": safe_loads(
+                tool_call.details.get(_FUNCTION).get("arguments") if tool_call.details.get(_FUNCTION) else None
+            ),
         }
     else:
         # Treat built-in tools separately.  Object models may be unique so handle each case separately
@@ -350,8 +365,8 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # assistant's action of calling the tool.
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
-    if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function["output"])
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
+        output = safe_loads(tool_call.details.get("function")["output"])
     else:
         try:
             # Some built-ins may have output, others may not
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -1,6 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from itertools import chain
 import math
 import os
 import logging
@@ -315,6 +316,14 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
         built_in_definitions = _get_needed_built_in_definitions(tool_calls)
         needed_tool_definitions.extend(built_in_definitions)
 
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
         # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
             if isinstance(tool_call, dict):
@@ -329,7 +338,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
                         # This is a regular function tool from converter
                         tool_definition_exists = any(
                             tool.get("name") == tool_name and tool.get("type", "function") == "function"
-                            for tool in tool_definitions
+                            for tool in tool_definitions_expanded
                         )
                         if not tool_definition_exists:
                             raise EvaluationException(
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
@@ -603,6 +603,83 @@ def test_evaluate_open_api(self, mock_model_config):
         tool_definitions = []
         result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
 
+        key = ToolCallAccuracyEvaluator._RESULT_KEY
+        assert result is not None
+        assert result[key] == "not applicable"
+        assert result[f"{key}_result"] == "pass"
+
+    def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect)
+
+        # Test OpenAPI function call for exchange rates - converter format
+        query = "What is the exchange rate from GBP to EUR?"
+        tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_builtin_good",
+                "name": "get_countries_LookupCountryByCurrency",
+                "arguments": {"currency": "GBP"},
+            },
+        ]
+        tool_definitions = [
+            {
+                "name": "get_countries",
+                "type": "openapi",
+                "description": "Retrieve a list of countries",
+                "spec": {
+                    "openapi": "3.1.0",
+                    "info": {
+                        "title": "RestCountries.NET API",
+                        "description": "Web API version 3.1 for managing country items, based on previous implementations from restcountries.eu and restcountries.com.",
+                        "version": "v3.1",
+                    },
+                    "servers": [{"url": "https://restcountries.net"}],
+                    "auth": [],
+                    "paths": {
+                        "/v3.1/currency": {
+                            "get": {
+                                "description": "Search by currency.",
+                                "operationId": "LookupCountryByCurrency",
+                                "parameters": [
+                                    {
+                                        "name": "currency",
+                                        "in": "query",
+                                        "description": "The currency to search for.",
+                                        "required": "true",
+                                        "schema": {"type": "string"},
+                                    }
+                                ],
+                                "responses": {
+                                    "200": {
+                                        "description": "Success",
+                                        "content": {"text/plain": {"schema": {"type": "string"}}},
+                                    }
+                                },
+                            }
+                        }
+                    },
+                    "components": {"schemes": {}},
+                },
+                "auth": {"type": "anonymous", "security_scheme": {}},
+                "functions": [
+                    {
+                        "name": "get_countries_LookupCountryByCurrency",
+                        "type": "function",
+                        "description": "Search by currency.",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {
+                                "currency": {"type": "string", "description": "The currency to search for."}
+                            },
+                            "required": ["currency"],
+                        },
+                    }
+                ],
+            }
+        ]
+        result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
+
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
         assert result[key] == 5.0