diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py index 61f29c62d1e..8f553619a45 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py @@ -11,7 +11,18 @@ from packaging.version import Version # Constants. -from ._models import _USER, _AGENT, _TOOL, _TOOL_CALL, _TOOL_CALLS, _FUNCTION, _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS +from ._models import ( + _USER, + _AGENT, + _TOOL, + _TOOL_CALL, + _TOOL_CALLS, + _FUNCTION, + _BUILT_IN_DESCRIPTIONS, + _BUILT_IN_PARAMS, + _OPENAPI, + OpenAPIToolDefinition, +) # Message instances. from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall @@ -93,7 +104,7 @@ def _list_tool_calls_chronological(self, thread_id: str, run_id: str) -> List[To return tool_calls_chronological @staticmethod - def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinition]: + def _extract_function_tool_definitions(thread_run: object) -> List[Union[ToolDefinition, OpenAPIToolDefinition]]: """ Extracts tool definitions from a thread run. 
@@ -121,6 +132,30 @@ def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinitio
                         parameters=parameters,
                     )
                 )
+            elif tool.type == _OPENAPI:
+                # An OpenAPI tool bundles several callable functions: keep its spec and auth and
+                # emit one nested ToolDefinition per function it exposes.
+                openapi_tool = tool.openapi
+                tool_definition = OpenAPIToolDefinition(
+                    name=openapi_tool.name,
+                    description=openapi_tool.description,
+                    type=_OPENAPI,
+                    spec=openapi_tool.spec,
+                    auth=openapi_tool.auth.as_dict(),
+                    # OpenAPIToolDefinition declares default_params as a list of strings, so copy it as a list.
+                    default_params=list(openapi_tool.default_params) if openapi_tool.default_params else None,
+                    functions=[
+                        ToolDefinition(
+                            name=func.get("name"),
+                            description=func.get("description"),
+                            parameters=func.get("parameters"),
+                            type="function",
+                        )
+                        # "functions" may be missing or None; treat that as "no functions" instead of raising.
+                        for func in (openapi_tool.get("functions") or [])
+                    ],
+                )
+                final_tools.append(tool_definition)
             else:
                 # Add limited support for built-in tools. Descriptions and parameters
                 # are not published, but we'll include placeholders.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py
index 57f44a8afbb..a97b2238b4a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py
@@ -52,7 +52,6 @@
     _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
     _SHAREPOINT_GROUNDING: "Allows agents to access and retrieve relevant content from Microsoft SharePoint document libraries, grounding responses in organizational knowledge.",
     _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
-    _OPENAPI: "Connects agents to external RESTful APIs using OpenAPI 3.0 specifications, enabling seamless access to third-party services.",
 }
 
 # Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
@@ -101,13 +100,6 @@
         "type": "object",
         "properties": {"input": {"type": "string", "description": "Search terms to use."}},
     },
-    _OPENAPI: {
-        "type": "object",
-        "properties": {
-            "name": {"type": "string", "description": "The name of the function to call."},
-            "arguments": {"type": "string", "description": "JSON string of the arguments to pass to the function."},
-        },
-    },
 }
 
 
@@ -245,6 +237,27 @@ class ToolDefinition(BaseModel):
     parameters: dict
 
 
+class OpenAPIToolDefinition(BaseModel):
+    """Represents an OpenAPI tool definition that will be used in the agent.
+
+    :param name: The name of the tool.
+    :param type: The type of the tool.
+    :param description: An optional description of the tool.
+    :param spec: The OpenAPI specification attached to the tool.
+    :param auth: The authentication settings attached to the tool.
+    :param default_params: Optional list of default parameters for the tool.
+    :param functions: The function definitions exposed by the tool.
+    """
+
+    name: str
+    type: str
+    description: Optional[str] = None
+    spec: object
+    auth: object
+    default_params: Optional[List[str]] = None
+    functions: List[ToolDefinition]
+
+
 class ToolCall:
     """Represents a tool call, used as an intermediate step in the conversion process.
 
@@ -275,7 +288,7 @@ class EvaluatorData(BaseModel):
 
     query: List[Message]
     response: List[Message]
-    tool_definitions: List[ToolDefinition]
+    tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]
 
     def to_json(self):
         """Converts the result to a JSON string.
@@ -305,14 +318,16 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # all in most of the cases, and bing would only show the API URL, without arguments or results.
     # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
     # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
-    if hasattr(tool_call.details, _FUNCTION):
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get(_FUNCTION):
         # This is the internals of the content object that will be included with the tool call.
         tool_call_id = tool_call.details.id
         content_tool_call = {
             "type": _TOOL_CALL,
             "tool_call_id": tool_call_id,
-            "name": tool_call.details.function.name,
-            "arguments": safe_loads(tool_call.details.function.arguments),
+            "name": (tool_call.details.get(_FUNCTION) or {}).get("name"),
+            "arguments": safe_loads(
+                (tool_call.details.get(_FUNCTION) or {}).get("arguments")
+            ),
         }
     else:
         # Treat built-in tools separately. Object models may be unique so handle each case separately
@@ -350,8 +365,8 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # assistant's action of calling the tool.
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
-    if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function["output"])
+    if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get(_FUNCTION):
+        output = safe_loads(tool_call.details.get(_FUNCTION)["output"])
     else:
         try:
             # Some built-ins may have output, others may not
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 3419cc429d1..384dcc3b8e9 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -1,6 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
# --------------------------------------------------------- +from itertools import chain import math import os import logging @@ -315,6 +316,14 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): built_in_definitions = _get_needed_built_in_definitions(tool_calls) needed_tool_definitions.extend(built_in_definitions) + # OpenAPI tool is a collection of functions, so we need to expand it + tool_definitions_expanded = list( + chain.from_iterable( + tool.get("functions", []) if tool.get("type") == "openapi" else [tool] + for tool in needed_tool_definitions + ) + ) + # Validate that all tool calls have corresponding definitions for tool_call in tool_calls: if isinstance(tool_call, dict): @@ -329,7 +338,7 @@ def _extract_needed_tool_definitions(self, tool_calls, tool_definitions): # This is a regular function tool from converter tool_definition_exists = any( tool.get("name") == tool_name and tool.get("type", "function") == "function" - for tool in tool_definitions + for tool in tool_definitions_expanded ) if not tool_definition_exists: raise EvaluationException( diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 3f4ae086f9d..714b1b4073e 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -603,6 +603,83 @@ def test_evaluate_open_api(self, mock_model_config): tool_definitions = [] result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + key = ToolCallAccuracyEvaluator._RESULT_KEY + assert result is not None + assert result[key] == "not applicable" + assert result[f"{key}_result"] == "pass" + + def test_evaluate_open_api_with_tool_definition(self, mock_model_config): + evaluator = 
ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + # Test OpenAPI function call for exchange rates - converter format + query = "What is the exchange rate from GBP to EUR?" + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_builtin_good", + "name": "get_countries_LookupCountryByCurrency", + "arguments": {"currency": "GBP"}, + }, + ] + tool_definitions = [ + { + "name": "get_countries", + "type": "openapi", + "description": "Retrieve a list of countries", + "spec": { + "openapi": "3.1.0", + "info": { + "title": "RestCountries.NET API", + "description": "Web API version 3.1 for managing country items, based on previous implementations from restcountries.eu and restcountries.com.", + "version": "v3.1", + }, + "servers": [{"url": "https://restcountries.net"}], + "auth": [], + "paths": { + "/v3.1/currency": { + "get": { + "description": "Search by currency.", + "operationId": "LookupCountryByCurrency", + "parameters": [ + { + "name": "currency", + "in": "query", + "description": "The currency to search for.", + "required": "true", + "schema": {"type": "string"}, + } + ], + "responses": { + "200": { + "description": "Success", + "content": {"text/plain": {"schema": {"type": "string"}}}, + } + }, + } + } + }, + "components": {"schemes": {}}, + }, + "auth": {"type": "anonymous", "security_scheme": {}}, + "functions": [ + { + "name": "get_countries_LookupCountryByCurrency", + "type": "function", + "description": "Search by currency.", + "parameters": { + "type": "object", + "properties": { + "currency": {"type": "string", "description": "The currency to search for."} + }, + "required": ["currency"], + }, + } + ], + } + ] + result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) + key = ToolCallAccuracyEvaluator._RESULT_KEY assert result is not None assert result[key] == 5.0