Commit 96e2089

Salma Elshafey and Copilot authored
Add Tool Input Accuracy and Tool Selection Evaluators, add underscore prefix to PrPr evaluators (#43670)
* Add tool input accuracy as private preview
* Add underscore prefix to PrPr evals, add tool selection eval, add test files for tool selection and tool input accuracy
* run black
* Fix copilot comments
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty (Co-authored-by: Copilot <[email protected]>)
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py (Co-authored-by: Copilot <[email protected]>)
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py (Co-authored-by: Copilot <[email protected]>)
* Fix eval mapping
* Update tool call accuracy docstring
* Update the result of task completion to be 0-1 instead of True-False
* black
* Update samples

Co-authored-by: Salma Elshafey <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 62bd691 commit 96e2089

24 files changed: +2404 -208 lines
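The headline change is that the private-preview ("PrPr") evaluators are now exported with a leading underscore. Based on the import paths that appear in the diffs below, downstream code would reference them roughly like this (a sketch, not an official usage sample):

# Underscore-prefixed private-preview evaluators after this commit
# (import paths as they appear in the updated __init__.py files and _eval_mapping.py).
from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
    _TaskNavigationEfficiencyEvaluator,
    _TaskNavigationEfficiencyMatchingMode,
)
from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator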

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 12 additions & 4 deletions
@@ -670,13 +670,21 @@ def _pretty_format_conversation_history(conversation_history):
         ):
             formatted_history += f"User turn {i+1}:\n"
             for msg in user_query:
-                formatted_history += " " + "\n ".join(msg)
-                formatted_history += "\n\n"
+                if isinstance(msg, list):
+                    for submsg in msg:
+                        formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
+                else:
+                    formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
+            formatted_history += "\n"
         if agent_response:
             formatted_history += f"Agent turn {i+1}:\n"
             for msg in agent_response:
-                formatted_history += " " + "\n ".join(msg)
-                formatted_history += "\n\n"
+                if isinstance(msg, list):
+                    for submsg in msg:
+                        formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
+                else:
+                    formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
+            formatted_history += "\n"
     return formatted_history
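This hunk appears to fix two issues in the history formatter: a message that is itself a list of strings is now flattened entry by entry, and embedded newlines inside a string message are re-indented instead of the string being join()-ed character by character. A standalone sketch of the new behavior (this mirrors the hunk above; it is not the SDK function itself):

def format_turn(label, messages):
    # messages may contain plain strings or nested lists of strings
    out = f"{label}:\n"
    for msg in messages:
        if isinstance(msg, list):
            for submsg in msg:
                out += " " + "\n ".join(submsg.split("\n")) + "\n"
        else:
            out += " " + "\n ".join(msg.split("\n")) + "\n"
    return out + "\n"

print(format_turn("User turn 1", ["Book a flight\nto Paris", ["Economy class", "Window seat"]]))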
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 10 additions & 2 deletions
@@ -11,7 +11,11 @@
 
 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
-from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator
+from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
+from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
+from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
+from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator
+from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,

@@ -68,8 +72,12 @@
     SexualEvaluator: "sexual",
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
-    TaskCompletionEvaluator: "task_completion",
+    _TaskCompletionEvaluator: "task_completion",
+    _TaskNavigationEfficiencyEvaluator: "task_navigation_efficiency",
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    _ToolInputAccuracyEvaluator: "tool_input_accuracy",
+    _ToolSelectionEvaluator: "tool_selection",
+    _ToolSuccessEvaluator: "tool_success",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
 }
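For orientation, this file keys evaluator classes to the snake_case metric names used in evaluation results. A minimal illustrative fragment of that shape (the dictionary's real name and full contents live in _eval_mapping.py; EVAL_CLASS_TO_METRIC below is a hypothetical stand-in):

from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator

EVAL_CLASS_TO_METRIC = {  # hypothetical name; mirrors the entries added above
    _ToolSelectionEvaluator: "tool_selection",
    _ToolSuccessEvaluator: "tool_success",
}

print(EVAL_CLASS_TO_METRIC[_ToolSelectionEvaluator])  # tool_selection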

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 156 additions & 1 deletion
@@ -5,7 +5,8 @@
 import math
 import re
 import os
-from typing import Dict, Optional, TypeVar, Union
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty

@@ -188,3 +189,157 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
         }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
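The helpers above fully support only the converter-style tool-call shape, with fallbacks for two other shapes in _extract_tool_names_from_calls. Below is a standalone mirror of that name-extraction logic, with example payloads taken from the comments in the diff (the argument values are made up for illustration; this is not the SDK method itself):

from typing import Dict, List

def extract_tool_names(tool_calls: List[Dict]) -> List[str]:
    # Standalone illustration of _extract_tool_names_from_calls.
    names = []
    for call in tool_calls:
        if isinstance(call, dict):
            if call.get("type") == "tool_call" and call.get("name"):
                names.append(call["name"])  # converter format
            elif call.get("function", {}).get("name"):
                names.append(call["function"]["name"])  # function-call format
            elif call.get("name"):
                names.append(call["name"])  # direct-name format
    return names

calls = [
    {"type": "tool_call", "name": "bing_custom_search", "arguments": {"query": "weather in Paris"}},
    {"function": {"name": "get_attractions", "arguments": '{"city": "Paris"}'}},
    {"name": "get_attractions"},
]
print(extract_tool_names(calls))  # ['bing_custom_search', 'get_attractions', 'get_attractions']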

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._task_completion import TaskCompletionEvaluator
+from ._task_completion import _TaskCompletionEvaluator
 
-__all__ = ["TaskCompletionEvaluator"]
+__all__ = ["_TaskCompletionEvaluator"]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py

Lines changed: 15 additions & 15 deletions
@@ -18,7 +18,7 @@
 
 
 @experimental
-class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
+class _TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:
 
     - Final outcome and deliverable of the task

@@ -27,8 +27,8 @@ class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
     This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.
 
     Scoring is binary:
-    - TRUE: Task fully completed with usable deliverable that meets all user requirements
-    - FALSE: Task incomplete, partially completed, or deliverable does not meet requirements
+    - 1 (pass): Task fully completed with usable deliverable that meets all user requirements
+    - 0 (fail): Task incomplete, partially completed, or deliverable does not meet requirements
 
     The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.
 

@@ -43,7 +43,7 @@ class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
             :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a TaskCompletionEvaluator with a query and response.
+            :caption: Initialize and call a _TaskCompletionEvaluator with a query and response.
 
     .. admonition:: Example using Azure AI Project URL:
 

@@ -52,7 +52,7 @@ class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
             :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call a _TaskCompletionEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     """

@@ -83,20 +83,20 @@ def __call__(
         query: Union[str, List[dict]],
         response: Union[str, List[dict]],
         tool_definitions: Optional[Union[dict, List[dict]]] = None,
-    ) -> Dict[str, Union[str, bool]]:
+    ) -> Dict[str, Union[str, float]]:
         """Evaluate task completion for a given query, response, and optionally tool definitions.
         The query and response can be either a string or a list of messages.
 
 
         Example with string inputs and no tools:
-            evaluator = TaskCompletionEvaluator(model_config)
+            evaluator = _TaskCompletionEvaluator(model_config)
             query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
             response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."
 
             result = evaluator(query=query, response=response)
 
         Example with list of messages:
-            evaluator = TaskCompletionEvaluator(model_config)
+            evaluator = _TaskCompletionEvaluator(model_config)
             query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
             tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]

@@ -110,7 +110,7 @@ def __call__(
         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
         :return: A dictionary with the task completion evaluation results.
-        :rtype: Dict[str, Union[str, bool]]
+        :rtype: Dict[str, Union[str, float]]
         """
 
     @override

@@ -127,7 +127,7 @@ def __call__( # pylint: disable=docstring-missing-param
         return super().__call__(*args, **kwargs)
 
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # type: ignore[override]
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
         """Do Task Completion evaluation.
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
         :type eval_input: Dict

@@ -153,11 +153,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         llm_output = prompty_output_dict.get("llm_output", {})
 
         if isinstance(llm_output, dict):
-            success = llm_output.get("success", False)
+            success = llm_output.get("success", 0)
             if isinstance(success, str):
-                success = success.upper() == "TRUE"
+                success = 1 if success.upper() == "TRUE" else 0
 
-            success_result = "pass" if success else "fail"
+            success_result = "pass" if success == 1 else "fail"
             reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": success,

@@ -173,5 +173,5 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
-            logger.warning("LLM output is not a dictionary, returning False for the success.")
-        return {self._result_key: False}
+            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+        return {self._result_key: 0}
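Tying the rename to the new 0/1 scoring, a minimal usage sketch built from the docstring example above (the model_config fields are placeholders, and the returned keys beyond the score and pass/fail result are assumed from the _result_key convention rather than spelled out in this diff):

from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator

model_config = {
    # Placeholder Azure OpenAI model configuration; substitute real values.
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = _TaskCompletionEvaluator(model_config)
result = evaluator(
    query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
    response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
)

# After this commit the score is numeric rather than boolean:
# result["task_completion"] is 1 (pass) or 0 (fail), and
# result["task_completion_result"] is "pass" or "fail".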

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode
+from ._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator, _TaskNavigationEfficiencyMatchingMode
 
-__all__ = ["TaskNavigationEfficiencyEvaluator", "TaskNavigationEfficiencyMatchingMode"]
+__all__ = ["_TaskNavigationEfficiencyEvaluator", "_TaskNavigationEfficiencyMatchingMode"]
