Add Parameter Verification Functionality to Path Efficiency Evaluator (#43041)

m7md7sien · Copilot · web-flow · commit bb1223eaae69 · 2025-09-29T11:04:44.000-07:00
* Implement Path Efficiency Evaluator

* Adding Samples

* Move to private preview

* Add Parameter Verification Functionality to Path Efficiency Evaluator

* Check for JSON-serializable parameters

Check for JSON-serializable parameters

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Fix parameters matching bug

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -4,12 +4,14 @@
 
 import inspect
 from abc import ABC, abstractmethod
+import json
 from typing import (
     Any,
     Callable,
     Dict,
     Generic,
     List,
+    Tuple,
     TypedDict,
     TypeVar,
     Union,
@@ -510,15 +512,16 @@ def _parse_tools_from_response(self, response):
 
         return tool_calls
 
-    def _extract_tool_names_from_response(self, response) -> List[str]:
-        """Extract tool names from the response.
+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
         :param response: The response to parse.
         :type response: Union[str, List[dict]]
-        :return: List of tool names extracted from the response.
-        :rtype: List[str]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
         """
         tool_calls = self._parse_tools_from_response(response)
-        tool_names = []
+        tool_name_param_pairs = []
         for tool_call in tool_calls:
             if not isinstance(tool_call, dict):
                 raise EvaluationException(
@@ -534,16 +537,41 @@ def _extract_tool_names_from_response(self, response) -> List[str]:
                     target=ErrorTarget.EVALUATE,
                     category=ErrorCategory.INVALID_VALUE,
                 )
-            if "name" in tool_call:
-                tool_names.append(tool_call["name"])
-            else:
+
+            if "name" not in tool_call:
                 raise EvaluationException(
                     "Tool call missing 'name' field.",
                     internal_message=str(tool_call),
                     target=ErrorTarget.EVALUATE,
                     category=ErrorCategory.MISSING_FIELD,
                 )
-        return tool_names
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
 
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py
@@ -1,8 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import json
 from collections import Counter
-from typing import Dict, List, Union, Any
+from typing import Dict, List, Union, Any, Tuple
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
@@ -36,14 +37,30 @@ class PathEfficiencyEvaluator(EvaluatorBase):
                 f1_score_threshold=0.75
             )
 
+            # Example 1: Using simple tool names list
             result = path_efficiency_eval(
                 response=[
-                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "determine_intent", "arguments": {}}]},
-                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "use_tool", "arguments": {}}]},
-                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "review_results", "arguments": {}}]},
-                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report_generation", "arguments": {}}]}
+                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call", "arguments": {}}]},
+                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A", "arguments": {}}]},
+                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
+                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
                 ],
-                ground_truth=["determine_intent", "use_tool", "review_results", "report_generation"]
+                ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
+            )
+
+            # Example 2: Using tool names with parameters (exact parameter matching required)
+            result = path_efficiency_eval(
+                response=[
+                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
+                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
+                ],
+                ground_truth=(
+                    ["search", "format_result"],
+                    {
+                        "search": {"query": "weather", "location": "NYC"},
+                        "format_result": {"format": "json"}
+                    }
+                )
             )
     """
 
@@ -78,16 +95,40 @@ def __init__(
             "path_efficiency_f1": f1_score_threshold,
         }
 
-    def _calculate_precision_recall_f1_scores(
-        self, agent_steps: List[str], ground_truth: List[str]
-    ) -> Dict[str, float]:
+    def _prepare_steps_for_comparison(
+        self,
+        agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
+        ground_truth: List[str],
+        ground_truth_params: Dict[str, Dict[str, Any]],
+        use_parameter_matching: bool,
+    ) -> Tuple[
+        List[Union[str, Tuple[str, Tuple]]],
+        List[Union[str, Tuple[str, Tuple]]],
+    ]:
+        """Prepare agent and ground truth steps for comparison based on parameter matching mode."""
+        agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
+        ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
+        if use_parameter_matching:
+            # When parameter matching is enabled, we need to match both tool name and parameters
+            agent_steps = [(pair[0], tuple(sorted(pair[1].items()))) for pair in agent_tool_pairs]
+            ground_truth_steps = [
+                (name, tuple(sorted(ground_truth_params.get(name, {}).items()))) for name in ground_truth
+            ]
+        else:
+            # When parameter matching is disabled, only compare tool names
+            agent_steps = [name for name, _ in agent_tool_pairs]
+            ground_truth_steps = [step for step in ground_truth]
+
+        return agent_steps, ground_truth_steps
+
+    def _calculate_precision_recall_f1_scores(self, agent_steps: List, ground_truth_steps: List) -> Dict[str, float]:
         """Calculate precision, recall, and F1 scores."""
         if not agent_steps:
             return {"precision_score": 0.0, "recall_score": 0.0, "f1_score": 0.0}
 
         # Count occurrences of each step in both lists to handle duplicates
         agent_steps_counts = Counter(agent_steps)
-        ground_truth_counts = Counter(ground_truth)
+        ground_truth_counts = Counter(ground_truth_steps)
 
         # Calculate true positives by taking the minimum count for each common element
         # For each step, count the intersection (min count) of agent and ground truth steps
@@ -126,27 +167,27 @@ def _calculate_precision_recall_f1_scores(
             "f1_score": f1_score,
         }
 
-    def _calculate_exact_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
+    def _calculate_exact_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
         """Check if agent steps exactly match ground truth (order and content)."""
-        return agent_steps == ground_truth
+        return agent_steps == ground_truth_steps
 
-    def _calculate_in_order_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
+    def _calculate_in_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
         """Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
-        if not ground_truth:
+        if not ground_truth_steps:
             return True
 
         gt_index = 0
         for step in agent_steps:
-            if gt_index < len(ground_truth) and step == ground_truth[gt_index]:
+            if gt_index < len(ground_truth_steps) and step == ground_truth_steps[gt_index]:
                 gt_index += 1
 
-        return gt_index == len(ground_truth)
+        return gt_index == len(ground_truth_steps)
 
-    def _calculate_any_order_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
+    def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
         """Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
         # Count occurrences of each step in both lists to handle duplicates
         agent_counts = Counter(agent_steps)
-        ground_truth_counts = Counter(ground_truth)
+        ground_truth_counts = Counter(ground_truth_steps)
 
         # Check if agent has at least as many occurrences of each ground truth step
         return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
@@ -167,22 +208,71 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         if not ground_truth:
             raise ValueError("ground_truth cannot be empty")
 
-        if not isinstance(ground_truth, list) or not all(isinstance(step, str) for step in ground_truth):
-            raise TypeError("ground_truth must be a list of strings")
+        # Check if ground_truth is a tuple (tool names + parameters) or list (tool names only)
+        use_parameter_matching = False
+        ground_truth_names = []
+        ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
+
+        if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
+            # Tuple format: (tool_names, parameters_dict)
+            tool_names_list, params_dict = ground_truth
+
+            if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
+                raise TypeError("ground_truth tuple first element must be a list of strings (tool names)")
+
+            if not isinstance(params_dict, dict):
+                raise TypeError(
+                    "ground_truth tuple second element must be a dictionary mapping tool names to parameters"
+                )
+
+            # Validate that each value in params_dict is a dictionary with string keys and JSON-serializable values
+            for tool_name, params in params_dict.items():
+                if not isinstance(tool_name, str):
+                    raise TypeError("ground_truth parameters dictionary keys must be strings (tool names)")
+                if not isinstance(params, dict):
+                    raise TypeError(f"ground_truth parameters for tool '{tool_name}' must be a dictionary")
+                for k, v in params.items():
+                    if not isinstance(k, str):
+                        raise TypeError(f"ground_truth parameters for tool '{tool_name}' must have string keys")
+                    try:
+                        json.dumps(v)
+                    except (TypeError, ValueError):
+                        raise TypeError(
+                            f"ground_truth parameters for tool '{tool_name}' must have JSON-serializable values (got type {type(v)} for key '{k}')"
+                        )
+
+            ground_truth_names = [name.strip() for name in tool_names_list]
+            ground_truth_params_dict = params_dict
+            use_parameter_matching = True
+
+        elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
+            # List format: just tool names
+            ground_truth_names = [step.strip() for step in ground_truth]
+            use_parameter_matching = False
+
+        else:
+            raise TypeError(
+                "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
+            )
 
-        # Extract tool names from the response
-        agent_steps = self._extract_tool_names_from_response(response)
+        # Extract tool information from the response
+        agent_tool_pairs = self._extract_tool_names_and_params_from_response(response)
 
-        agent_steps = [step.strip() for step in agent_steps]
-        ground_truth = [step.strip() for step in ground_truth]
+        # Prepare steps for comparison
+        agent_steps, ground_truth_steps = self._prepare_steps_for_comparison(
+            agent_tool_pairs,
+            ground_truth_names,
+            ground_truth_params_dict,
+            use_parameter_matching,
+        )
 
         # Calculate precision, recall, and F1 scores
-        metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth)
+        metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
 
         # Calculate binary match metrics
-        exact_match = self._calculate_exact_match(agent_steps, ground_truth)
-        in_order_match = self._calculate_in_order_match(agent_steps, ground_truth)
-        any_order_match = self._calculate_any_order_match(agent_steps, ground_truth)
+        exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
+        in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
+        any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
 
         # Convert metrics to floats, using nan for None or non-convertible values
         path_efficiency_precision = (
@@ -215,6 +305,24 @@ def __call__(  # type: ignore
         :rtype: Dict[str, Union[float, str]]
         """
 
+    @overload
+    def __call__(  # type: ignore
+        self,
+        *,
+        response: Union[str, List[Dict[str, Any]]],
+        ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
+    ) -> Dict[str, Union[float, str]]:
+        """
+        Evaluate the path efficiency of an agent's action sequence with tool parameters.
+
+        :keyword response: The agent's response containing tool calls.
+        :paramtype response: Union[str, List[Dict[str, Any]]]
+        :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
+        :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
+        :return: The path efficiency scores and results.
+        :rtype: Dict[str, Union[float, str]]
+        """
+
     @override
     def __call__(
         self,
@@ -226,8 +334,8 @@ def __call__(
 
         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
-        :keyword ground_truth: List of expected tool/action steps.
-        :paramtype ground_truth: List[str]
+        :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
+        :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
         :return: The path efficiency scores and results.
         :rtype: Dict[str, Union[float, str]]
         """
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/path_efficiency.ipynb b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/path_efficiency.ipynb
@@ -367,6 +367,38 @@
     "    print(f\"Error with empty ground truth: {e}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "a8f6dc32",
+   "metadata": {},
+   "source": [
+    "#### Sample 8: Tuple Format with Parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b1a1a0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# PathEfficiencyEvaluator also supports tuple format with parameters for exact parameter matching\n",
+    "response_with_params = [\n",
+    "    {\n",
+    "        \"role\": \"assistant\",\n",
+    "        \"content\": [{\"type\": \"tool_call\", \"tool_call_id\": \"call_1\", \"name\": \"search\", \"arguments\": {\"query\": \"test\"}}],\n",
+    "    },\n",
+    "]\n",
+    "\n",
+    "# Ground truth using tuple format: (tool_names, parameters_dict)\n",
+    "# Parameters must match exactly for tools to be considered matching\n",
+    "ground_truth_with_params = ([\"search\"], {\"search\": {\"query\": \"test\"}})\n",
+    "\n",
+    "result = path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params)\n",
+    "print(\"\\nTuple Format with Parameters Results:\")\n",
+    "pprint(result)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "6741e8a0",
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py
@@ -599,6 +599,19 @@ def evaluation_evaluate_classes_methods(self):
         ground_truth = ["search", "analyze", "report"]
 
         path_efficiency_evaluator(response=response, ground_truth=ground_truth)
+
+        # Also supports tuple format with parameters for exact parameter matching
+        response_with_params = [
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "test"}}
+                ],
+            },
+        ]
+        ground_truth_with_params = (["search"], {"search": {"query": "test"}})
+
+        path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params)
         # [END path_efficiency_evaluator]
 
         # [START document_retrieval_evaluator]
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate_fdp.py
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_path_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_path_efficiency_evaluators.py