Skip to content

Commit bb1223e

Browse files
m7md7sienCopilot
andauthored
Add Parameter Verification Functionality to Path Efficiency Evaluator (#43041)
* Implement Path Efficiency Evaluator * Adding Samples * Move to private preview * Add Parameter Verification Functionality to Path Efficiency Evaluator * check for json serializable parameters check for json serializable parameters Co-authored-by: Copilot <[email protected]> * Fix parameters matching bug --------- Co-authored-by: Copilot <[email protected]>
1 parent c473244 commit bb1223e

File tree

6 files changed

+437
-39
lines changed

6 files changed

+437
-39
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44

55
import inspect
66
from abc import ABC, abstractmethod
7+
import json
78
from typing import (
89
Any,
910
Callable,
1011
Dict,
1112
Generic,
1213
List,
14+
Tuple,
1315
TypedDict,
1416
TypeVar,
1517
Union,
@@ -510,15 +512,16 @@ def _parse_tools_from_response(self, response):
510512

511513
return tool_calls
512514

513-
def _extract_tool_names_from_response(self, response) -> List[str]:
514-
"""Extract tool names from the response.
515+
def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
516+
"""Extract tool names and parameters from the response.
517+
515518
:param response: The response to parse.
516519
:type response: Union[str, List[dict]]
517-
:return: List of tool names extracted from the response.
518-
:rtype: List[str]
520+
:return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
521+
:rtype: List[Tuple[str, Dict[str, str]]]
519522
"""
520523
tool_calls = self._parse_tools_from_response(response)
521-
tool_names = []
524+
tool_name_param_pairs = []
522525
for tool_call in tool_calls:
523526
if not isinstance(tool_call, dict):
524527
raise EvaluationException(
@@ -534,16 +537,41 @@ def _extract_tool_names_from_response(self, response) -> List[str]:
534537
target=ErrorTarget.EVALUATE,
535538
category=ErrorCategory.INVALID_VALUE,
536539
)
537-
if "name" in tool_call:
538-
tool_names.append(tool_call["name"])
539-
else:
540+
541+
if "name" not in tool_call:
540542
raise EvaluationException(
541543
"Tool call missing 'name' field.",
542544
internal_message=str(tool_call),
543545
target=ErrorTarget.EVALUATE,
544546
category=ErrorCategory.MISSING_FIELD,
545547
)
546-
return tool_names
548+
549+
tool_name = str(tool_call["name"]).strip()
550+
551+
# Extract parameters/arguments
552+
parameters = {}
553+
if "arguments" in tool_call:
554+
args = tool_call["arguments"]
555+
if isinstance(args, dict):
556+
# Convert all values to strings for consistent comparison
557+
parameters = {str(k): str(v) for k, v in args.items()}
558+
elif isinstance(args, str):
559+
# If arguments is a string, try to parse it as JSON
560+
try:
561+
parsed_args = json.loads(args)
562+
if isinstance(parsed_args, dict):
563+
parameters = {str(k): str(v) for k, v in parsed_args.items()}
564+
except json.JSONDecodeError:
565+
raise EvaluationException(
566+
"Failed to parse tool call arguments as JSON.",
567+
internal_message=str(tool_call),
568+
target=ErrorTarget.EVALUATE,
569+
category=ErrorCategory.INVALID_VALUE,
570+
)
571+
572+
tool_name_param_pairs.append((tool_name, parameters))
573+
574+
return tool_name_param_pairs
547575

548576
async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
549577
"""The asynchronous call where real end-to-end evaluation logic is performed.

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py

Lines changed: 138 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# ---------------------------------------------------------
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# ---------------------------------------------------------
4+
import json
45
from collections import Counter
5-
from typing import Dict, List, Union, Any
6+
from typing import Dict, List, Union, Any, Tuple
67
from typing_extensions import overload, override
78

89
from azure.ai.evaluation._evaluators._common import EvaluatorBase
@@ -36,14 +37,30 @@ class PathEfficiencyEvaluator(EvaluatorBase):
3637
f1_score_threshold=0.75
3738
)
3839
40+
# Example 1: Using simple tool names list
3941
result = path_efficiency_eval(
4042
response=[
41-
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "determine_intent", "arguments": {}}]},
42-
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "use_tool", "arguments": {}}]},
43-
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "review_results", "arguments": {}}]},
44-
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report_generation", "arguments": {}}]}
43+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call", "arguments": {}}]},
44+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A", "arguments": {}}]},
45+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
46+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
4547
],
46-
ground_truth=["determine_intent", "use_tool", "review_results", "report_generation"]
48+
ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
49+
)
50+
51+
# Example 2: Using tool names with parameters (exact parameter matching required)
52+
result = path_efficiency_eval(
53+
response=[
54+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
55+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
56+
],
57+
ground_truth=(
58+
["search", "format_result"],
59+
{
60+
"search": {"query": "weather", "location": "NYC"},
61+
"format_result": {"format": "json"}
62+
}
63+
)
4764
)
4865
"""
4966

@@ -78,16 +95,40 @@ def __init__(
7895
"path_efficiency_f1": f1_score_threshold,
7996
}
8097

81-
def _calculate_precision_recall_f1_scores(
82-
self, agent_steps: List[str], ground_truth: List[str]
83-
) -> Dict[str, float]:
98+
def _prepare_steps_for_comparison(
    self,
    agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
    ground_truth: List[str],
    ground_truth_params: Dict[str, Dict[str, Any]],
    use_parameter_matching: bool,
) -> Tuple[
    List[Union[str, Tuple[str, Tuple]]],
    List[Union[str, Tuple[str, Tuple]]],
]:
    """Prepare agent and ground truth steps for comparison based on parameter matching mode.

    With parameter matching enabled, every step becomes a hashable
    ``(tool_name, ((param_name, param_value), ...))`` tuple whose parameter
    pairs are sorted and converted to strings. Without it, every step is
    just the tool name.

    :param agent_tool_pairs: ``(tool_name, parameters)`` pairs extracted from the agent response.
    :type agent_tool_pairs: List[Tuple[str, Dict[str, Any]]]
    :param ground_truth: Expected tool names, in order.
    :type ground_truth: List[str]
    :param ground_truth_params: Expected parameters keyed by tool name. A tool without an
        entry is compared as if it had no parameters.
    :type ground_truth_params: Dict[str, Dict[str, Any]]
    :param use_parameter_matching: Whether parameters take part in the comparison.
    :type use_parameter_matching: bool
    :return: ``(agent_steps, ground_truth_steps)`` ready for equality and ``Counter`` based checks.
    :rtype: Tuple[List[Union[str, Tuple[str, Tuple]]], List[Union[str, Tuple[str, Tuple]]]]
    """

    def _normalize(params: Dict[str, Any]) -> Tuple[Tuple[str, str], ...]:
        # Agent arguments reach this method with their values already converted to
        # strings, while ground truth values may be any JSON-serializable object.
        # Stringifying both sides the same way lets e.g. 5 match "5", and keeps the
        # resulting tuples hashable even when a value is a list or a dict.
        return tuple(sorted((str(key), str(value)) for key, value in params.items()))

    agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
    ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
    if use_parameter_matching:
        # When parameter matching is enabled, match on both tool name and parameters.
        agent_steps = [(name, _normalize(params)) for name, params in agent_tool_pairs]
        ground_truth_steps = [(name, _normalize(ground_truth_params.get(name, {}))) for name in ground_truth]
    else:
        # When parameter matching is disabled, only compare tool names.
        agent_steps = [name for name, _ in agent_tool_pairs]
        ground_truth_steps = list(ground_truth)

    return agent_steps, ground_truth_steps
123+
124+
def _calculate_precision_recall_f1_scores(self, agent_steps: List, ground_truth_steps: List) -> Dict[str, float]:
84125
"""Calculate precision, recall, and F1 scores."""
85126
if not agent_steps:
86127
return {"precision_score": 0.0, "recall_score": 0.0, "f1_score": 0.0}
87128

88129
# Count occurrences of each step in both lists to handle duplicates
89130
agent_steps_counts = Counter(agent_steps)
90-
ground_truth_counts = Counter(ground_truth)
131+
ground_truth_counts = Counter(ground_truth_steps)
91132

92133
# Calculate true positives by taking the minimum count for each common element
93134
# For each step, count the intersection (min count) of agent and ground truth steps
@@ -126,27 +167,27 @@ def _calculate_precision_recall_f1_scores(
126167
"f1_score": f1_score,
127168
}
128169

129-
def _calculate_exact_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
170+
def _calculate_exact_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
130171
"""Check if agent steps exactly match ground truth (order and content)."""
131-
return agent_steps == ground_truth
172+
return agent_steps == ground_truth_steps
132173

133-
def _calculate_in_order_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
174+
def _calculate_in_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
134175
"""Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
135-
if not ground_truth:
176+
if not ground_truth_steps:
136177
return True
137178

138179
gt_index = 0
139180
for step in agent_steps:
140-
if gt_index < len(ground_truth) and step == ground_truth[gt_index]:
181+
if gt_index < len(ground_truth_steps) and step == ground_truth_steps[gt_index]:
141182
gt_index += 1
142183

143-
return gt_index == len(ground_truth)
184+
return gt_index == len(ground_truth_steps)
144185

145-
def _calculate_any_order_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
186+
def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
146187
"""Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
147188
# Count occurrences of each step in both lists to handle duplicates
148189
agent_counts = Counter(agent_steps)
149-
ground_truth_counts = Counter(ground_truth)
190+
ground_truth_counts = Counter(ground_truth_steps)
150191

151192
# Check if agent has at least as many occurrences of each ground truth step
152193
return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
@@ -167,22 +208,71 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
167208
if not ground_truth:
168209
raise ValueError("ground_truth cannot be empty")
169210

170-
if not isinstance(ground_truth, list) or not all(isinstance(step, str) for step in ground_truth):
171-
raise TypeError("ground_truth must be a list of strings")
211+
# Check if ground_truth is a tuple (tool names + parameters) or list (tool names only)
212+
use_parameter_matching = False
213+
ground_truth_names = []
214+
ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
215+
216+
if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
217+
# Tuple format: (tool_names, parameters_dict)
218+
tool_names_list, params_dict = ground_truth
219+
220+
if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
221+
raise TypeError("ground_truth tuple first element must be a list of strings (tool names)")
222+
223+
if not isinstance(params_dict, dict):
224+
raise TypeError(
225+
"ground_truth tuple second element must be a dictionary mapping tool names to parameters"
226+
)
227+
228+
# Validate that all values in params_dict are dictionaries with string keys and JSON-serializable values
229+
for tool_name, params in params_dict.items():
230+
if not isinstance(tool_name, str):
231+
raise TypeError("ground_truth parameters dictionary keys must be strings (tool names)")
232+
if not isinstance(params, dict):
233+
raise TypeError(f"ground_truth parameters for tool '{tool_name}' must be a dictionary")
234+
for k, v in params.items():
235+
if not isinstance(k, str):
236+
raise TypeError(f"ground_truth parameters for tool '{tool_name}' must have string keys")
237+
try:
238+
json.dumps(v)
239+
except (TypeError, ValueError):
240+
raise TypeError(
241+
f"ground_truth parameters for tool '{tool_name}' must have JSON-serializable values (got type {type(v)} for key '{k}')"
242+
)
243+
244+
ground_truth_names = [name.strip() for name in tool_names_list]
245+
ground_truth_params_dict = params_dict
246+
use_parameter_matching = True
247+
248+
elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
249+
# List format: just tool names
250+
ground_truth_names = [step.strip() for step in ground_truth]
251+
use_parameter_matching = False
252+
253+
else:
254+
raise TypeError(
255+
"ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
256+
)
172257

173-
# Extract tool names from the response
174-
agent_steps = self._extract_tool_names_from_response(response)
258+
# Extract tool information from the response
259+
agent_tool_pairs = self._extract_tool_names_and_params_from_response(response)
175260

176-
agent_steps = [step.strip() for step in agent_steps]
177-
ground_truth = [step.strip() for step in ground_truth]
261+
# Prepare steps for comparison
262+
agent_steps, ground_truth_steps = self._prepare_steps_for_comparison(
263+
agent_tool_pairs,
264+
ground_truth_names,
265+
ground_truth_params_dict,
266+
use_parameter_matching,
267+
)
178268

179269
# Calculate precision, recall, and F1 scores
180-
metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth)
270+
metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
181271

182272
# Calculate binary match metrics
183-
exact_match = self._calculate_exact_match(agent_steps, ground_truth)
184-
in_order_match = self._calculate_in_order_match(agent_steps, ground_truth)
185-
any_order_match = self._calculate_any_order_match(agent_steps, ground_truth)
273+
exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
274+
in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
275+
any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
186276

187277
# Convert metrics to floats, using nan for None or non-convertible values
188278
path_efficiency_precision = (
@@ -215,6 +305,24 @@ def __call__( # type: ignore
215305
:rtype: Dict[str, Union[float, str]]
216306
"""
217307

308+
@overload
309+
def __call__( # type: ignore
310+
self,
311+
*,
312+
response: Union[str, List[Dict[str, Any]]],
313+
ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
314+
) -> Dict[str, Union[float, str]]:
315+
"""
316+
Evaluate the path efficiency of an agent's action sequence with tool parameters.
317+
318+
:keyword response: The agent's response containing tool calls.
319+
:paramtype response: Union[str, List[Dict[str, Any]]]
320+
:keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
321+
:paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
322+
:return: The path efficiency scores and results.
323+
:rtype: Dict[str, Union[float, str]]
324+
"""
325+
218326
@override
219327
def __call__(
220328
self,
@@ -226,8 +334,8 @@ def __call__(
226334
227335
:keyword response: The agent's response containing tool calls.
228336
:paramtype response: Union[str, List[Dict[str, Any]]]
229-
:keyword ground_truth: List of expected tool/action steps.
230-
:paramtype ground_truth: List[str]
337+
:keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
338+
:paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
231339
:return: The path efficiency scores and results.
232340
:rtype: Dict[str, Union[float, str]]
233341
"""

sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/path_efficiency.ipynb

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,38 @@
367367
" print(f\"Error with empty ground truth: {e}\")"
368368
]
369369
},
370+
{
371+
"cell_type": "markdown",
372+
"id": "a8f6dc32",
373+
"metadata": {},
374+
"source": [
375+
"#### Sample 8: Tuple Format with Parameters"
376+
]
377+
},
378+
{
379+
"cell_type": "code",
380+
"execution_count": null,
381+
"id": "1b1a1a0c",
382+
"metadata": {},
383+
"outputs": [],
384+
"source": [
385+
"# PathEfficiencyEvaluator also supports tuple format with parameters for exact parameter matching\n",
386+
"response_with_params = [\n",
387+
" {\n",
388+
" \"role\": \"assistant\",\n",
389+
" \"content\": [{\"type\": \"tool_call\", \"tool_call_id\": \"call_1\", \"name\": \"search\", \"arguments\": {\"query\": \"test\"}}],\n",
390+
" },\n",
391+
"]\n",
392+
"\n",
393+
"# Ground truth using tuple format: (tool_names, parameters_dict)\n",
394+
"# Parameters must match exactly for tools to be considered matching\n",
395+
"ground_truth_with_params = ([\"search\"], {\"search\": {\"query\": \"test\"}})\n",
396+
"\n",
397+
"result = path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params)\n",
398+
"print(\"\\nTuple Format with Parameters Results:\")\n",
399+
"pprint(result)"
400+
]
401+
},
370402
{
371403
"cell_type": "markdown",
372404
"id": "6741e8a0",

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,19 @@ def evaluation_evaluate_classes_methods(self):
599599
ground_truth = ["search", "analyze", "report"]
600600

601601
path_efficiency_evaluator(response=response, ground_truth=ground_truth)
602+
603+
# Also supports tuple format with parameters for exact parameter matching
604+
response_with_params = [
605+
{
606+
"role": "assistant",
607+
"content": [
608+
{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "test"}}
609+
],
610+
},
611+
]
612+
ground_truth_with_params = (["search"], {"search": {"query": "test"}})
613+
614+
path_efficiency_evaluator(response=response_with_params, ground_truth=ground_truth_with_params)
602615
# [END path_efficiency_evaluator]
603616

604617
# [START document_retrieval_evaluator]

0 commit comments

Comments
 (0)