11# ---------------------------------------------------------
22# Copyright (c) Microsoft Corporation. All rights reserved.
33# ---------------------------------------------------------
4+ import json
45from collections import Counter
5- from typing import Dict , List , Union , Any
6+ from typing import Dict , List , Union , Any , Tuple
67from typing_extensions import overload , override
78
89from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
@@ -36,14 +37,30 @@ class PathEfficiencyEvaluator(EvaluatorBase):
3637 f1_score_threshold=0.75
3738 )
3839
40+ # Example 1: Using simple tool names list
3941 result = path_efficiency_eval(
4042 response=[
41- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "determine_intent ", "arguments": {}}]},
42- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "use_tool ", "arguments": {}}]},
43- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "review_results ", "arguments": {}}]},
44- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report_generation ", "arguments": {}}]}
43+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call ", "arguments": {}}]},
44+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A ", "arguments": {}}]},
45+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B ", "arguments": {}}]},
46+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis ", "arguments": {}}]}
4547 ],
46- ground_truth=["determine_intent", "use_tool", "review_results", "report_generation"]
48+ ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
49+ )
50+
51+ # Example 2: Using tool names with parameters (exact parameter matching required)
52+ result = path_efficiency_eval(
53+ response=[
54+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
55+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
56+ ],
57+ ground_truth=(
58+ ["search", "format_result"],
59+ {
60+ "search": {"query": "weather", "location": "NYC"},
61+ "format_result": {"format": "json"}
62+ }
63+ )
4764 )
4865 """
4966
@@ -78,16 +95,40 @@ def __init__(
7895 "path_efficiency_f1" : f1_score_threshold ,
7996 }
8097
81- def _calculate_precision_recall_f1_scores (
82- self , agent_steps : List [str ], ground_truth : List [str ]
83- ) -> Dict [str , float ]:
98+ def _prepare_steps_for_comparison (
99+ self ,
100+ agent_tool_pairs : List [Tuple [str , Dict [str , Any ]]],
101+ ground_truth : List [str ],
102+ ground_truth_params : Dict [str , Dict [str , Any ]],
103+ use_parameter_matching : bool ,
104+ ) -> Tuple [
105+ List [Union [str , Tuple [str , Tuple ]]],
106+ List [Union [str , Tuple [str , Tuple ]]],
107+ ]:
108+ """Prepare agent and ground truth steps for comparison based on parameter matching mode."""
109+ agent_steps : List [Union [str , Tuple [str , Tuple ]]] = []
110+ ground_truth_steps : List [Union [str , Tuple [str , Tuple ]]] = []
111+ if use_parameter_matching :
112+ # When parameter matching is enabled, we need to match both tool name and parameters
113+ agent_steps = [(pair [0 ], tuple (sorted (pair [1 ].items ()))) for pair in agent_tool_pairs ]
114+ ground_truth_steps = [
115+ (name , tuple (sorted (ground_truth_params .get (name , {}).items ()))) for name in ground_truth
116+ ]
117+ else :
118+ # When parameter matching is disabled, only compare tool names
119+ agent_steps = [name for name , _ in agent_tool_pairs ]
120+ ground_truth_steps = [step for step in ground_truth ]
121+
122+ return agent_steps , ground_truth_steps
123+
124+ def _calculate_precision_recall_f1_scores (self , agent_steps : List , ground_truth_steps : List ) -> Dict [str , float ]:
84125 """Calculate precision, recall, and F1 scores."""
85126 if not agent_steps :
86127 return {"precision_score" : 0.0 , "recall_score" : 0.0 , "f1_score" : 0.0 }
87128
88129 # Count occurrences of each step in both lists to handle duplicates
89130 agent_steps_counts = Counter (agent_steps )
90- ground_truth_counts = Counter (ground_truth )
131+ ground_truth_counts = Counter (ground_truth_steps )
91132
92133 # Calculate true positives by taking the minimum count for each common element
93134 # For each step, count the intersection (min count) of agent and ground truth steps
@@ -126,27 +167,27 @@ def _calculate_precision_recall_f1_scores(
126167 "f1_score" : f1_score ,
127168 }
128169
129- def _calculate_exact_match (self , agent_steps : List [ str ], ground_truth : List [ str ] ) -> bool :
170+ def _calculate_exact_match (self , agent_steps : List , ground_truth_steps : List ) -> bool :
130171 """Check if agent steps exactly match ground truth (order and content)."""
131- return agent_steps == ground_truth
172+ return agent_steps == ground_truth_steps
132173
133- def _calculate_in_order_match (self , agent_steps : List [ str ], ground_truth : List [ str ] ) -> bool :
174+ def _calculate_in_order_match (self , agent_steps : List , ground_truth_steps : List ) -> bool :
134175 """Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
135- if not ground_truth :
176+ if not ground_truth_steps :
136177 return True
137178
138179 gt_index = 0
139180 for step in agent_steps :
140- if gt_index < len (ground_truth ) and step == ground_truth [gt_index ]:
181+ if gt_index < len (ground_truth_steps ) and step == ground_truth_steps [gt_index ]:
141182 gt_index += 1
142183
143- return gt_index == len (ground_truth )
184+ return gt_index == len (ground_truth_steps )
144185
145- def _calculate_any_order_match (self , agent_steps : List [ str ], ground_truth : List [ str ] ) -> bool :
186+ def _calculate_any_order_match (self , agent_steps : List , ground_truth_steps : List ) -> bool :
146187 """Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
147188 # Count occurrences of each step in both lists to handle duplicates
148189 agent_counts = Counter (agent_steps )
149- ground_truth_counts = Counter (ground_truth )
190+ ground_truth_counts = Counter (ground_truth_steps )
150191
151192 # Check if agent has at least as many occurrences of each ground truth step
152193 return all (agent_counts [step ] >= ground_truth_counts [step ] for step in ground_truth_counts )
@@ -167,22 +208,71 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
167208 if not ground_truth :
168209 raise ValueError ("ground_truth cannot be empty" )
169210
170- if not isinstance (ground_truth , list ) or not all (isinstance (step , str ) for step in ground_truth ):
171- raise TypeError ("ground_truth must be a list of strings" )
211+ # Check if ground_truth is a tuple (tool names + parameters) or list (tool names only)
212+ use_parameter_matching = False
213+ ground_truth_names = []
214+ ground_truth_params_dict : Dict [str , Dict [str , Any ]] = {}
215+
216+ if isinstance (ground_truth , tuple ) and len (ground_truth ) == 2 :
217+ # Tuple format: (tool_names, parameters_dict)
218+ tool_names_list , params_dict = ground_truth
219+
220+ if not isinstance (tool_names_list , list ) or not all (isinstance (name , str ) for name in tool_names_list ):
221+ raise TypeError ("ground_truth tuple first element must be a list of strings (tool names)" )
222+
223+ if not isinstance (params_dict , dict ):
224+ raise TypeError (
225+ "ground_truth tuple second element must be a dictionary mapping tool names to parameters"
226+ )
227+
228+ # Validate that every value in params_dict is a dictionary with string keys and JSON-serializable values
229+ for tool_name , params in params_dict .items ():
230+ if not isinstance (tool_name , str ):
231+ raise TypeError ("ground_truth parameters dictionary keys must be strings (tool names)" )
232+ if not isinstance (params , dict ):
233+ raise TypeError (f"ground_truth parameters for tool '{ tool_name } ' must be a dictionary" )
234+ for k , v in params .items ():
235+ if not isinstance (k , str ):
236+ raise TypeError (f"ground_truth parameters for tool '{ tool_name } ' must have string keys" )
237+ try :
238+ json .dumps (v )
239+ except (TypeError , ValueError ):
240+ raise TypeError (
241+ f"ground_truth parameters for tool '{ tool_name } ' must have JSON-serializable values (got type { type (v )} for key '{ k } ')"
242+ )
243+
244+ ground_truth_names = [name .strip () for name in tool_names_list ]
245+ ground_truth_params_dict = params_dict
246+ use_parameter_matching = True
247+
248+ elif isinstance (ground_truth , list ) and all (isinstance (step , str ) for step in ground_truth ):
249+ # List format: just tool names
250+ ground_truth_names = [step .strip () for step in ground_truth ]
251+ use_parameter_matching = False
252+
253+ else :
254+ raise TypeError (
255+ "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
256+ )
172257
173- # Extract tool names from the response
174- agent_steps = self ._extract_tool_names_from_response (response )
258+ # Extract tool information from the response
259+ agent_tool_pairs = self ._extract_tool_names_and_params_from_response (response )
175260
176- agent_steps = [step .strip () for step in agent_steps ]
177- ground_truth = [step .strip () for step in ground_truth ]
261+ # Prepare steps for comparison
262+ agent_steps , ground_truth_steps = self ._prepare_steps_for_comparison (
263+ agent_tool_pairs ,
264+ ground_truth_names ,
265+ ground_truth_params_dict ,
266+ use_parameter_matching ,
267+ )
178268
179269 # Calculate precision, recall, and F1 scores
180- metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth )
270+ metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
181271
182272 # Calculate binary match metrics
183- exact_match = self ._calculate_exact_match (agent_steps , ground_truth )
184- in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth )
185- any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth )
273+ exact_match = self ._calculate_exact_match (agent_steps , ground_truth_steps )
274+ in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth_steps )
275+ any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth_steps )
186276
187277 # Convert metrics to floats, using nan for None or non-convertible values
188278 path_efficiency_precision = (
@@ -215,6 +305,24 @@ def __call__( # type: ignore
215305 :rtype: Dict[str, Union[float, str]]
216306 """
217307
@overload
def __call__(  # type: ignore
    self,
    *,
    response: Union[str, List[Dict[str, Any]]],
    ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
) -> Dict[str, Union[float, str]]:
    """
    Evaluate the path efficiency of an agent's action sequence with tool parameters.

    Typing-only overload for the tuple form of ``ground_truth``. With this form a step
    matches only when both the tool name and its arguments equal the expected ones.

    :keyword response: The agent's response containing tool calls.
    :paramtype response: Union[str, List[Dict[str, Any]]]
    :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
        The parameters dict maps a tool name to the arguments expected for that tool.
    :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
    :return: The path efficiency scores and results.
    :rtype: Dict[str, Union[float, str]]
    """
325+
218326 @override
219327 def __call__ (
220328 self ,
@@ -226,8 +334,8 @@ def __call__(
226334
227335 :keyword response: The agent's response containing tool calls.
228336 :paramtype response: Union[str, List[Dict[str, Any]]]
229- :keyword ground_truth: List of expected tool/action steps.
230- :paramtype ground_truth: List[str]
337+ :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict) .
338+ :paramtype ground_truth: Union[ List[str], Tuple[List[str], Dict[str, Dict[str, str]]] ]
231339 :return: The path efficiency scores and results.
232340 :rtype: Dict[str, Union[float, str]]
233341 """
0 commit comments