1
1
# ---------------------------------------------------------
2
2
# Copyright (c) Microsoft Corporation. All rights reserved.
3
3
# ---------------------------------------------------------
4
+ import json
4
5
from collections import Counter
5
- from typing import Dict , List , Union , Any
6
+ from typing import Dict , List , Union , Any , Tuple
6
7
from typing_extensions import overload , override
7
8
8
9
from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
@@ -36,14 +37,30 @@ class PathEfficiencyEvaluator(EvaluatorBase):
36
37
f1_score_threshold=0.75
37
38
)
38
39
40
+ # Example 1: Using simple tool names list
39
41
result = path_efficiency_eval(
40
42
response=[
41
- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "determine_intent ", "arguments": {}}]},
42
- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "use_tool ", "arguments": {}}]},
43
- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "review_results ", "arguments": {}}]},
44
- {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report_generation ", "arguments": {}}]}
43
+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call ", "arguments": {}}]},
44
+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A ", "arguments": {}}]},
45
+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B ", "arguments": {}}]},
46
+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis ", "arguments": {}}]}
45
47
],
46
- ground_truth=["determine_intent", "use_tool", "review_results", "report_generation"]
48
+ ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
49
+ )
50
+
51
+ # Example 2: Using tool names with parameters (exact parameter matching required)
52
+ result = path_efficiency_eval(
53
+ response=[
54
+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
55
+ {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
56
+ ],
57
+ ground_truth=(
58
+ ["search", "format_result"],
59
+ {
60
+ "search": {"query": "weather", "location": "NYC"},
61
+ "format_result": {"format": "json"}
62
+ }
63
+ )
47
64
)
48
65
"""
49
66
@@ -78,16 +95,40 @@ def __init__(
78
95
"path_efficiency_f1" : f1_score_threshold ,
79
96
}
80
97
81
- def _calculate_precision_recall_f1_scores (
82
- self , agent_steps : List [str ], ground_truth : List [str ]
83
- ) -> Dict [str , float ]:
98
+ def _prepare_steps_for_comparison (
99
+ self ,
100
+ agent_tool_pairs : List [Tuple [str , Dict [str , Any ]]],
101
+ ground_truth : List [str ],
102
+ ground_truth_params : Dict [str , Dict [str , Any ]],
103
+ use_parameter_matching : bool ,
104
+ ) -> Tuple [
105
+ List [Union [str , Tuple [str , Tuple ]]],
106
+ List [Union [str , Tuple [str , Tuple ]]],
107
+ ]:
108
+ """Prepare agent and ground truth steps for comparison based on parameter matching mode."""
109
+ agent_steps : List [Union [str , Tuple [str , Tuple ]]] = []
110
+ ground_truth_steps : List [Union [str , Tuple [str , Tuple ]]] = []
111
+ if use_parameter_matching :
112
+ # When parameter matching is enabled, we need to match both tool name and parameters
113
+ agent_steps = [(pair [0 ], tuple (sorted (pair [1 ].items ()))) for pair in agent_tool_pairs ]
114
+ ground_truth_steps = [
115
+ (name , tuple (sorted (ground_truth_params .get (name , {}).items ()))) for name in ground_truth
116
+ ]
117
+ else :
118
+ # When parameter matching is disabled, only compare tool names
119
+ agent_steps = [name for name , _ in agent_tool_pairs ]
120
+ ground_truth_steps = [step for step in ground_truth ]
121
+
122
+ return agent_steps , ground_truth_steps
123
+
124
+ def _calculate_precision_recall_f1_scores (self , agent_steps : List , ground_truth_steps : List ) -> Dict [str , float ]:
84
125
"""Calculate precision, recall, and F1 scores."""
85
126
if not agent_steps :
86
127
return {"precision_score" : 0.0 , "recall_score" : 0.0 , "f1_score" : 0.0 }
87
128
88
129
# Count occurrences of each step in both lists to handle duplicates
89
130
agent_steps_counts = Counter (agent_steps )
90
- ground_truth_counts = Counter (ground_truth )
131
+ ground_truth_counts = Counter (ground_truth_steps )
91
132
92
133
# Calculate true positives by taking the minimum count for each common element
93
134
# For each step, count the intersection (min count) of agent and ground truth steps
@@ -126,27 +167,27 @@ def _calculate_precision_recall_f1_scores(
126
167
"f1_score" : f1_score ,
127
168
}
128
169
129
- def _calculate_exact_match (self , agent_steps : List [ str ], ground_truth : List [ str ] ) -> bool :
170
+ def _calculate_exact_match (self , agent_steps : List , ground_truth_steps : List ) -> bool :
130
171
"""Check if agent steps exactly match ground truth (order and content)."""
131
- return agent_steps == ground_truth
172
+ return agent_steps == ground_truth_steps
132
173
133
- def _calculate_in_order_match (self , agent_steps : List [ str ], ground_truth : List [ str ] ) -> bool :
174
+ def _calculate_in_order_match (self , agent_steps : List , ground_truth_steps : List ) -> bool :
134
175
"""Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
135
- if not ground_truth :
176
+ if not ground_truth_steps :
136
177
return True
137
178
138
179
gt_index = 0
139
180
for step in agent_steps :
140
- if gt_index < len (ground_truth ) and step == ground_truth [gt_index ]:
181
+ if gt_index < len (ground_truth_steps ) and step == ground_truth_steps [gt_index ]:
141
182
gt_index += 1
142
183
143
- return gt_index == len (ground_truth )
184
+ return gt_index == len (ground_truth_steps )
144
185
145
- def _calculate_any_order_match (self , agent_steps : List [ str ], ground_truth : List [ str ] ) -> bool :
186
+ def _calculate_any_order_match (self , agent_steps : List , ground_truth_steps : List ) -> bool :
146
187
"""Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
147
188
# Count occurrences of each step in both lists to handle duplicates
148
189
agent_counts = Counter (agent_steps )
149
- ground_truth_counts = Counter (ground_truth )
190
+ ground_truth_counts = Counter (ground_truth_steps )
150
191
151
192
# Check if agent has at least as many occurrences of each ground truth step
152
193
return all (agent_counts [step ] >= ground_truth_counts [step ] for step in ground_truth_counts )
@@ -167,22 +208,71 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
167
208
if not ground_truth :
168
209
raise ValueError ("ground_truth cannot be empty" )
169
210
170
- if not isinstance (ground_truth , list ) or not all (isinstance (step , str ) for step in ground_truth ):
171
- raise TypeError ("ground_truth must be a list of strings" )
211
+ # Check if ground_truth is a tuple (tool names + parameters) or list (tool names only)
212
+ use_parameter_matching = False
213
+ ground_truth_names = []
214
+ ground_truth_params_dict : Dict [str , Dict [str , Any ]] = {}
215
+
216
+ if isinstance (ground_truth , tuple ) and len (ground_truth ) == 2 :
217
+ # Tuple format: (tool_names, parameters_dict)
218
+ tool_names_list , params_dict = ground_truth
219
+
220
+ if not isinstance (tool_names_list , list ) or not all (isinstance (name , str ) for name in tool_names_list ):
221
+ raise TypeError ("ground_truth tuple first element must be a list of strings (tool names)" )
222
+
223
+ if not isinstance (params_dict , dict ):
224
+ raise TypeError (
225
+ "ground_truth tuple second element must be a dictionary mapping tool names to parameters"
226
+ )
227
+
228
+ # Validate that every value in params_dict is a dictionary with string keys and JSON-serializable values
229
+ for tool_name , params in params_dict .items ():
230
+ if not isinstance (tool_name , str ):
231
+ raise TypeError ("ground_truth parameters dictionary keys must be strings (tool names)" )
232
+ if not isinstance (params , dict ):
233
+ raise TypeError (f"ground_truth parameters for tool '{ tool_name } ' must be a dictionary" )
234
+ for k , v in params .items ():
235
+ if not isinstance (k , str ):
236
+ raise TypeError (f"ground_truth parameters for tool '{ tool_name } ' must have string keys" )
237
+ try :
238
+ json .dumps (v )
239
+ except (TypeError , ValueError ):
240
+ raise TypeError (
241
+ f"ground_truth parameters for tool '{ tool_name } ' must have JSON-serializable values (got type { type (v )} for key '{ k } ')"
242
+ )
243
+
244
+ ground_truth_names = [name .strip () for name in tool_names_list ]
245
+ ground_truth_params_dict = params_dict
246
+ use_parameter_matching = True
247
+
248
+ elif isinstance (ground_truth , list ) and all (isinstance (step , str ) for step in ground_truth ):
249
+ # List format: just tool names
250
+ ground_truth_names = [step .strip () for step in ground_truth ]
251
+ use_parameter_matching = False
252
+
253
+ else :
254
+ raise TypeError (
255
+ "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
256
+ )
172
257
173
- # Extract tool names from the response
174
- agent_steps = self ._extract_tool_names_from_response (response )
258
+ # Extract tool information from the response
259
+ agent_tool_pairs = self ._extract_tool_names_and_params_from_response (response )
175
260
176
- agent_steps = [step .strip () for step in agent_steps ]
177
- ground_truth = [step .strip () for step in ground_truth ]
261
+ # Prepare steps for comparison
262
+ agent_steps , ground_truth_steps = self ._prepare_steps_for_comparison (
263
+ agent_tool_pairs ,
264
+ ground_truth_names ,
265
+ ground_truth_params_dict ,
266
+ use_parameter_matching ,
267
+ )
178
268
179
269
# Calculate precision, recall, and F1 scores
180
- metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth )
270
+ metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
181
271
182
272
# Calculate binary match metrics
183
- exact_match = self ._calculate_exact_match (agent_steps , ground_truth )
184
- in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth )
185
- any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth )
273
+ exact_match = self ._calculate_exact_match (agent_steps , ground_truth_steps )
274
+ in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth_steps )
275
+ any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth_steps )
186
276
187
277
# Convert metrics to floats, using nan for None or non-convertible values
188
278
path_efficiency_precision = (
@@ -215,6 +305,24 @@ def __call__( # type: ignore
215
305
:rtype: Dict[str, Union[float, str]]
216
306
"""
217
307
308
+ @overload
309
+ def __call__ ( # type: ignore
310
+ self ,
311
+ * ,
312
+ response : Union [str , List [Dict [str , Any ]]],
313
+ ground_truth : Tuple [List [str ], Dict [str , Dict [str , str ]]],
314
+ ) -> Dict [str , Union [float , str ]]:
315
+ """
316
+ Evaluate the path efficiency of an agent's action sequence with tool parameters.
317
+
318
+ :keyword response: The agent's response containing tool calls.
319
+ :paramtype response: Union[str, List[Dict[str, Any]]]
320
+ :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
321
+ :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
322
+ :return: The path efficiency scores and results.
323
+ :rtype: Dict[str, Union[float, str]]
324
+ """
325
+
218
326
@override
219
327
def __call__ (
220
328
self ,
@@ -226,8 +334,8 @@ def __call__(
226
334
227
335
:keyword response: The agent's response containing tool calls.
228
336
:paramtype response: Union[str, List[Dict[str, Any]]]
229
- :keyword ground_truth: List of expected tool/action steps.
230
- :paramtype ground_truth: List[str]
337
+ :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict) .
338
+ :paramtype ground_truth: Union[ List[str], Tuple[List[str], Dict[str, Dict[str, str]]] ]
231
339
:return: The path efficiency scores and results.
232
340
:rtype: Dict[str, Union[float, str]]
233
341
"""
0 commit comments