1
1
# ---------------------------------------------------------
2
2
# Copyright (c) Microsoft Corporation. All rights reserved.
3
3
# ---------------------------------------------------------
4
- import json
4
+ from enum import Enum
5
5
from collections import Counter
6
+ import json
6
7
from typing import Dict , List , Union , Any , Tuple
7
8
from typing_extensions import overload , override
8
9
9
- from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
10
10
from azure .ai .evaluation ._constants import EVALUATION_PASS_FAIL_MAPPING
11
+ from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
12
+ from azure .ai .evaluation ._exceptions import (
13
+ ErrorCategory ,
14
+ ErrorTarget ,
15
+ EvaluationException ,
16
+ )
17
+
18
+
19
class TaskNavigationEfficiencyMatchingMode(str, Enum):
    """Matching techniques for scoring an agent's tool-call sequence.

    Selects the single criterion used when comparing the sequence of tool
    calls an agent actually made against the ground-truth (expected) path.
    The `str` mixin lets callers pass the plain string value interchangeably
    with the enum member.
    """

    EXACT_MATCH = "exact_match"
    """
    Strictest criterion: the agent's tool calls must be identical to the
    ground truth in both content and order — no extra steps, no missing
    steps, no reordering.
    """

    IN_ORDER_MATCH = "in_order_match"
    """
    Intermediate criterion: every ground-truth step must appear in the
    agent's sequence in the expected relative order; unrelated extra steps
    may be interspersed.
    """

    ANY_ORDER_MATCH = "any_order_match"
    """
    Most lenient criterion: every ground-truth step must appear in the
    agent's sequence with at least the required frequency; ordering and
    extra steps are ignored.
    """
11
50
12
51
13
- class PathEfficiencyEvaluator (EvaluatorBase ):
52
+ class TaskNavigationEfficiencyEvaluator (EvaluatorBase ):
14
53
"""
15
54
Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
16
55
17
- The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
18
- between the agent's tool usage trajectory and the ground truth expected steps. It also provides
19
- three binary match metrics: exact match, in-order match (allows extra steps) , and any-order match (allows extra steps and ignores order) .
56
+ The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
57
+ It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
58
+ It also returns precision, recall , and F1 scores in properties bag .
20
59
21
- :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
22
- :type precision_threshold: float
23
- :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
24
- :type recall_threshold: float
25
- :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
26
- :type f1_score_threshold: float
60
+ :param matching_mode: The matching mode to use. Default is "exact_match".
61
+ :type matching_mode: enum[str, TaskNavigationEfficiencyMatchingMode]
27
62
28
63
.. admonition:: Example:
29
64
30
65
.. code-block:: python
31
66
32
- from azure.ai.evaluation import PathEfficiencyEvaluator
67
+ from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator
33
68
34
- path_efficiency_eval = PathEfficiencyEvaluator(
35
- precision_threshold=0.7,
36
- recall_threshold=0.8,
37
- f1_score_threshold=0.75
69
+ task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
70
+ matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
38
71
)
39
72
40
73
# Example 1: Using simple tool names list
@@ -64,36 +97,39 @@ class PathEfficiencyEvaluator(EvaluatorBase):
64
97
)
65
98
"""
66
99
67
- _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
68
-
69
- id = "azureai://built-in/evaluators/path_efficiency"
100
+ id = "azureai://built-in/evaluators/task_navigation_efficiency"
70
101
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
71
102
103
+ matching_mode : TaskNavigationEfficiencyMatchingMode
104
+ """The matching mode to use."""
105
+
72
106
@override
73
107
def __init__ (
74
108
self ,
75
109
* ,
76
- precision_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
77
- recall_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
78
- f1_score_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
110
+ matching_mode : Union [
111
+ str , TaskNavigationEfficiencyMatchingMode
112
+ ] = TaskNavigationEfficiencyMatchingMode . EXACT_MATCH ,
79
113
):
80
- self ._higher_is_better = True
81
- super ().__init__ ()
114
+ # Type checking for metric parameter
115
+ if isinstance (matching_mode , str ):
116
+ try :
117
+ self .matching_mode = TaskNavigationEfficiencyMatchingMode (matching_mode )
118
+ except ValueError :
119
+ raise ValueError (
120
+ f"matching_mode must be one of { [m .value for m in TaskNavigationEfficiencyMatchingMode ]} , got '{ matching_mode } '"
121
+ )
122
+ elif isinstance (matching_mode , TaskNavigationEfficiencyMatchingMode ):
123
+ self .matching_mode = matching_mode
124
+ else :
125
+ raise EvaluationException (
126
+ f"matching_mode must be a string with one of { [m .value for m in TaskNavigationEfficiencyMatchingMode ]} or TaskNavigationEfficiencyMatchingMode enum, got { type (matching_mode )} " ,
127
+ internal_message = str (matching_mode ),
128
+ target = ErrorTarget .TASK_NAVIGATION_EFFICIENCY_EVALUATOR ,
129
+ category = ErrorCategory .INVALID_VALUE ,
130
+ )
82
131
83
- # Type checking for threshold parameters
84
- for name , value in [
85
- ("precision_threshold" , precision_threshold ),
86
- ("recall_threshold" , recall_threshold ),
87
- ("f1_score_threshold" , f1_score_threshold ),
88
- ]:
89
- if not isinstance (value , float ):
90
- raise TypeError (f"{ name } must be a float, got { type (value )} " )
91
-
92
- self ._threshold = {
93
- "path_efficiency_precision" : precision_threshold ,
94
- "path_efficiency_recall" : recall_threshold ,
95
- "path_efficiency_f1" : f1_score_threshold ,
96
- }
132
+ super ().__init__ ()
97
133
98
134
def _prepare_steps_for_comparison (
99
135
self ,
@@ -192,14 +228,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List
192
228
# Check if agent has at least as many occurrences of each ground truth step
193
229
return all (agent_counts [step ] >= ground_truth_counts [step ] for step in ground_truth_counts )
194
230
231
+ _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
232
+ TaskNavigationEfficiencyMatchingMode .EXACT_MATCH : _calculate_exact_match ,
233
+ TaskNavigationEfficiencyMatchingMode .IN_ORDER_MATCH : _calculate_in_order_match ,
234
+ TaskNavigationEfficiencyMatchingMode .ANY_ORDER_MATCH : _calculate_any_order_match ,
235
+ }
236
+
195
237
@override
196
- async def _do_eval (self , eval_input : Dict ) -> Dict [str , Union [float , str ]]:
238
+ async def _do_eval (self , eval_input : Dict ) -> Dict [str , Union [float , str , Dict [ str , float ] ]]:
197
239
"""Produce a path efficiency evaluation result.
198
240
199
241
:param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
200
242
:type eval_input: Dict
201
243
:return: The evaluation result.
202
- :rtype: Dict[str, Union[float, str]]
244
+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
203
245
"""
204
246
response = eval_input ["response" ]
205
247
ground_truth = eval_input ["ground_truth" ]
@@ -244,12 +286,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
244
286
ground_truth_names = [name .strip () for name in tool_names_list ]
245
287
ground_truth_params_dict = params_dict
246
288
use_parameter_matching = True
247
-
248
289
elif isinstance (ground_truth , list ) and all (isinstance (step , str ) for step in ground_truth ):
249
290
# List format: just tool names
250
291
ground_truth_names = [step .strip () for step in ground_truth ]
251
292
use_parameter_matching = False
252
-
253
293
else :
254
294
raise TypeError (
255
295
"ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +307,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
267
307
)
268
308
269
309
# Calculate precision, recall, and F1 scores
270
- metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
271
-
272
- # Calculate binary match metrics
273
- exact_match = self ._calculate_exact_match (agent_steps , ground_truth_steps )
274
- in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth_steps )
275
- any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth_steps )
310
+ additional_properties_metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
276
311
277
312
# Convert metrics to floats, using nan for None or non-convertible values
278
- path_efficiency_precision = (
279
- float (metrics ["precision_score" ]) if metrics ["precision_score" ] is not None else float ("nan" )
280
- )
281
- path_efficiency_recall = float (metrics ["recall_score" ]) if metrics ["recall_score" ] is not None else float ("nan" )
282
- path_efficiency_f1_score = float (metrics ["f1_score" ]) if metrics ["f1_score" ] is not None else float ("nan" )
313
+ for metric , score in additional_properties_metrics .items ():
314
+ additional_properties_metrics [metric ] = float (score ) if score is not None else float ("nan" )
283
315
284
- return {
285
- "path_efficiency_precision_score" : path_efficiency_precision ,
286
- "path_efficiency_recall_score" : path_efficiency_recall ,
287
- "path_efficiency_f1_score" : path_efficiency_f1_score ,
288
- "path_efficiency_exact_match_result" : EVALUATION_PASS_FAIL_MAPPING [exact_match ],
289
- "path_efficiency_in_order_match_result" : EVALUATION_PASS_FAIL_MAPPING [in_order_match ],
290
- "path_efficiency_any_order_match_result" : EVALUATION_PASS_FAIL_MAPPING [any_order_match ],
291
- }
316
+ if self .matching_mode in self ._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS :
317
+ # Calculate binary match metrics
318
+ match_result = self ._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS [self .matching_mode ](
319
+ self , agent_steps , ground_truth_steps
320
+ )
321
+
322
+ return {
323
+ "task_navigation_efficiency_result" : EVALUATION_PASS_FAIL_MAPPING [match_result ],
324
+ "properties" : additional_properties_metrics ,
325
+ }
326
+ else :
327
+ raise EvaluationException (
328
+ f"Unsupported matching_mode '{ self .matching_mode } '" ,
329
+ internal_message = str (self .matching_mode ),
330
+ target = ErrorTarget .TASK_NAVIGATION_EFFICIENCY_EVALUATOR ,
331
+ category = ErrorCategory .INVALID_VALUE ,
332
+ )
292
333
293
334
@overload
294
335
def __call__ ( # type: ignore
295
336
self , * , response : Union [str , List [Dict [str , Any ]]], ground_truth : List [str ]
296
- ) -> Dict [str , Union [float , str ]]:
337
+ ) -> Dict [str , Union [float , str , Dict [ str , float ] ]]:
297
338
"""
298
- Evaluate the path efficiency of an agent's action sequence.
339
+ Evaluate the task navigation efficiency of an agent's action sequence.
299
340
300
341
:keyword response: The agent's response containing tool calls.
301
342
:paramtype response: Union[str, List[Dict[str, Any]]]
302
343
:keyword ground_truth: List of expected tool/action steps.
303
344
:paramtype ground_truth: List[str]
304
- :return: The path efficiency scores and results.
305
- :rtype: Dict[str, Union[float, str]]
345
+ :return: The task navigation efficiency scores and results.
346
+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
306
347
"""
307
348
308
349
@overload
@@ -311,16 +352,16 @@ def __call__( # type: ignore
311
352
* ,
312
353
response : Union [str , List [Dict [str , Any ]]],
313
354
ground_truth : Tuple [List [str ], Dict [str , Dict [str , str ]]],
314
- ) -> Dict [str , Union [float , str ]]:
355
+ ) -> Dict [str , Union [float , str , Dict [ str , float ] ]]:
315
356
"""
316
- Evaluate the path efficiency of an agent's action sequence with tool parameters.
357
+ Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.
317
358
318
359
:keyword response: The agent's response containing tool calls.
319
360
:paramtype response: Union[str, List[Dict[str, Any]]]
320
361
:keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
321
362
:paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
322
- :return: The path efficiency scores and results.
323
- :rtype: Dict[str, Union[float, str]]
363
+ :return: The task navigation efficiency scores and results.
364
+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
324
365
"""
325
366
326
367
@override
@@ -330,13 +371,13 @@ def __call__(
330
371
** kwargs ,
331
372
):
332
373
"""
333
- Evaluate path efficiency.
374
+ Evaluate task navigation efficiency.
334
375
335
376
:keyword response: The agent's response containing tool calls.
336
377
:paramtype response: Union[str, List[Dict[str, Any]]]
337
378
:keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
338
379
:paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
339
- :return: The path efficiency scores and results.
340
- :rtype: Dict[str, Union[float, str]]
380
+ :return: The task navigation efficiency scores and results.
381
+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
341
382
"""
342
383
return super ().__call__ (* args , ** kwargs )
0 commit comments