11# ---------------------------------------------------------
22# Copyright (c) Microsoft Corporation. All rights reserved.
33# ---------------------------------------------------------
4- import json
4+ from enum import Enum
55from collections import Counter
6+ import json
67from typing import Dict , List , Union , Any , Tuple
78from typing_extensions import overload , override
89
9- from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
1010from azure .ai .evaluation ._constants import EVALUATION_PASS_FAIL_MAPPING
11+ from azure .ai .evaluation ._evaluators ._common import EvaluatorBase
12+ from azure .ai .evaluation ._exceptions import (
13+ ErrorCategory ,
14+ ErrorTarget ,
15+ EvaluationException ,
16+ )
17+
18+
class TaskNavigationEfficiencyMatchingMode(str, Enum):
    """Matching technique used when scoring an agent's tool-call sequence.

    Selects the single criterion that is applied when the agent's sequence of
    tool calls is compared against the ground-truth path.
    """

    EXACT_MATCH = "exact_match"
    """
    Strictest criterion: the agent's tool-call sequence must be identical to
    the ground truth in both content and order -- no extra steps, no missing
    steps, no reordering.
    """

    IN_ORDER_MATCH = "in_order_match"
    """
    Ordered-subsequence criterion: every ground-truth step must appear in the
    agent's sequence in the expected relative order; unrelated extra steps may
    be interleaved between them.
    """

    ANY_ORDER_MATCH = "any_order_match"
    """
    Most lenient criterion: every ground-truth step must occur in the agent's
    sequence with at least the required multiplicity, regardless of order and
    with extra steps permitted.
    """
1150
1251
13- class PathEfficiencyEvaluator (EvaluatorBase ):
52+ class TaskNavigationEfficiencyEvaluator (EvaluatorBase ):
1453 """
1554 Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
1655
17- The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
18- between the agent's tool usage trajectory and the ground truth expected steps. It also provides
19- three binary match metrics: exact match, in-order match (allows extra steps) , and any-order match (allows extra steps and ignores order) .
56+ The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
57+ It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
58+ It also returns precision, recall , and F1 scores in properties bag .
2059
21- :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
22- :type precision_threshold: float
23- :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
24- :type recall_threshold: float
25- :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
26- :type f1_score_threshold: float
60+ :param matching_mode: The matching mode to use. Default is "exact_match".
61+ :type matching_mode: enum[str, TaskNavigationEfficiencyMatchingMode]
2762
2863 .. admonition:: Example:
2964
3065 .. code-block:: python
3166
32- from azure.ai.evaluation import PathEfficiencyEvaluator
67+ from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator
3368
34- path_efficiency_eval = PathEfficiencyEvaluator(
35- precision_threshold=0.7,
36- recall_threshold=0.8,
37- f1_score_threshold=0.75
69+ task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
70+ matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
3871 )
3972
4073 # Example 1: Using simple tool names list
@@ -64,36 +97,39 @@ class PathEfficiencyEvaluator(EvaluatorBase):
6497 )
6598 """
6699
67- _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
68-
69- id = "azureai://built-in/evaluators/path_efficiency"
100+ id = "azureai://built-in/evaluators/task_navigation_efficiency"
70101 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
71102
103+ matching_mode : TaskNavigationEfficiencyMatchingMode
104+ """The matching mode to use."""
105+
72106 @override
73107 def __init__ (
74108 self ,
75109 * ,
76- precision_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
77- recall_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
78- f1_score_threshold : float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD ,
110+ matching_mode : Union [
111+ str , TaskNavigationEfficiencyMatchingMode
112+ ] = TaskNavigationEfficiencyMatchingMode . EXACT_MATCH ,
79113 ):
80- self ._higher_is_better = True
81- super ().__init__ ()
114+ # Type checking for metric parameter
115+ if isinstance (matching_mode , str ):
116+ try :
117+ self .matching_mode = TaskNavigationEfficiencyMatchingMode (matching_mode )
118+ except ValueError :
119+ raise ValueError (
120+ f"matching_mode must be one of { [m .value for m in TaskNavigationEfficiencyMatchingMode ]} , got '{ matching_mode } '"
121+ )
122+ elif isinstance (matching_mode , TaskNavigationEfficiencyMatchingMode ):
123+ self .matching_mode = matching_mode
124+ else :
125+ raise EvaluationException (
126+ f"matching_mode must be a string with one of { [m .value for m in TaskNavigationEfficiencyMatchingMode ]} or TaskNavigationEfficiencyMatchingMode enum, got { type (matching_mode )} " ,
127+ internal_message = str (matching_mode ),
128+ target = ErrorTarget .TASK_NAVIGATION_EFFICIENCY_EVALUATOR ,
129+ category = ErrorCategory .INVALID_VALUE ,
130+ )
82131
83- # Type checking for threshold parameters
84- for name , value in [
85- ("precision_threshold" , precision_threshold ),
86- ("recall_threshold" , recall_threshold ),
87- ("f1_score_threshold" , f1_score_threshold ),
88- ]:
89- if not isinstance (value , float ):
90- raise TypeError (f"{ name } must be a float, got { type (value )} " )
91-
92- self ._threshold = {
93- "path_efficiency_precision" : precision_threshold ,
94- "path_efficiency_recall" : recall_threshold ,
95- "path_efficiency_f1" : f1_score_threshold ,
96- }
132+ super ().__init__ ()
97133
98134 def _prepare_steps_for_comparison (
99135 self ,
@@ -192,14 +228,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List
192228 # Check if agent has at least as many occurrences of each ground truth step
193229 return all (agent_counts [step ] >= ground_truth_counts [step ] for step in ground_truth_counts )
194230
    # Dispatch table mapping each matching mode to the comparison routine that
    # implements it. NOTE: because this dict is built in the class body, the
    # values are plain (unbound) functions, so invokers must pass `self`
    # explicitly, e.g. _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[
    # mode](self, agent_steps, ground_truth_steps) as done in _do_eval.
    _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
        TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
        TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
        TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
    }
236+
195237 @override
196- async def _do_eval (self , eval_input : Dict ) -> Dict [str , Union [float , str ]]:
238+ async def _do_eval (self , eval_input : Dict ) -> Dict [str , Union [float , str , Dict [ str , float ] ]]:
197239 """Produce a path efficiency evaluation result.
198240
199241 :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
200242 :type eval_input: Dict
201243 :return: The evaluation result.
202- :rtype: Dict[str, Union[float, str]]
244+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
203245 """
204246 response = eval_input ["response" ]
205247 ground_truth = eval_input ["ground_truth" ]
@@ -244,12 +286,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
244286 ground_truth_names = [name .strip () for name in tool_names_list ]
245287 ground_truth_params_dict = params_dict
246288 use_parameter_matching = True
247-
248289 elif isinstance (ground_truth , list ) and all (isinstance (step , str ) for step in ground_truth ):
249290 # List format: just tool names
250291 ground_truth_names = [step .strip () for step in ground_truth ]
251292 use_parameter_matching = False
252-
253293 else :
254294 raise TypeError (
255295 "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +307,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
267307 )
268308
269309 # Calculate precision, recall, and F1 scores
270- metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
271-
272- # Calculate binary match metrics
273- exact_match = self ._calculate_exact_match (agent_steps , ground_truth_steps )
274- in_order_match = self ._calculate_in_order_match (agent_steps , ground_truth_steps )
275- any_order_match = self ._calculate_any_order_match (agent_steps , ground_truth_steps )
310+ additional_properties_metrics = self ._calculate_precision_recall_f1_scores (agent_steps , ground_truth_steps )
276311
277312 # Convert metrics to floats, using nan for None or non-convertible values
278- path_efficiency_precision = (
279- float (metrics ["precision_score" ]) if metrics ["precision_score" ] is not None else float ("nan" )
280- )
281- path_efficiency_recall = float (metrics ["recall_score" ]) if metrics ["recall_score" ] is not None else float ("nan" )
282- path_efficiency_f1_score = float (metrics ["f1_score" ]) if metrics ["f1_score" ] is not None else float ("nan" )
313+ for metric , score in additional_properties_metrics .items ():
314+ additional_properties_metrics [metric ] = float (score ) if score is not None else float ("nan" )
283315
284- return {
285- "path_efficiency_precision_score" : path_efficiency_precision ,
286- "path_efficiency_recall_score" : path_efficiency_recall ,
287- "path_efficiency_f1_score" : path_efficiency_f1_score ,
288- "path_efficiency_exact_match_result" : EVALUATION_PASS_FAIL_MAPPING [exact_match ],
289- "path_efficiency_in_order_match_result" : EVALUATION_PASS_FAIL_MAPPING [in_order_match ],
290- "path_efficiency_any_order_match_result" : EVALUATION_PASS_FAIL_MAPPING [any_order_match ],
291- }
316+ if self .matching_mode in self ._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS :
317+ # Calculate binary match metrics
318+ match_result = self ._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS [self .matching_mode ](
319+ self , agent_steps , ground_truth_steps
320+ )
321+
322+ return {
323+ "task_navigation_efficiency_result" : EVALUATION_PASS_FAIL_MAPPING [match_result ],
324+ "properties" : additional_properties_metrics ,
325+ }
326+ else :
327+ raise EvaluationException (
328+ f"Unsupported matching_mode '{ self .matching_mode } '" ,
329+ internal_message = str (self .matching_mode ),
330+ target = ErrorTarget .TASK_NAVIGATION_EFFICIENCY_EVALUATOR ,
331+ category = ErrorCategory .INVALID_VALUE ,
332+ )
292333
    @overload
    def __call__(  # type: ignore
        self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
        """
        Evaluate the task navigation efficiency of an agent's action sequence.

        Overload for the simple ground-truth format: a plain list of expected
        tool names, with no tool-parameter matching.

        :keyword response: The agent's response containing tool calls.
        :paramtype response: Union[str, List[Dict[str, Any]]]
        :keyword ground_truth: List of expected tool/action steps.
        :paramtype ground_truth: List[str]
        :return: The task navigation efficiency scores and results.
        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
        """
308349 @overload
@@ -311,16 +352,16 @@ def __call__( # type: ignore
311352 * ,
312353 response : Union [str , List [Dict [str , Any ]]],
313354 ground_truth : Tuple [List [str ], Dict [str , Dict [str , str ]]],
314- ) -> Dict [str , Union [float , str ]]:
355+ ) -> Dict [str , Union [float , str , Dict [ str , float ] ]]:
315356 """
316- Evaluate the path efficiency of an agent's action sequence with tool parameters.
357+ Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.
317358
318359 :keyword response: The agent's response containing tool calls.
319360 :paramtype response: Union[str, List[Dict[str, Any]]]
320361 :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
321362 :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
322- :return: The path efficiency scores and results.
323- :rtype: Dict[str, Union[float, str]]
363+ :return: The task navigation efficiency scores and results.
364+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
324365 """
325366
326367 @override
@@ -330,13 +371,13 @@ def __call__(
330371 ** kwargs ,
331372 ):
332373 """
333- Evaluate path efficiency.
374+ Evaluate task navigation efficiency.
334375
335376 :keyword response: The agent's response containing tool calls.
336377 :paramtype response: Union[str, List[Dict[str, Any]]]
337378 :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
338379 :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
339- :return: The path efficiency scores and results.
340- :rtype: Dict[str, Union[float, str]]
380+ :return: The task navigation efficiency scores and results.
381+ :rtype: Dict[str, Union[float, str, Dict[str, float] ]]
341382 """
342383 return super ().__call__ (* args , ** kwargs )
0 commit comments