
Commit 65f6f1a

Rename Path Efficiency to Task Navigation Efficiency Evaluator and Return Only One Metric (#43248)
* Rename Path Efficiency to Task Navigation Efficiency and Return Only One Metric
* Rename files
1 parent 772ee5a commit 65f6f1a

File tree

9 files changed: 449 additions & 647 deletions


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py

Lines changed: 0 additions & 7 deletions
This file was deleted.

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode
+
+__all__ = ["TaskNavigationEfficiencyEvaluator", "TaskNavigationEfficiencyMatchingMode"]
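With the new package `__init__.py` above, both public symbols come from the renamed module. A quick import sanity check (the package path is taken from the rename in this commit):

```python
# Import check for the renamed package; path follows this commit's rename.
from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
    TaskNavigationEfficiencyEvaluator,
    TaskNavigationEfficiencyMatchingMode,
)

print(TaskNavigationEfficiencyMatchingMode.EXACT_MATCH.value)  # "exact_match"
```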

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py renamed to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py

Lines changed: 114 additions & 73 deletions
@@ -1,40 +1,73 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import json
+from enum import Enum
 from collections import Counter
+import json
 from typing import Dict, List, Union, Any, Tuple
 from typing_extensions import overload, override

-from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._exceptions import (
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
+
+
+class TaskNavigationEfficiencyMatchingMode(str, Enum):
+    """
+    Enumeration of task navigation efficiency matching modes.
+
+    This enum allows you to specify which single matching technique should be used when evaluating
+    the efficiency of an agent's tool calls sequence against a ground truth path.
+    """
+
+    EXACT_MATCH = "exact_match"
+    """
+    Binary metric indicating whether the agent's tool calls exactly match the ground truth.
+
+    Returns True only if the agent's tool calls sequence is identical to the expected sequence
+    in both order and content (no extra steps, no missing steps, correct order).
+    """
+
+    IN_ORDER_MATCH = "in_order_match"
+    """
+    Binary metric allowing extra steps but requiring correct order of required tool calls.
+
+    Returns True if all ground truth steps appear in the agent's sequence in the correct
+    order, even if there are additional steps interspersed.
+    """
+
+    ANY_ORDER_MATCH = "any_order_match"
+    """
+    Binary metric allowing both extra steps and different ordering.
+
+    Returns True if all ground truth steps appear in the agent's sequence with sufficient
+    frequency, regardless of order. Most lenient matching criterion.
+    """


-class PathEfficiencyEvaluator(EvaluatorBase):
+class TaskNavigationEfficiencyEvaluator(EvaluatorBase):
     """
     Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.

-    The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
-    between the agent's tool usage trajectory and the ground truth expected steps. It also provides
-    three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
+    The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
+    It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
+    It also returns precision, recall, and F1 scores in the properties bag.

-    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
-    :type precision_threshold: float
-    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
-    :type recall_threshold: float
-    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
-    :type f1_score_threshold: float
+    :param matching_mode: The matching mode to use. Default is "exact_match".
+    :type matching_mode: enum[str, TaskNavigationEfficiencyMatchingMode]

     .. admonition:: Example:

         .. code-block:: python

-            from azure.ai.evaluation import PathEfficiencyEvaluator
+            from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator

-            path_efficiency_eval = PathEfficiencyEvaluator(
-                precision_threshold=0.7,
-                recall_threshold=0.8,
-                f1_score_threshold=0.75
+            task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
+                matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
             )

             # Example 1: Using simple tool names list
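The three enum members above map to three increasingly lenient checks. Below is a minimal, self-contained sketch of those semantics; the tool names are hypothetical and this is not the SDK's implementation (the evaluator's actual helpers are the `_calculate_*` methods shown later in this diff):

```python
# Standalone sketch of the three matching modes; not the SDK implementation.
from collections import Counter
from typing import List


def exact_match(agent: List[str], truth: List[str]) -> bool:
    # Identical in order and content: no extra steps, no missing steps.
    return agent == truth


def in_order_match(agent: List[str], truth: List[str]) -> bool:
    # Ground truth must appear as a subsequence of the agent's steps.
    steps = iter(agent)
    return all(step in steps for step in truth)


def any_order_match(agent: List[str], truth: List[str]) -> bool:
    # Multiset containment: enough occurrences of each step, order ignored.
    agent_counts, truth_counts = Counter(agent), Counter(truth)
    return all(agent_counts[s] >= truth_counts[s] for s in truth_counts)


agent_steps = ["search", "open_page", "summarize"]
ground_truth = ["search", "summarize"]
print(exact_match(agent_steps, ground_truth))      # False: extra "open_page"
print(in_order_match(agent_steps, ground_truth))   # True: order preserved
print(any_order_match(agent_steps, ground_truth))  # True: counts suffice
```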
@@ -64,36 +97,39 @@ class PathEfficiencyEvaluator(EvaluatorBase):
             )
     """

-    _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
-
-    id = "azureai://built-in/evaluators/path_efficiency"
+    id = "azureai://built-in/evaluators/task_navigation_efficiency"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    matching_mode: TaskNavigationEfficiencyMatchingMode
+    """The matching mode to use."""
+
     @override
     def __init__(
         self,
         *,
-        precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
-        recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
-        f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
+        matching_mode: Union[
+            str, TaskNavigationEfficiencyMatchingMode
+        ] = TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,
     ):
-        self._higher_is_better = True
-        super().__init__()
+        # Type checking for the matching_mode parameter
+        if isinstance(matching_mode, str):
+            try:
+                self.matching_mode = TaskNavigationEfficiencyMatchingMode(matching_mode)
+            except ValueError:
+                raise ValueError(
+                    f"matching_mode must be one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'"
+                )
+        elif isinstance(matching_mode, TaskNavigationEfficiencyMatchingMode):
+            self.matching_mode = matching_mode
+        else:
+            raise EvaluationException(
+                f"matching_mode must be a string with one of {[m.value for m in TaskNavigationEfficiencyMatchingMode]} or TaskNavigationEfficiencyMatchingMode enum, got {type(matching_mode)}",
+                internal_message=str(matching_mode),
+                target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                category=ErrorCategory.INVALID_VALUE,
+            )

-        # Type checking for threshold parameters
-        for name, value in [
-            ("precision_threshold", precision_threshold),
-            ("recall_threshold", recall_threshold),
-            ("f1_score_threshold", f1_score_threshold),
-        ]:
-            if not isinstance(value, float):
-                raise TypeError(f"{name} must be a float, got {type(value)}")
-
-        self._threshold = {
-            "path_efficiency_precision": precision_threshold,
-            "path_efficiency_recall": recall_threshold,
-            "path_efficiency_f1": f1_score_threshold,
-        }
+        super().__init__()

     def _prepare_steps_for_comparison(
         self,
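The new constructor normalizes `matching_mode`: a plain string is coerced into the enum, an enum member is stored as-is, and any other type raises `EvaluationException`. A hedged usage sketch, assuming both symbols are exported at the package root as the docstring example above suggests:

```python
from azure.ai.evaluation import (
    TaskNavigationEfficiencyEvaluator,
    TaskNavigationEfficiencyMatchingMode,
)

# String and enum spellings normalize to the same enum member.
by_str = TaskNavigationEfficiencyEvaluator(matching_mode="in_order_match")
by_enum = TaskNavigationEfficiencyEvaluator(
    matching_mode=TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH
)
assert by_str.matching_mode is by_enum.matching_mode

# An unknown string raises ValueError listing the valid modes.
try:
    TaskNavigationEfficiencyEvaluator(matching_mode="fuzzy_match")
except ValueError as err:
    print(err)
```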
@@ -192,14 +228,20 @@ def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List
         # Check if agent has at least as many occurrences of each ground truth step
         return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)

+    _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
+        TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
+        TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
+        TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
+    }
+
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
         """Produce a path efficiency evaluation result.

         :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
         :type eval_input: Dict
         :return: The evaluation result.
-        :rtype: Dict[str, Union[float, str]]
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
         response = eval_input["response"]
         ground_truth = eval_input["ground_truth"]
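Note how `_TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS` is built in the class body: at that point the `_calculate_*` names are still plain functions rather than bound methods, which is why `_do_eval` later passes `self` explicitly when calling through the dict. A reduced illustration of the pattern (names here are made up):

```python
# Reduced illustration of the class-body dispatch-dict pattern; not SDK code.
class Demo:
    def _double(self, x: int) -> int:
        return 2 * x

    def _square(self, x: int) -> int:
        return x * x

    # Values captured here are raw functions, not bound methods.
    _DISPATCH = {"double": _double, "square": _square}

    def run(self, key: str, x: int) -> int:
        # Indexing the dict yields a plain function, so pass self explicitly.
        return self._DISPATCH[key](self, x)


print(Demo().run("double", 21))  # 42
```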
@@ -244,12 +286,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
             ground_truth_names = [name.strip() for name in tool_names_list]
             ground_truth_params_dict = params_dict
             use_parameter_matching = True
-
         elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
             # List format: just tool names
             ground_truth_names = [step.strip() for step in ground_truth]
             use_parameter_matching = False
-
         else:
             raise TypeError(
                 "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +307,43 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
             )

         # Calculate precision, recall, and F1 scores
-        metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
-
-        # Calculate binary match metrics
-        exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
-        in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
-        any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
+        additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)

         # Convert metrics to floats, using nan for None or non-convertible values
-        path_efficiency_precision = (
-            float(metrics["precision_score"]) if metrics["precision_score"] is not None else float("nan")
-        )
-        path_efficiency_recall = float(metrics["recall_score"]) if metrics["recall_score"] is not None else float("nan")
-        path_efficiency_f1_score = float(metrics["f1_score"]) if metrics["f1_score"] is not None else float("nan")
+        for metric, score in additional_properties_metrics.items():
+            additional_properties_metrics[metric] = float(score) if score is not None else float("nan")

-        return {
-            "path_efficiency_precision_score": path_efficiency_precision,
-            "path_efficiency_recall_score": path_efficiency_recall,
-            "path_efficiency_f1_score": path_efficiency_f1_score,
-            "path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match],
-            "path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match],
-            "path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match],
-        }
+        if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS:
+            # Calculate binary match metrics
+            match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode](
+                self, agent_steps, ground_truth_steps
+            )
+
+            return {
+                "task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
+                "properties": additional_properties_metrics,
+            }
+        else:
+            raise EvaluationException(
+                f"Unsupported matching_mode '{self.matching_mode}'",
+                internal_message=str(self.matching_mode),
+                target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                category=ErrorCategory.INVALID_VALUE,
+            )

     @overload
     def __call__(  # type: ignore
         self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
-    ) -> Dict[str, Union[float, str]]:
+    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
         """
-        Evaluate the path efficiency of an agent's action sequence.
+        Evaluate the task navigation efficiency of an agent's action sequence.

         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
         :keyword ground_truth: List of expected tool/action steps.
         :paramtype ground_truth: List[str]
-        :return: The path efficiency scores and results.
-        :rtype: Dict[str, Union[float, str]]
+        :return: The task navigation efficiency scores and results.
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """

     @overload
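After this change `_do_eval` returns a single pass/fail verdict keyed `task_navigation_efficiency_result`, with precision, recall, and F1 relegated to a `properties` bag. A sketch of the new shape, where the bag's key names are taken from the removed `metrics` lookups and the pass/fail strings are assumed from `EVALUATION_PASS_FAIL_MAPPING`:

```python
# Illustrative result under the new single-metric contract; values hypothetical.
result = {
    "task_navigation_efficiency_result": "pass",  # EVALUATION_PASS_FAIL_MAPPING[True], assumed "pass"
    "properties": {
        "precision_score": 0.67,
        "recall_score": 1.0,
        "f1_score": 0.8,
    },
}
```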
@@ -311,16 +352,16 @@ def __call__( # type: ignore
         *,
         response: Union[str, List[Dict[str, Any]]],
         ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
-    ) -> Dict[str, Union[float, str]]:
+    ) -> Dict[str, Union[float, str, Dict[str, float]]]:
         """
-        Evaluate the path efficiency of an agent's action sequence with tool parameters.
+        Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.

         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
         :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
         :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
-        :return: The path efficiency scores and results.
-        :rtype: Dict[str, Union[float, str]]
+        :return: The task navigation efficiency scores and results.
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """

     @override
@@ -330,13 +371,13 @@ def __call__(
         **kwargs,
     ):
         """
-        Evaluate path efficiency.
+        Evaluate task navigation efficiency.

         :keyword response: The agent's response containing tool calls.
         :paramtype response: Union[str, List[Dict[str, Any]]]
         :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
         :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
-        :return: The path efficiency scores and results.
-        :rtype: Dict[str, Union[float, str]]
+        :return: The task navigation efficiency scores and results.
+        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
         return super().__call__(*args, **kwargs)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ class ErrorTarget(Enum):
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
+    TASK_NAVIGATION_EFFICIENCY_EVALUATOR = "TaskNavigationEfficiencyEvaluator"
     PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
     INTENT_RESOLUTION_EVALUATOR = "IntentResolutionEvaluator"
     RELEVANCE_EVALUATOR = "RelevanceEvaluator"
