Skip to content

Commit d77526a

Browse files
authored
Implement Path Efficiency Evaluator (#42571)
* Implement Path Efficiency Evaluator * Adding Samples * Move to private preview
1 parent 607213f commit d77526a

File tree

7 files changed

+1114
-3
lines changed

7 files changed

+1114
-3
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,41 @@ def _parse_tools_from_response(self, response):
510510

511511
return tool_calls
512512

513+
def _extract_tool_names_from_response(self, response) -> List[str]:
    """Extract the names of all tool calls present in the response.

    Each parsed tool call must be a dict of the shape
    ``{"type": "tool_call", "name": ..., ...}``; anything else is rejected.

    :param response: The response to parse.
    :type response: Union[str, List[dict]]
    :return: List of tool names extracted from the response.
    :rtype: List[str]
    """
    names: List[str] = []
    for call in self._parse_tools_from_response(response):
        # Guard clauses: reject malformed entries before reading fields.
        if not isinstance(call, dict):
            raise EvaluationException(
                "Tool call must be a dictionary.",
                internal_message=str(call),
                target=ErrorTarget.EVALUATE,
                category=ErrorCategory.UNKNOWN,
            )
        if call.get("type") != "tool_call":
            raise EvaluationException(
                "Tool call must have 'type' set to 'tool_call'.",
                internal_message=str(call),
                target=ErrorTarget.EVALUATE,
                category=ErrorCategory.INVALID_VALUE,
            )
        if "name" not in call:
            raise EvaluationException(
                "Tool call missing 'name' field.",
                internal_message=str(call),
                target=ErrorTarget.EVALUATE,
                category=ErrorCategory.MISSING_FIELD,
            )
        names.append(call["name"])
    return names
547+
513548
async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
514549
"""The asynchronous call where real end-to-end evaluation logic is performed.
515550
@@ -532,14 +567,25 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
532567
base_key = key[:-6] # Remove "_score" suffix
533568
result_key = f"{base_key}_result"
534569
threshold_key = f"{base_key}_threshold"
535-
result[threshold_key] = self._threshold
570+
threshold_value = (
571+
self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
572+
)
573+
if not isinstance(threshold_value, (int, float)):
574+
raise EvaluationException(
575+
"Threshold value must be a number.",
576+
internal_message=str(threshold_value),
577+
target=ErrorTarget.EVALUATE,
578+
category=ErrorCategory.INVALID_VALUE,
579+
)
580+
581+
result[threshold_key] = threshold_value
536582
if self._higher_is_better:
537-
if float(score_value) >= self._threshold:
583+
if float(score_value) >= threshold_value:
538584
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
539585
else:
540586
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
541587
else:
542-
if float(score_value) <= self._threshold:
588+
if float(score_value) <= threshold_value:
543589
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
544590
else:
545591
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# ---------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ---------------------------------------------------------
4+
5+
from ._path_efficiency import PathEfficiencyEvaluator
6+
7+
__all__ = ["PathEfficiencyEvaluator"]
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
# ---------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ---------------------------------------------------------
4+
from collections import Counter
5+
from typing import Dict, List, Union, Any
6+
from typing_extensions import overload, override
7+
8+
from azure.ai.evaluation._evaluators._common import EvaluatorBase
9+
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
10+
11+
12+
class PathEfficiencyEvaluator(EvaluatorBase):
    """
    Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.

    The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
    between the agent's tool usage trajectory and the ground truth expected steps. It also provides
    three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).

    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
    :type precision_threshold: float
    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
    :type recall_threshold: float
    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
    :type f1_score_threshold: float

    .. admonition:: Example:

        .. code-block:: python

            from azure.ai.evaluation import PathEfficiencyEvaluator

            path_efficiency_eval = PathEfficiencyEvaluator(
                precision_threshold=0.7,
                recall_threshold=0.8,
                f1_score_threshold=0.75
            )

            result = path_efficiency_eval(
                response=[
                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "determine_intent", "arguments": {}}]},
                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "use_tool", "arguments": {}}]},
                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "review_results", "arguments": {}}]},
                    {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "report_generation", "arguments": {}}]}
                ],
                ground_truth=["determine_intent", "use_tool", "review_results", "report_generation"]
            )
    """

    # Default pass/fail cut-off shared by all three score thresholds.
    _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5

    id = "azureai://built-in/evaluators/path_efficiency"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(
        self,
        *,
        precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
        recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
        f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
    ):
        self._higher_is_better = True
        super().__init__()

        # Validate thresholds. Accept any real number (int or float) — the base
        # class compares with `isinstance(threshold_value, (int, float))` — but
        # reject bool explicitly, since bool is an int subclass and a boolean
        # threshold is almost certainly a caller mistake.
        for name, value in [
            ("precision_threshold", precision_threshold),
            ("recall_threshold", recall_threshold),
            ("f1_score_threshold", f1_score_threshold),
        ]:
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise TypeError(f"{name} must be a float or int, got {type(value)}")

        # Per-metric thresholds, keyed by result-name prefix; consumed by the
        # base class when converting scores to pass/fail results.
        self._threshold = {
            "path_efficiency_precision": precision_threshold,
            "path_efficiency_recall": recall_threshold,
            "path_efficiency_f1": f1_score_threshold,
        }

    def _calculate_precision_recall_f1_scores(
        self, agent_steps: List[str], ground_truth: List[str]
    ) -> Dict[str, float]:
        """Calculate precision, recall, and F1 scores.

        Duplicates are handled with multiset (Counter) semantics: a step that
        appears twice in the ground truth must appear twice in the agent's
        trajectory to contribute two true positives.

        :param agent_steps: Tool names the agent actually invoked, in order.
        :param ground_truth: Expected tool names.
        :return: Dict with "precision_score", "recall_score", and "f1_score".
        """
        if not agent_steps:
            # No agent activity: nothing retrieved, so every score is zero.
            return {"precision_score": 0.0, "recall_score": 0.0, "f1_score": 0.0}

        # Count occurrences of each step in both lists to handle duplicates.
        agent_counts = Counter(agent_steps)
        truth_counts = Counter(ground_truth)

        # Multiset intersection / differences:
        #   TP = steps present in both (min of the two counts, per step)
        #   FP = agent steps beyond what ground truth expects
        #   FN = expected steps the agent skipped (or took too few times)
        true_positives = sum((agent_counts & truth_counts).values())
        false_positives = sum((agent_counts - truth_counts).values())
        false_negatives = sum((truth_counts - agent_counts).values())

        predicted = true_positives + false_positives
        expected = true_positives + false_negatives
        precision = true_positives / predicted if predicted else 0.0
        recall = true_positives / expected if expected else 0.0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

        return {
            "precision_score": precision,
            "recall_score": recall,
            "f1_score": f1_score,
        }

    def _calculate_exact_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
        """Check if agent steps exactly match ground truth (order and content)."""
        return agent_steps == ground_truth

    def _calculate_in_order_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
        """Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
        if not ground_truth:
            return True

        # Greedy subsequence scan: advance through ground truth each time the
        # next expected step is seen in the agent's trajectory.
        gt_index = 0
        for step in agent_steps:
            if gt_index < len(ground_truth) and step == ground_truth[gt_index]:
                gt_index += 1

        return gt_index == len(ground_truth)

    def _calculate_any_order_match(self, agent_steps: List[str], ground_truth: List[str]) -> bool:
        """Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
        # Count occurrences on both sides so duplicate expected steps must be
        # matched by at least as many agent occurrences.
        agent_counts = Counter(agent_steps)
        ground_truth_counts = Counter(ground_truth)

        return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
        """Produce a path efficiency evaluation result.

        :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
        :type eval_input: Dict
        :return: The evaluation result.
        :rtype: Dict[str, Union[float, str]]
        :raises TypeError: If ground_truth is not a list of strings.
        :raises ValueError: If ground_truth is empty.
        """
        response = eval_input["response"]
        ground_truth = eval_input["ground_truth"]

        # Type check first so a non-list falsy value (0, "", None) raises
        # TypeError rather than being misreported as "empty".
        if not isinstance(ground_truth, list) or not all(isinstance(step, str) for step in ground_truth):
            raise TypeError("ground_truth must be a list of strings")

        if not ground_truth:
            raise ValueError("ground_truth cannot be empty")

        # Extract tool names from the response.
        agent_steps = self._extract_tool_names_from_response(response)

        # Normalize incidental whitespace on both sides before comparing.
        agent_steps = [step.strip() for step in agent_steps]
        ground_truth = [step.strip() for step in ground_truth]

        # Calculate precision, recall, and F1 scores.
        metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth)

        # Calculate binary match metrics.
        exact_match = self._calculate_exact_match(agent_steps, ground_truth)
        in_order_match = self._calculate_in_order_match(agent_steps, ground_truth)
        any_order_match = self._calculate_any_order_match(agent_steps, ground_truth)

        # _calculate_precision_recall_f1_scores always returns floats, so a
        # plain float() conversion suffices here.
        return {
            "path_efficiency_precision_score": float(metrics["precision_score"]),
            "path_efficiency_recall_score": float(metrics["recall_score"]),
            "path_efficiency_f1_score": float(metrics["f1_score"]),
            "path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match],
            "path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match],
            "path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match],
        }

    @overload
    def __call__(  # type: ignore
        self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
    ) -> Dict[str, Union[float, str]]:
        """
        Evaluate the path efficiency of an agent's action sequence.

        :keyword response: The agent's response containing tool calls.
        :paramtype response: Union[str, List[Dict[str, Any]]]
        :keyword ground_truth: List of expected tool/action steps.
        :paramtype ground_truth: List[str]
        :return: The path efficiency scores and results.
        :rtype: Dict[str, Union[float, str]]
        """

    @override
    def __call__(
        self,
        *args,
        **kwargs,
    ):
        """
        Evaluate path efficiency.

        :keyword response: The agent's response containing tool calls.
        :paramtype response: Union[str, List[Dict[str, Any]]]
        :keyword ground_truth: List of expected tool/action steps.
        :paramtype ground_truth: List[str]
        :return: The path efficiency scores and results.
        :rtype: Dict[str, Union[float, str]]
        """
        return super().__call__(*args, **kwargs)

0 commit comments

Comments
 (0)