
Commit ef340f3

Fix flow structure for Task Adherence and Intent Resolution (#43643)
* changes
* update var to _threshold
* leave threshold jic
* updates
* black
* BLACK
1 parent 95d00ae commit ef340f3


4 files changed (+61 / -10 lines)


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -16,6 +16,12 @@
     "tool_call_accurate",
     "response_completeness",
     "task_adherence",
+    "tool_selection",
+    "tool_output_utilization",
+    "task_completion",
+    "tool_input_accuracy",
+    "tool_success",
+    "tool_call_accuracy",
 ]


```
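For orientation, a minimal sketch of how a metric-name list like this is typically consumed downstream. The constant name `AGENTIC_METRIC_NAMES` and the `is_agentic_metric` helper are illustrative assumptions, not SDK API; the real list's identifier is not visible in this hunk.

```python
# Illustrative stand-in for the list in constants.py that this hunk extends;
# the constant name and helper below are assumptions for this sketch.
AGENTIC_METRIC_NAMES = [
    "tool_call_accurate",
    "response_completeness",
    "task_adherence",
    "tool_selection",
    "tool_output_utilization",
    "task_completion",
    "tool_input_accuracy",
    "tool_success",
    "tool_call_accuracy",
]


def is_agentic_metric(result_key: str) -> bool:
    """Return True if a result key names one of the recognized agentic metrics."""
    return result_key in AGENTIC_METRIC_NAMES


print(is_agentic_metric("tool_success"))   # True, newly recognized after this commit
print(is_agentic_metric("groundedness"))   # False
```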

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 23 additions & 4 deletions
```diff
@@ -69,7 +69,9 @@ def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -145,8 +147,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
             eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)

-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict["llm_output"]
         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+        score = math.nan
         if isinstance(llm_output, dict):
             score = llm_output.get("score", math.nan)
             if not check_score_is_valid(

@@ -162,16 +166,31 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
                )
            reason = llm_output.get("explanation", "")
            score = float(score)
-            score_result = "pass" if score >= self.threshold else "fail"
+            score_result = "pass" if score >= self._threshold else "fail"

            response_dict = {
                f"{self._result_key}": score,
+                f"gpt_{self._result_key}": score,
                f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_threshold": self._threshold,
                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
            }
            return response_dict
        # If llm_output is not a dictionary, return NaN for the score. This should never happen
        if logger:
            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-        return {self._result_key: math.nan}
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
```
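To make the new flow concrete, here is a minimal standalone sketch of the result shaping this commit introduces: `self._flow(...)` now returns a wrapper dictionary whose `llm_output` entry holds the judge's JSON, with token counts and model metadata alongside it. The hard-coded `prompty_output_dict`, the `intent_resolution` result key, and the 3.0 threshold are illustrative assumptions standing in for the evaluator's internals.

```python
import math

# Sketch only: RESULT_KEY, THRESHOLD, and prompty_output_dict are assumed values
# standing in for self._result_key, self._threshold, and the real flow output.
RESULT_KEY = "intent_resolution"
THRESHOLD = 3.0

prompty_output_dict = {
    "llm_output": {"score": 4, "explanation": "The response fully resolves the user's intent."},
    "input_token_count": 812,
    "output_token_count": 57,
    "total_token_count": 869,
    "finish_reason": "stop",
    "model_id": "gpt-4o",
    "sample_input": "[...]",
    "sample_output": "[...]",
}

# The flow output is now a wrapper dict; the judge's JSON lives under "llm_output".
llm_output = prompty_output_dict["llm_output"]

score = math.nan
if isinstance(llm_output, dict):
    score = float(llm_output.get("score", math.nan))
    reason = llm_output.get("explanation", "")
    result = {
        RESULT_KEY: score,
        f"gpt_{RESULT_KEY}": score,  # legacy-prefixed alias kept alongside the plain key
        f"{RESULT_KEY}_result": "pass" if score >= THRESHOLD else "fail",
        f"{RESULT_KEY}_threshold": THRESHOLD,
        f"{RESULT_KEY}_reason": reason,
        # Token usage and model metadata surfaced from the prompty wrapper:
        f"{RESULT_KEY}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
        f"{RESULT_KEY}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
        f"{RESULT_KEY}_total_tokens": prompty_output_dict.get("total_token_count", 0),
        f"{RESULT_KEY}_finish_reason": prompty_output_dict.get("finish_reason", ""),
        f"{RESULT_KEY}_model": prompty_output_dict.get("model_id", ""),
    }
    print(result)
```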

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -186,6 +186,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         binary_result = self._get_binary_result(score)
         return {
             self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
             f"{self._result_key}_reason": reason,
```

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

Lines changed: 31 additions & 6 deletions
```diff
@@ -10,7 +10,11 @@

 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
+from ..._common.utils import (
+    reformat_conversation_history,
+    reformat_agent_response,
+    reformat_tool_definitions,
+)
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental

@@ -73,12 +77,14 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self.threshold = threshold
+        self.threshold = threshold  # to be removed in favor of _threshold
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -154,19 +160,38 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict["llm_output"]
+
+        score = math.nan
         if isinstance(llm_output, dict):
             score = float(llm_output.get("score", math.nan))
-            score_result = "pass" if score >= self.threshold else "fail"
+            score_result = "pass" if score >= self._threshold else "fail"
             reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": score,
+                f"gpt_{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": reason,
                 # Uncomment the following line in the next iteration after UI contracts are validated.
                 # f"{self._result_key}_additional_details": llm_output
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
             logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-        return {self._result_key: math.nan}
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
```
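Finally, a minimal sketch of the new fallback path: when the prompty output is not a dictionary, the evaluator now returns a full result dict (NaN score, binary result, threshold) rather than the bare `{result_key: nan}` it returned before. The local `get_binary_result` helper and the 3.0 threshold are assumptions standing in for `PromptyEvaluatorBase._get_binary_result` and the evaluator's default threshold.

```python
import math

# Sketch only: RESULT_KEY, THRESHOLD, and get_binary_result are assumed stand-ins
# for the evaluator's internals; with a higher-is-better metric, a NaN score
# cannot clear the threshold, so the binary result is "fail".
RESULT_KEY = "task_adherence"
THRESHOLD = 3.0


def get_binary_result(score: float, threshold: float = THRESHOLD) -> str:
    # NaN comparisons are always False, so a missing score yields "fail".
    return "pass" if score >= threshold else "fail"


score = math.nan  # reached when llm_output is not a dict
fallback = {
    RESULT_KEY: float(score),
    f"gpt_{RESULT_KEY}": float(score),
    f"{RESULT_KEY}_result": get_binary_result(score),
    f"{RESULT_KEY}_threshold": THRESHOLD,
}
print(fallback)
# {'task_adherence': nan, 'gpt_task_adherence': nan, 'task_adherence_result': 'fail', 'task_adherence_threshold': 3.0}
```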
