
Commit 51176df

Fix Flow Structure for Relevance and Response Completeness Evaluators (#43645)
1 parent a9741f5 commit 51176df

3 files changed: +63 −27 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py

Lines changed: 6 additions & 7 deletions
@@ -87,15 +87,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._threshold = threshold
-        self._higher_is_better = True
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
             credential=credential,
-            _higher_is_better=self._higher_is_better,
+            _higher_is_better=True,
             **kwargs,
         )
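Note on this hunk: the two deleted assignments are not lost; threshold and _higher_is_better are now forwarded to super().__init__, so the base class is expected to own self._threshold and the pass/fail mapping used later by _get_binary_result. A rough illustration of that pattern (hypothetical class, not the real PromptyEvaluatorBase):

class _EvaluatorBaseSketch:
    """Illustration only: a base evaluator that owns the threshold and
    derives the pass/fail verdict for its subclasses."""

    def __init__(self, *, result_key, threshold, _higher_is_better=True, **kwargs):
        self._result_key = result_key
        self._threshold = threshold
        self._higher_is_better = _higher_is_better

    def _get_binary_result(self, score: float) -> str:
        # NaN never compares as true, so a missing score falls through to "fail".
        if self._higher_is_better:
            return "pass" if score >= self._threshold else "fail"
        return "pass" if score <= self._threshold else "fail"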

@@ -178,7 +176,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         if not isinstance(eval_input["response"], str):
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = result["llm_output"]
+        llm_output = result.get("llm_output")
         score = math.nan

         if isinstance(llm_output, dict):
@@ -188,10 +186,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             binary_result = self._get_binary_result(score)
             return {
                 self._result_key: float(score),
-                f"gpt_{self._result_key}": float(score),
-                f"{self._result_key}_reason": reason,
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
                 f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
                 f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
                 f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
@@ -201,10 +198,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }

+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
         binary_result = self._get_binary_result(score)
         return {
             self._result_key: float(score),
-            f"gpt_{self._result_key}": float(score),
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
         }
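For reference, a condensed sketch of what the reshaped _do_eval now does with the flow output. This is a hypothetical standalone function, not the SDK source; summarize_flow_result is an invented name and the pass/fail mapping stands in for _get_binary_result:

import logging
import math

logger = logging.getLogger(__name__)

def summarize_flow_result(result: dict, result_key: str, threshold: float) -> dict:
    # The flow now returns a dict whose "llm_output" entry carries the parsed
    # score/explanation, alongside token and diagnostic fields.
    llm_output = result.get("llm_output")  # tolerate a missing key
    score = math.nan
    if isinstance(llm_output, dict):
        score = float(llm_output.get("score", math.nan))
        reason = llm_output.get("explanation", "")
        return {
            result_key: score,
            f"{result_key}_result": "pass" if score >= threshold else "fail",
            f"{result_key}_threshold": threshold,
            f"{result_key}_reason": reason,
            f"{result_key}_prompt_tokens": result.get("input_token_count", 0),
            f"{result_key}_completion_tokens": result.get("output_token_count", 0),
            f"{result_key}_total_tokens": result.get("total_token_count", 0),
        }
    logger.warning("LLM output is not a dictionary, returning NaN for the score.")
    return {
        result_key: score,
        f"{result_key}_result": "fail",  # NaN never clears the threshold
        f"{result_key}_threshold": threshold,
    }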

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

Lines changed: 35 additions & 8 deletions
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------

 import os
+import logging
 import math
 from typing import Dict, List, Union, Optional

@@ -14,6 +15,8 @@
 from azure.ai.evaluation._model_configurations import Conversation, Message
 from azure.ai.evaluation._common._experimental import experimental

+logger = logging.getLogger(__name__)
+

 @experimental
 class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
@@ -74,12 +77,14 @@ def __init__(
     ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self.threshold = threshold
+        self.threshold = threshold  # to be removed in favor of _threshold
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -156,20 +161,42 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 target=ErrorTarget.COMPLETENESS_EVALUATOR,
             )

-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output") if isinstance(result, dict) else result

         score = math.nan
-        if llm_output:
-            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+        llm_output_is_dict = isinstance(llm_output, dict)
+        if llm_output_is_dict or isinstance(llm_output, str):
+            reason = ""
+            if llm_output_is_dict:
+                score = float(llm_output.get("score", math.nan))
+                reason = llm_output.get("explanation", "")
+            else:
+                score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

-            score_result = "pass" if score >= self.threshold else "fail"
+            binary_result = self._get_binary_result(score)

             # updating the result key and threshold to int based on the schema
             return {
                 f"{self._result_key}": int(score),
-                f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": int(self.threshold),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": int(self._threshold),
                 f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }

-        return {self._result_key: math.nan}
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
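The behavioral core of this hunk is that llm_output may now arrive in either shape: a structured dict carrying "score"/"explanation", or the legacy tagged string that still goes through parse_quality_evaluator_reason_score. A self-contained sketch of that branch follows; the regex parser is only a stand-in for the real helper so the example runs on its own:

import math
import re
from typing import Tuple, Union

def _parse_tagged_output(text: str) -> Tuple[float, str]:
    # Stand-in for parse_quality_evaluator_reason_score: pull the <S1>
    # explanation and the <S2> score out of the tagged completion.
    reason_match = re.search(r"<S1>(.*?)</S1>", text, re.DOTALL)
    score_match = re.search(r"<S2>\s*([1-5])\s*</S2>", text)
    reason = reason_match.group(1).strip() if reason_match else ""
    score = float(score_match.group(1)) if score_match else math.nan
    return score, reason

def extract_score_and_reason(llm_output: Union[dict, str, None]) -> Tuple[float, str]:
    score, reason = math.nan, ""
    if isinstance(llm_output, dict):      # new structured shape
        score = float(llm_output.get("score", math.nan))
        reason = llm_output.get("explanation", "")
    elif isinstance(llm_output, str):     # legacy tagged string
        score, reason = _parse_tagged_output(llm_output)
    return score, reason

Either {"score": 5, "explanation": "..."} or the tagged "<S0>...</S0>\n<S1>...</S1>\n<S2>5</S2>" string used in the updated test mocks yields the same (score, reason) pair here.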

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_completeness_evaluator.py

Lines changed: 22 additions & 12 deletions
@@ -6,19 +6,29 @@


 async def completeness_response1_async_mock():
-    return """<S0>Let's think step by step: The ground truth states "The capital of Japan is Tokyo," which provides
-    both the subject (capital of Japan) and the specific answer (Tokyo). The response, "The capital of Japan,
-    " only partially addresses the subject but does not provide the specific answer (Tokyo). This means it misses the
-    core claim established in the ground truth.</S0> <S1>The response is fully incomplete as it does not provide the
-    necessary and relevant information, specifically the name of the capital, Tokyo.</S1> <S2>1</S2>"""
+    return {
+        "llm_output": '<S0>Let\'s think step by step: The ground truth states "The capital of Japan is Tokyo." The response is "The capital of Japan." The response does not specify what the capital is; it only repeats part of the question and omits the key information ("Tokyo"). Therefore, none of the necessary information from the ground truth is present in the response.</S0>\n<S1>The response is fully incomplete because it does not provide the answer ("Tokyo") at all.</S1>\n<S2>1</S2>',
+        "input_token_count": 1354,
+        "output_token_count": 108,
+        "total_token_count": 1462,
+        "finish_reason": "stop",
+        "model_id": "gpt-4.1-2025-04-14",
+        "sample_input": '[{"role": "user", "content": "{\\"response\\": \\"The capital of Japan\\", \\"ground_truth\\": \\"The capital of Japan is Tokyo.\\"}"}]',
+        "sample_output": '[{"role": "assistant", "content": "<S0>Let\'s think step by step: The ground truth states \\"The capital of Japan is Tokyo.\\" The response is \\"The capital of Japan.\\" The response does not specify what the capital is; it only repeats part of the question and omits the key information (\\"Tokyo\\"). Therefore, none of the necessary information from the ground truth is present in the response.</S0>\\n<S1>The response is fully incomplete because it does not provide the answer (\\"Tokyo\\") at all.</S1>\\n<S2>1</S2>"}]',
+    }


 async def completeness_response2_async_mock():
-    return """<S0>Let's think step by step: The response states that the capital of Japan is Tokyo. The ground truth
-    also states that the capital of Japan is Tokyo. Both the response and the ground truth are identical, containing
-    all the necessary and relevant information. There is no missing or incorrect information in the response.</S0>
-    <S1>The response perfectly matches the ground truth, containing all the necessary and relevant information
-    without any omissions or errors.</S1> <S2>5</S2>"""
+    return {
+        "llm_output": '<S0>Let\'s think step by step: The ground truth contains a single statement: "The capital of Japan is Tokyo." The response exactly matches this statement without omitting or altering any information. There are no additional claims or missing details to consider. According to the definitions, a fully complete response should perfectly contain all necessary and relevant information from the ground truth.</S0>\n<S1>The response is a perfect match to the ground truth, with no missing or incorrect information.</S1>\n<S2>5</S2>',
+        "input_token_count": 1356,
+        "output_token_count": 107,
+        "total_token_count": 1463,
+        "finish_reason": "stop",
+        "model_id": "gpt-4.1-2025-04-14",
+        "sample_input": '[{"role": "user", "content": "{\\"response\\": \\"The capital of Japan is Tokyo.\\", \\"ground_truth\\": \\"The capital of Japan is Tokyo.\\"}"}]',
+        "sample_output": '[{"role": "assistant", "content": "<S0>Let\'s think step by step: The ground truth contains a single statement: \\"The capital of Japan is Tokyo.\\" The response exactly matches this statement without omitting or altering any information. There are no additional claims or missing details to consider. According to the definitions, a fully complete response should perfectly contain all necessary and relevant information from the ground truth.</S0>\\n<S1>The response is a perfect match to the ground truth, with no missing or incorrect information.</S1>\\n<S2>5</S2>"}]',
+    }


 @pytest.mark.usefixtures("mock_model_config")
@@ -81,7 +91,7 @@ def test_evaluate_completeness_valid2(self, mock_model_config):
         assert result[key] == 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
-        assert "The response perfectly matches " in result[f"{key}_reason"]
+        assert "The response is a perfect match " in result[f"{key}_reason"]

     def test_evaluate_completeness_valid3(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(
@@ -103,7 +113,7 @@ def test_evaluate_completeness_valid3(self, mock_model_config):
         assert result[key] == 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
-        assert "The response perfectly matches " in result[f"{key}_reason"]
+        assert "The response is a perfect match " in result[f"{key}_reason"]

     def test_evaluate_completeness_missing_ground_truth(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config)
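The mocks now return the full flow payload (llm_output plus token counts) rather than a bare string, so the tests keep exercising the public call path. A hypothetical wiring of such a mock, assuming the evaluator stays synchronously callable with response/ground_truth and that its _flow attribute can be swapped for an AsyncMock (everything outside the diff here is an assumption):

from unittest.mock import AsyncMock

def run_with_mocked_flow(evaluator, flow_payload: dict):
    # Resolve the prompty flow to the canned payload instead of calling a model.
    evaluator._flow = AsyncMock(return_value=flow_payload)
    result = evaluator(
        response="The capital of Japan is Tokyo.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    key = evaluator._result_key  # e.g. "response_completeness" (assumed)
    # With a parsable llm_output the enriched keys should be present.
    assert result[f"{key}_result"] in ("pass", "fail")
    assert result[f"{key}_total_tokens"] == flow_payload.get("total_token_count", 0)
    return result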
