Skip to content

Commit 3f50031

Browse files
Copilot and singankit committed
Fix MeteorScoreEvaluator incorrect binary result by changing int to float conversion
Co-authored-by: singankit <[email protected]>
1 parent a08b792 commit 3f50031

File tree

2 files changed

+48
-3
lines changed

2 files changed

+48
-3
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -416,12 +416,12 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
416416
threshold_key = f"{base_key}_threshold"
417417
result[threshold_key] = self._threshold
418418
if self._higher_is_better:
419-
if int(score_value) >= self._threshold:
419+
if float(score_value) >= self._threshold:
420420
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
421421
else:
422422
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
423423
else:
424-
if int(score_value) <= self._threshold:
424+
if float(score_value) <= self._threshold:
425425
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
426426
else:
427427
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py

Lines changed: 46 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -218,4 +218,49 @@ def test_rouge_different_types(self, mock_call, rouge_type):
218218
# All results should pass since all scores are above threshold
219219
assert mock_result["rouge_precision_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
220220
assert mock_result["rouge_recall_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
221-
assert mock_result["rouge_f1_score_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
221+
assert mock_result["rouge_f1_score_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
222+
223+
224+
@pytest.mark.unittest
225+
class TestActualThresholdBehavior:
226+
"""Tests for actual threshold behavior in evaluators without mocking - to validate float vs int conversion bug fix."""
227+
228+
def test_meteor_score_decimal_threshold_behavior(self):
229+
"""Test that MeteorScoreEvaluator correctly handles decimal scores for threshold comparison."""
230+
# This test validates the fix for the bug where int(score_value) was used instead of float(score_value)
231+
evaluator = MeteorScoreEvaluator(threshold=0.5)
232+
233+
# Using identical strings should give a high METEOR score (> 0.5)
234+
result = evaluator(ground_truth="Hello world", response="Hello world")
235+
236+
# The score should be > 0.5 and result should be "pass"
237+
assert result["meteor_score"] > 0.5
238+
assert result["meteor_result"] == "pass"
239+
assert result["meteor_threshold"] == 0.5
240+
241+
def test_bleu_score_decimal_threshold_behavior(self):
242+
"""Test that BleuScoreEvaluator correctly handles decimal scores for threshold comparison."""
243+
# This test validates the fix for the bug where int(score_value) was used instead of float(score_value)
244+
evaluator = BleuScoreEvaluator(threshold=0.1)
245+
246+
# Using identical strings should give a BLEU score > 0.1
247+
result = evaluator(ground_truth="Hello world", response="Hello world")
248+
249+
# The score should be > 0.1 and result should be "pass"
250+
assert result["bleu_score"] > 0.1
251+
assert result["bleu_result"] == "pass"
252+
assert result["bleu_threshold"] == 0.1
253+
254+
def test_meteor_score_threshold_boundary_cases(self):
255+
"""Test MeteorScoreEvaluator threshold boundary cases."""
256+
# Test where the threshold is set low - the score for identical strings should be well above it and pass
257+
evaluator_low = MeteorScoreEvaluator(threshold=0.1)
258+
result_low = evaluator_low(ground_truth="Hello world", response="Hello world")
259+
assert result_low["meteor_score"] > 0.1
260+
assert result_low["meteor_result"] == "pass"
261+
262+
# Test where threshold is set very high - should fail
263+
evaluator_high = MeteorScoreEvaluator(threshold=1.0)
264+
result_high = evaluator_high(ground_truth="Hello world", response="Hello world")
265+
assert result_high["meteor_score"] < 1.0
266+
assert result_high["meteor_result"] == "fail"

0 commit comments

Comments
 (0)