Fix MeteorScoreEvaluator incorrect binary result by changing int to float conversion

Copilot · singankit · Copilot · commit 3f50031b2c99 · 2025-06-09T22:01:10.000Z
Co-authored-by: singankit <30610298+singankit@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -416,12 +416,12 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
                         threshold_key = f"{base_key}_threshold"
                         result[threshold_key] = self._threshold
                         if self._higher_is_better:
-                            if int(score_value) >= self._threshold:
+                            if float(score_value) >= self._threshold:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                             else:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                         else:
-                            if int(score_value) <= self._threshold:
+                            if float(score_value) <= self._threshold:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                             else:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py
@@ -218,4 +218,49 @@ def test_rouge_different_types(self, mock_call, rouge_type):
         # All results should pass since all scores are above threshold
         assert mock_result["rouge_precision_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
         assert mock_result["rouge_recall_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
-        assert mock_result["rouge_f1_score_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+        assert mock_result["rouge_f1_score_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+
+
+@pytest.mark.unittest
+class TestActualThresholdBehavior:
+    """Tests that run real evaluators (no mocking) to validate the float vs int threshold comparison fix."""
+
+    def test_meteor_score_decimal_threshold_behavior(self):
+        """Test that MeteorScoreEvaluator compares a fractional score against its threshold without truncation."""
+        # Regression guard: int(score_value) truncates a fractional score (e.g. 0.9 -> 0) before the comparison
+        evaluator = MeteorScoreEvaluator(threshold=0.5)
+
+        # Identical strings are expected to yield a METEOR score above 0.5
+        result = evaluator(ground_truth="Hello world", response="Hello world")
+
+        # The score should be > 0.5 and the result should map to a pass
+        assert result["meteor_score"] > 0.5
+        assert result["meteor_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+        assert result["meteor_threshold"] == 0.5
+
+    def test_bleu_score_decimal_threshold_behavior(self):
+        """Test that BleuScoreEvaluator compares a fractional score against its threshold without truncation."""
+        # Regression guard: int(score_value) truncates a fractional score (e.g. 0.9 -> 0) before the comparison
+        evaluator = BleuScoreEvaluator(threshold=0.1)
+
+        # Identical strings are expected to yield a BLEU score above 0.1
+        result = evaluator(ground_truth="Hello world", response="Hello world")
+
+        # The score should be > 0.1 and the result should map to a pass
+        assert result["bleu_score"] > 0.1
+        assert result["bleu_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+        assert result["bleu_threshold"] == 0.1
+
+    def test_meteor_score_threshold_boundary_cases(self):
+        """Test MeteorScoreEvaluator pass/fail outcomes for a low threshold and for a threshold of 1.0."""
+        # Low threshold: the score is expected to exceed it, so the result should pass
+        evaluator_low = MeteorScoreEvaluator(threshold=0.1)
+        result_low = evaluator_low(ground_truth="Hello world", response="Hello world")
+        assert result_low["meteor_score"] > 0.1
+        assert result_low["meteor_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+
+        # Threshold of 1.0: the score is expected to stay below it, so the result should fail
+        evaluator_high = MeteorScoreEvaluator(threshold=1.0)
+        result_high = evaluator_high(ground_truth="Hello world", response="Hello world")
+        assert result_high["meteor_score"] < 1.0
+        assert result_high["meteor_result"] == EVALUATION_PASS_FAIL_MAPPING[False]