Fix MeteorScoreEvaluator incorrect binary result due to int conversion bug (Azure#41492)

Copilot · singankit · web-flow · commit 0c347e994c87 · 2025-06-10T18:57:25.000Z
* Initial plan for issue

* Fix MeteorScoreEvaluator incorrect binary result by changing int to float conversion

Co-authored-by: singankit <30610298+singankit@users.noreply.github.com>

* Update CHANGELOG.md for MeteorScoreEvaluator bug fix

Co-authored-by: singankit <30610298+singankit@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: singankit <30610298+singankit@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Release History
 
+## 1.9.0 (Unreleased)
+
+### Bugs Fixed
+- Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
+
 ## 1.8.0 (2025-05-29)
 
 ### Features Added
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -416,12 +416,12 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
                         threshold_key = f"{base_key}_threshold"
                         result[threshold_key] = self._threshold
                         if self._higher_is_better:
-                            if int(score_value) >= self._threshold:
+                            if float(score_value) >= self._threshold:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                             else:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                         else:
-                            if int(score_value) <= self._threshold:
+                            if float(score_value) <= self._threshold:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                             else:
                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.8.0"
+VERSION = "1.9.0"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py
@@ -218,4 +218,49 @@ def test_rouge_different_types(self, mock_call, rouge_type):
         # All results should pass since all scores are above threshold
         assert mock_result["rouge_precision_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
         assert mock_result["rouge_recall_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
-        assert mock_result["rouge_f1_score_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+        assert mock_result["rouge_f1_score_result"] == EVALUATION_PASS_FAIL_MAPPING[True]
+
+
+@pytest.mark.unittest
+class TestActualThresholdBehavior:
+    """Tests for actual threshold behavior in evaluators without mocking - to validate float vs int conversion bug fix."""
+
+    def test_meteor_score_decimal_threshold_behavior(self):
+        """Test that MeteorScoreEvaluator correctly handles decimal scores for threshold comparison."""
+        # This test validates the fix for the bug where int(score_value) was used instead of float(score_value)
+        evaluator = MeteorScoreEvaluator(threshold=0.5)
+        
+        # Using identical strings should give a high METEOR score (> 0.5)
+        result = evaluator(ground_truth="Hello world", response="Hello world")
+        
+        # The score should be > 0.5 and result should be "pass" 
+        assert result["meteor_score"] > 0.5
+        assert result["meteor_result"] == "pass"
+        assert result["meteor_threshold"] == 0.5
+
+    def test_bleu_score_decimal_threshold_behavior(self):
+        """Test that BleuScoreEvaluator correctly handles decimal scores for threshold comparison."""
+        # This test validates the fix for the bug where int(score_value) was used instead of float(score_value)
+        evaluator = BleuScoreEvaluator(threshold=0.1)
+        
+        # Using identical strings should give a BLEU score > 0.1
+        result = evaluator(ground_truth="Hello world", response="Hello world")
+        
+        # The score should be > 0.1 and result should be "pass"
+        assert result["bleu_score"] > 0.1
+        assert result["bleu_result"] == "pass"
+        assert result["bleu_threshold"] == 0.1
+
+    def test_meteor_score_threshold_boundary_cases(self):
+        """Test MeteorScoreEvaluator threshold boundary cases."""
+        # Test where score should be just above threshold
+        evaluator_low = MeteorScoreEvaluator(threshold=0.1)
+        result_low = evaluator_low(ground_truth="Hello world", response="Hello world")
+        assert result_low["meteor_score"] > 0.1
+        assert result_low["meteor_result"] == "pass"
+        
+        # Test where threshold is set very high - should fail
+        evaluator_high = MeteorScoreEvaluator(threshold=1.0)
+        result_high = evaluator_high(ground_truth="Hello world", response="Hello world")
+        assert result_high["meteor_score"] < 1.0
+        assert result_high["meteor_result"] == "fail"