@@ -218,4 +218,49 @@ def test_rouge_different_types(self, mock_call, rouge_type):
218218 # All results should pass since all scores are above threshold
219219 assert mock_result ["rouge_precision_result" ] == EVALUATION_PASS_FAIL_MAPPING [True ]
220220 assert mock_result ["rouge_recall_result" ] == EVALUATION_PASS_FAIL_MAPPING [True ]
221- assert mock_result ["rouge_f1_score_result" ] == EVALUATION_PASS_FAIL_MAPPING [True ]
221+ assert mock_result ["rouge_f1_score_result" ] == EVALUATION_PASS_FAIL_MAPPING [True ]
222+
223+
@pytest.mark.unittest
class TestActualThresholdBehavior:
    """Run the evaluators unmocked and check how scores are compared to thresholds.

    These tests target the float-vs-int conversion bug fix: the score used to be
    converted with ``int(score_value)`` instead of ``float(score_value)`` before
    the threshold comparison.
    """

    def test_meteor_score_decimal_threshold_behavior(self):
        """MeteorScoreEvaluator compares a decimal score against a 0.5 threshold."""
        threshold = 0.5
        sample = "Hello world"
        meteor = MeteorScoreEvaluator(threshold=threshold)

        # Identical ground truth and response are expected to score above 0.5.
        outcome = meteor(ground_truth=sample, response=sample)

        # Score above the threshold, verdict "pass", threshold echoed back.
        assert outcome["meteor_score"] > threshold
        assert outcome["meteor_result"] == "pass"
        assert outcome["meteor_threshold"] == threshold

    def test_bleu_score_decimal_threshold_behavior(self):
        """BleuScoreEvaluator compares a decimal score against a 0.1 threshold."""
        threshold = 0.1
        sample = "Hello world"
        bleu = BleuScoreEvaluator(threshold=threshold)

        # Identical ground truth and response are expected to score above 0.1.
        outcome = bleu(ground_truth=sample, response=sample)

        # Score above the threshold, verdict "pass", threshold echoed back.
        assert outcome["bleu_score"] > threshold
        assert outcome["bleu_result"] == "pass"
        assert outcome["bleu_threshold"] == threshold

    def test_meteor_score_threshold_boundary_cases(self):
        """MeteorScoreEvaluator verdicts for a low (0.1) and a very high (1.0) threshold."""
        sample = "Hello world"

        # Low threshold: the score is expected to be above it, so the verdict is "pass".
        low_threshold = 0.1
        lenient = MeteorScoreEvaluator(threshold=low_threshold)
        lenient_outcome = lenient(ground_truth=sample, response=sample)
        assert lenient_outcome["meteor_score"] > low_threshold
        assert lenient_outcome["meteor_result"] == "pass"

        # Very high threshold: the score is expected to stay below it, so the verdict is "fail".
        high_threshold = 1.0
        strict = MeteorScoreEvaluator(threshold=high_threshold)
        strict_outcome = strict(ground_truth=sample, response=sample)
        assert strict_outcome["meteor_score"] < high_threshold
        assert strict_outcome["meteor_result"] == "fail"
0 commit comments