|
| 1 | +# import os |
| 2 | +# import pytest |
| 3 | +# from unittest.mock import patch |
| 4 | + |
| 5 | +# from langchain.evaluation.hallucination.detector import HallucinationDetector |
| 6 | + |
| 7 | +# # ----------------------------- |
| 8 | +# # Integration Tests (Real HF model) |
| 9 | +# # ----------------------------- |
| 10 | +# skip_if_no_hf = pytest.mark.skipif( |
| 11 | +# "HF_TOKEN" not in os.environ, reason="Hugging Face token not available" |
| 12 | +# ) |
| 13 | + |
| 14 | +# @pytest.fixture(scope="module") |
| 15 | +# @skip_if_no_hf |
| 16 | +# @pytest.mark.requires("integration") |
| 17 | +# def detector_real(): |
| 18 | +# # Only runs locally if HF token is available |
| 19 | +# return HallucinationDetector(model_name="facebook/bart-large-mnli") |
| 20 | + |
| 21 | + |
| 22 | +# @skip_if_no_hf |
| 23 | +# @pytest.mark.requires("integration") |
| 24 | +# def test_extract_claims_integration(detector_real): |
| 25 | +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." |
| 26 | +# claims = detector_real.extract_claims(text) |
| 27 | +# assert isinstance(claims, list) |
| 28 | +# assert len(claims) == 2 |
| 29 | +# assert "Barack Obama was the 44th President of the United States" in claims |
| 30 | + |
| 31 | + |
| 32 | +# @skip_if_no_hf |
| 33 | +# @pytest.mark.requires("integration") |
| 34 | +# def test_compute_hallucination_rate_integration(detector_real): |
| 35 | +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." |
| 36 | +# evidence = [ |
| 37 | +# "Barack Obama served as the 44th President of the United States from 2009 to 2017.", |
| 38 | +# "Barack Obama was born in Hawaii, not Kenya." |
| 39 | +# ] |
| 40 | +# result = detector_real.compute_hallucination_rate(text, evidence) |
| 41 | +# unsupported = result["unsupported_claims"] |
| 42 | +# total = result["total_claims"] |
| 43 | +# hallucination_rate = result["hallucination_rate"] |
| 44 | + |
| 45 | +# assert "total_claims" in result |
| 46 | +# assert "unsupported_claims" in result |
| 47 | +# assert "hallucination_rate" in result |
| 48 | +# assert result["total_claims"] == 2 |
| 49 | +# assert unsupported in [1, 2] # Accepts both possible outputs |
| 50 | +# assert 0 <= hallucination_rate <= 1 # Just check it’s a valid rate |
| 51 | + |
| 52 | + |
| 53 | +# # ----------------------------- |
| 54 | +# # Unit Tests (Mocked) |
| 55 | +# # ----------------------------- |
| 56 | +# # Unit test fixture |
| 57 | +# @pytest.fixture(scope="module") |
| 58 | +# def detector_mock(): |
| 59 | +# with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline: |
| 60 | +# # mock NLI results |
| 61 | +# mock_pipeline.return_value = lambda text: [ |
| 62 | +# {"label": "ENTAILMENT", "score": 0.9} if "President" in text else {"label": "CONTRADICTION", "score": 0.9} |
| 63 | +# ] |
| 64 | +# # Now constructor won't load HF model |
| 65 | +# detector = HallucinationDetector(model_name="any") |
| 66 | +# yield detector |
| 67 | + |
| 68 | + |
| 69 | +# def test_extract_claims_mock(detector_mock): |
| 70 | +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." |
| 71 | +# claims = detector_mock.extract_claims(text) |
| 72 | +# assert isinstance(claims, list) |
| 73 | +# assert len(claims) == 2 |
| 74 | + |
| 75 | + |
| 76 | +# def test_verify_claim_supported_mock(detector_mock): |
| 77 | +# claim = "Barack Obama was the 44th President of the United States" |
| 78 | +# evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017." |
| 79 | +# result = detector_mock.verify_claim(claim, evidence) |
| 80 | +# assert result is True |
| 81 | + |
| 82 | + |
| 83 | +# def test_verify_claim_unsupported_mock(detector_mock): |
| 84 | +# claim = "Barack Obama was born in Kenya" |
| 85 | +# evidence = "Barack Obama was born in Hawaii, not Kenya." |
| 86 | +# result = detector_mock.verify_claim(claim, evidence) |
| 87 | +# assert result is False |
| 88 | + |
| 89 | + |
| 90 | +# def test_compute_hallucination_rate_mock(detector_mock): |
| 91 | +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." |
| 92 | +# evidence = [ |
| 93 | +# "Barack Obama served as the 44th President of the United States from 2009 to 2017.", |
| 94 | +# "Barack Obama was born in Hawaii, not Kenya." |
| 95 | +# ] |
| 96 | +# result = detector_mock.compute_hallucination_rate(text, evidence) |
| 97 | +# assert "total_claims" in result |
| 98 | +# assert "unsupported_claims" in result |
| 99 | +# assert "hallucination_rate" in result |
| 100 | + |
| 101 | + |
| 102 | + |
| 103 | +import os |
| 104 | +import pytest |
| 105 | +from unittest.mock import patch |
| 106 | + |
| 107 | +from langchain.evaluation.hallucination.detector import HallucinationDetector |
| 108 | + |
| 109 | +# ----------------------------- |
| 110 | +# Integration Tests (Real HF model) |
| 111 | +# ----------------------------- |
# Skip marker for tests that need Hugging Face credentials: the condition is
# true whenever the HF_TOKEN environment variable is not set.
skip_if_no_hf = pytest.mark.skipif(
    os.environ.get("HF_TOKEN") is None,
    reason="Hugging Face token not available",
)
| 115 | + |
@pytest.fixture(scope="module")
def detector_real():
    """Provide a detector built on the ``facebook/bart-large-mnli`` model.

    Marks (``skipif``, ``requires``) have no effect when applied to a fixture
    function, and recent pytest versions warn about or reject them, so the
    guard is performed explicitly here instead: any test requesting this
    fixture is skipped when the ``HF_TOKEN`` environment variable is not set.

    Returns:
        A ``HallucinationDetector`` created with
        ``model_name="facebook/bart-large-mnli"``, shared across the module.
    """
    if "HF_TOKEN" not in os.environ:
        # Positional message keeps this compatible with older pytest releases.
        pytest.skip("Hugging Face token not available")
    return HallucinationDetector(model_name="facebook/bart-large-mnli")
| 122 | + |
| 123 | + |
@skip_if_no_hf
@pytest.mark.requires("integration")
def test_extract_claims_integration(detector_real):
    """Check that a two-sentence passage is split into two claims."""
    passage = "Barack Obama was the 44th President of the United States. He was born in Kenya."
    extracted = detector_real.extract_claims(passage)

    # Shape of the result: a list holding exactly two claims.
    assert isinstance(extracted, list)
    assert len(extracted) == 2

    # At least one extracted claim must contain the presidency statement.
    matching = [
        claim
        for claim in extracted
        if "Barack Obama was the 44th President" in claim
    ]
    assert matching
| 134 | + |
| 135 | + |
@skip_if_no_hf
@pytest.mark.requires("integration")
def test_compute_hallucination_rate_integration(detector_real):
    """Check the structure and internal consistency of the rate report."""
    passage = "Barack Obama was the 44th President of the United States. He was born in Kenya."
    sources = [
        "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    report = detector_real.compute_hallucination_rate(passage, sources)

    # The report must expose all three fields before any of them is read.
    required_keys = ("total_claims", "unsupported_claims", "hallucination_rate")
    absent = [name for name in required_keys if name not in report]
    assert not absent

    claim_count = report["total_claims"]
    unsupported_count = report["unsupported_claims"]
    rate = report["hallucination_rate"]

    # The values must agree with each other: the unsupported count is bounded
    # by the total, and the rate equals unsupported / total.
    assert claim_count == 2
    assert 0 <= unsupported_count <= claim_count
    expected_rate = unsupported_count / claim_count
    assert abs(rate - expected_rate) < 1e-6
    assert 0 <= rate <= 1
| 159 | + |
| 160 | + |
| 161 | +# ----------------------------- |
| 162 | +# Unit Tests (Mocked) |
| 163 | +# ----------------------------- |
@pytest.fixture(scope="module")
def detector_mock():
    """Yield a detector whose ``pipeline`` factory is replaced by a stub.

    While the fixture is active, ``pipeline`` in the detector module is
    patched so that calling it returns ``fake_nli``. The detector is
    therefore expected to be constructed without loading a real model.
    """

    def fake_nli(text):
        # Deterministic stand-in for the NLI model: any input mentioning
        # "President" is reported as entailed, everything else as contradicted.
        if "President" in text:
            verdict = {"label": "ENTAILMENT", "score": 0.9}
        else:
            verdict = {"label": "CONTRADICTION", "score": 0.9}
        return [verdict]

    with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline:
        mock_pipeline.return_value = fake_nli
        yield HallucinationDetector(model_name="any")
| 174 | + |
| 175 | + |
def test_extract_claims_mock(detector_mock):
    """Check that the mocked detector splits the passage into two claims."""
    passage = "Barack Obama was the 44th President of the United States. He was born in Kenya."
    extracted = detector_mock.extract_claims(passage)

    assert isinstance(extracted, list)
    assert len(extracted) == 2
| 181 | + |
| 182 | + |
def test_verify_claim_supported_mock(detector_mock):
    """Check that a claim consistent with the evidence verifies as True."""
    verdict = detector_mock.verify_claim(
        "Barack Obama was the 44th President of the United States",
        "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
    )
    assert verdict is True
| 187 | + |
| 188 | + |
def test_verify_claim_unsupported_mock(detector_mock):
    """Check that a claim contradicted by the evidence verifies as False."""
    verdict = detector_mock.verify_claim(
        "Barack Obama was born in Kenya",
        "Barack Obama was born in Hawaii, not Kenya.",
    )
    assert verdict is False
| 193 | + |
| 194 | + |
def test_compute_hallucination_rate_mock(detector_mock):
    """Check the rate report's structure and consistency with the mock."""
    passage = "Barack Obama was the 44th President of the United States. He was born in Kenya."
    sources = [
        "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    report = detector_mock.compute_hallucination_rate(passage, sources)

    # All three fields must be present before their values are compared.
    required_keys = ("total_claims", "unsupported_claims", "hallucination_rate")
    absent = [name for name in required_keys if name not in report]
    assert not absent

    assert report["total_claims"] == 2
    assert 0 <= report["unsupported_claims"] <= 2

    # The reported rate must equal unsupported / total.
    expected_rate = report["unsupported_claims"] / report["total_claims"]
    assert abs(report["hallucination_rate"] - expected_rate) < 1e-6
0 commit comments