Skip to content

Commit cef0240

Browse files
feat: add HallucinationDetector for evaluating hallucinations (#33191)
1 parent 63097db commit cef0240

File tree

5 files changed

+252
-0
lines changed

5 files changed

+252
-0
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .detector import HallucinationDetector
2+
3+
__all__ = ["HallucinationDetector"]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from typing import List, Dict
2+
from transformers import pipeline
3+
4+
class HallucinationDetector:
    """Simple hallucination detector built on an NLI model.

    The default model is ``facebook/bart-large-mnli``. The detector:

    - extracts claims with a naive sentence split,
    - verifies each claim against evidence documents using NLI,
    - computes the fraction of claims that no evidence document supports.
    """

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        """Create the text-classification pipeline used for NLI.

        Args:
            model_name: Model identifier handed to ``pipeline``.
        """
        self.nli_pipeline = pipeline("text-classification", model=model_name)

    def extract_claims(self, text: str) -> List[str]:
        """Split ``text`` into sentence-level claims.

        This is still a naive splitter: a sentence ends at ``.``, ``!`` or
        ``?`` followed by whitespace. Requiring the whitespace keeps decimals
        such as ``3.14`` in one piece, which a plain ``split(".")`` does not.

        Args:
            text: Generated text to break into claims.

        Returns:
            Non-empty claims, stripped of surrounding whitespace and of any
            trailing periods.
        """
        # Local import: the module's top-level imports do not include ``re``.
        import re

        sentences = re.split(r"(?<=[.!?])\s+", text)
        # Drop the trailing period(s) so claims look the same as they did
        # with the previous ``split(".")`` implementation.
        cleaned = (s.strip().rstrip(".").rstrip() for s in sentences)
        return [claim for claim in cleaned if claim]

    def verify_claim(self, claim: str, evidence: str) -> bool:
        """Check whether ``evidence`` entails ``claim``.

        Args:
            claim: Statement to verify (the NLI hypothesis).
            evidence: Reference text (the NLI premise).

        Returns:
            True if the first result returned by the pipeline is labelled
            "entailment" (case-insensitive), otherwise False.
        """
        # NLI is directional: the premise (evidence) must come first and the
        # hypothesis (claim) second. The reverse order would test whether the
        # claim entails the evidence, which is not what "supported" means.
        result = self.nli_pipeline(f"{evidence} </s></s> {claim}")
        return result[0]["label"].lower() == "entailment"

    def verify_claim_multi(self, claim: str, evidence_docs: List[str]) -> bool:
        """Return True if any document in ``evidence_docs`` entails ``claim``.

        An empty ``evidence_docs`` list yields False.
        """
        return any(self.verify_claim(claim, doc) for doc in evidence_docs)

    def compute_hallucination_rate(
        self, text: str, evidence_docs: List[str]
    ) -> Dict[str, float]:
        """Compute how many claims in ``text`` lack supporting evidence.

        Args:
            text: Generated text to evaluate.
            evidence_docs: Reference documents to check each claim against.

        Returns:
            A dict with ``total_claims``, ``unsupported_claims`` and
            ``hallucination_rate`` (unsupported / total). When no claims are
            extracted, the counts are 0 and the rate is 0.0.
        """
        claims = self.extract_claims(text)
        if not claims:
            # Avoid dividing by zero when there is nothing to verify.
            return {"total_claims": 0, "unsupported_claims": 0, "hallucination_rate": 0.0}

        unsupported = sum(
            1 for claim in claims if not self.verify_claim_multi(claim, evidence_docs)
        )
        return {
            "total_claims": len(claims),
            "unsupported_claims": unsupported,
            "hallucination_rate": unsupported / len(claims),
        }

libs/langchain/pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ together = ["langchain-together"]
3939
# deepseek = ["langchain-deepseek"]
4040
# xai = ["langchain-xai"]
4141
# perplexity = ["langchain-perplexity"]
42+
# hallucination = ["transformers", "torch"]
43+
4244

4345
[project.urls]
4446
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain"
@@ -83,6 +85,8 @@ test_integration = [
8385
"langchainhub>=0.1.16,<1.0.0",
8486
"langchain-core",
8587
"langchain-text-splitters",
88+
"transformers>=4.35.0,<5.0.0",
89+
"torch>=2.1.0,<3.0.0",
8690
]
8791
lint = [
8892
"ruff>=0.13.1,<0.14.0",

libs/langchain/tests/integration_tests/evaluation/hallucination/__init__.py

Whitespace-only changes.
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# import os
2+
# import pytest
3+
# from unittest.mock import patch
4+
5+
# from langchain.evaluation.hallucination.detector import HallucinationDetector
6+
7+
# # -----------------------------
8+
# # Integration Tests (Real HF model)
9+
# # -----------------------------
10+
# skip_if_no_hf = pytest.mark.skipif(
11+
# "HF_TOKEN" not in os.environ, reason="Hugging Face token not available"
12+
# )
13+
14+
# @pytest.fixture(scope="module")
15+
# @skip_if_no_hf
16+
# @pytest.mark.requires("integration")
17+
# def detector_real():
18+
# # Only runs locally if HF token is available
19+
# return HallucinationDetector(model_name="facebook/bart-large-mnli")
20+
21+
22+
# @skip_if_no_hf
23+
# @pytest.mark.requires("integration")
24+
# def test_extract_claims_integration(detector_real):
25+
# text = "Barack Obama was the 44th President of the United States. He was born in Kenya."
26+
# claims = detector_real.extract_claims(text)
27+
# assert isinstance(claims, list)
28+
# assert len(claims) == 2
29+
# assert "Barack Obama was the 44th President of the United States" in claims
30+
31+
32+
# @skip_if_no_hf
33+
# @pytest.mark.requires("integration")
34+
# def test_compute_hallucination_rate_integration(detector_real):
35+
# text = "Barack Obama was the 44th President of the United States. He was born in Kenya."
36+
# evidence = [
37+
# "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
38+
# "Barack Obama was born in Hawaii, not Kenya."
39+
# ]
40+
# result = detector_real.compute_hallucination_rate(text, evidence)
41+
# unsupported = result["unsupported_claims"]
42+
# total = result["total_claims"]
43+
# hallucination_rate = result["hallucination_rate"]
44+
45+
# assert "total_claims" in result
46+
# assert "unsupported_claims" in result
47+
# assert "hallucination_rate" in result
48+
# assert result["total_claims"] == 2
49+
# assert unsupported in [1, 2] # Accepts both possible outputs
50+
# assert 0 <= hallucination_rate <= 1 # Just check it’s a valid rate
51+
52+
53+
# # -----------------------------
54+
# # Unit Tests (Mocked)
55+
# # -----------------------------
56+
# # Unit test fixture
57+
# @pytest.fixture(scope="module")
58+
# def detector_mock():
59+
# with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline:
60+
# # mock NLI results
61+
# mock_pipeline.return_value = lambda text: [
62+
# {"label": "ENTAILMENT", "score": 0.9} if "President" in text else {"label": "CONTRADICTION", "score": 0.9}
63+
# ]
64+
# # Now constructor won't load HF model
65+
# detector = HallucinationDetector(model_name="any")
66+
# yield detector
67+
68+
69+
# def test_extract_claims_mock(detector_mock):
70+
# text = "Barack Obama was the 44th President of the United States. He was born in Kenya."
71+
# claims = detector_mock.extract_claims(text)
72+
# assert isinstance(claims, list)
73+
# assert len(claims) == 2
74+
75+
76+
# def test_verify_claim_supported_mock(detector_mock):
77+
# claim = "Barack Obama was the 44th President of the United States"
78+
# evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017."
79+
# result = detector_mock.verify_claim(claim, evidence)
80+
# assert result is True
81+
82+
83+
# def test_verify_claim_unsupported_mock(detector_mock):
84+
# claim = "Barack Obama was born in Kenya"
85+
# evidence = "Barack Obama was born in Hawaii, not Kenya."
86+
# result = detector_mock.verify_claim(claim, evidence)
87+
# assert result is False
88+
89+
90+
# def test_compute_hallucination_rate_mock(detector_mock):
91+
# text = "Barack Obama was the 44th President of the United States. He was born in Kenya."
92+
# evidence = [
93+
# "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
94+
# "Barack Obama was born in Hawaii, not Kenya."
95+
# ]
96+
# result = detector_mock.compute_hallucination_rate(text, evidence)
97+
# assert "total_claims" in result
98+
# assert "unsupported_claims" in result
99+
# assert "hallucination_rate" in result
100+
101+
102+
103+
import os
104+
import pytest
105+
from unittest.mock import patch
106+
107+
from langchain.evaluation.hallucination.detector import HallucinationDetector
108+
109+
# -----------------------------
110+
# Integration Tests (Real HF model)
111+
# -----------------------------
112+
# Skip marker for tests that should only run when an HF_TOKEN environment
# variable is present.
skip_if_no_hf = pytest.mark.skipif(
    os.environ.get("HF_TOKEN") is None,
    reason="Hugging Face token not available",
)
115+
116+
@pytest.fixture(scope="module")
def detector_real():
    """Build a detector backed by the real ``facebook/bart-large-mnli`` model.

    Marks applied to a fixture function have no effect, so the Hugging Face
    token check is performed at fixture setup time instead; any test that
    requests this fixture is skipped when ``HF_TOKEN`` is not set.
    """
    if "HF_TOKEN" not in os.environ:
        pytest.skip("Hugging Face token not available")
    return HallucinationDetector(model_name="facebook/bart-large-mnli")
122+
123+
124+
@skip_if_no_hf
@pytest.mark.requires("integration")
def test_extract_claims_integration(detector_real):
    """Claim extraction on the real detector yields one claim per sentence."""
    sentences = detector_real.extract_claims(
        "Barack Obama was the 44th President of the United States. He was born in Kenya."
    )

    # Shape of the result: a list with one entry per sentence.
    assert isinstance(sentences, list)
    assert len(sentences) == 2

    # At least one extracted claim carries the expected statement.
    expected_fragment = "Barack Obama was the 44th President"
    assert any(expected_fragment in sentence for sentence in sentences)
134+
135+
136+
@skip_if_no_hf
@pytest.mark.requires("integration")
def test_compute_hallucination_rate_integration(detector_real):
    """The real detector returns a well-formed, self-consistent report."""
    generated = "Barack Obama was the 44th President of the United States. He was born in Kenya."
    sources = [
        "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    report = detector_real.compute_hallucination_rate(generated, sources)

    # Every expected key must be present before the values are read.
    for field in ("total_claims", "unsupported_claims", "hallucination_rate"):
        assert field in report

    n_claims = report["total_claims"]
    n_unsupported = report["unsupported_claims"]
    rate = report["hallucination_rate"]

    # The exact number of unsupported claims depends on the model, so only
    # the invariants between the fields are checked.
    assert n_claims == 2
    assert 0 <= n_unsupported <= n_claims
    assert abs(rate - n_unsupported / n_claims) < 1e-6
    assert 0 <= rate <= 1
159+
160+
161+
# -----------------------------
162+
# Unit Tests (Mocked)
163+
# -----------------------------
164+
@pytest.fixture(scope="module")
def detector_mock():
    """Yield a detector whose NLI pipeline is replaced by a deterministic stub.

    The stub labels any input containing "President" as ENTAILMENT and
    everything else as CONTRADICTION, so no model is loaded.
    """

    def fake_nli(text):
        label = "ENTAILMENT" if "President" in text else "CONTRADICTION"
        return [{"label": label, "score": 0.9}]

    with patch("langchain.evaluation.hallucination.detector.pipeline") as pipeline_factory:
        # The detector's constructor calls the patched factory and stores
        # whatever it returns as its pipeline.
        pipeline_factory.return_value = fake_nli
        yield HallucinationDetector(model_name="any")
174+
175+
176+
def test_extract_claims_mock(detector_mock):
    """Two sentences produce a list of two claims."""
    sentences = detector_mock.extract_claims(
        "Barack Obama was the 44th President of the United States. He was born in Kenya."
    )
    assert isinstance(sentences, list)
    assert len(sentences) == 2
181+
182+
183+
def test_verify_claim_supported_mock(detector_mock):
    """A claim the stubbed pipeline labels ENTAILMENT is reported as supported."""
    supported = detector_mock.verify_claim(
        "Barack Obama was the 44th President of the United States",
        "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
    )
    assert supported is True
187+
188+
189+
def test_verify_claim_unsupported_mock(detector_mock):
    """A claim the stubbed pipeline labels CONTRADICTION is reported as unsupported."""
    supported = detector_mock.verify_claim(
        "Barack Obama was born in Kenya",
        "Barack Obama was born in Hawaii, not Kenya.",
    )
    assert supported is False
193+
194+
195+
def test_compute_hallucination_rate_mock(detector_mock):
    """The report from the stubbed detector is well-formed and self-consistent."""
    generated = "Barack Obama was the 44th President of the United States. He was born in Kenya."
    sources = [
        "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    report = detector_mock.compute_hallucination_rate(generated, sources)

    # Structure first, then consistency between the reported numbers.
    for field in ("total_claims", "unsupported_claims", "hallucination_rate"):
        assert field in report
    assert report["total_claims"] == 2
    assert 0 <= report["unsupported_claims"] <= 2
    expected_rate = report["unsupported_claims"] / report["total_claims"]
    assert abs(report["hallucination_rate"] - expected_rate) < 1e-6

0 commit comments

Comments
 (0)