feat: add HallucinationDetector for evaluating hallucinations (#33191)

Vipul-Pandey-22 · Vipul-Pandey-22 · commit cef02408ddc7 · 2025-10-05T23:34:35.000+05:30
diff --git a/libs/langchain/langchain/evaluation/hallucination/__init__.py b/libs/langchain/langchain/evaluation/hallucination/__init__.py
@@ -0,0 +1,3 @@
+from .detector import HallucinationDetector
+
+__all__ = ["HallucinationDetector"]
diff --git a/libs/langchain/langchain/evaluation/hallucination/detector.py b/libs/langchain/langchain/evaluation/hallucination/detector.py
@@ -0,0 +1,115 @@
+"""NLI-based hallucination detection for generated text."""
+
+import re
+from typing import Dict, List
+
+try:
+    from transformers import pipeline
+except ImportError:  # transformers is an optional dependency
+    pipeline = None  # type: ignore[assignment]
+
+# A claim ends at ".", "!" or "?" followed by whitespace or end of text, so
+# dots inside tokens such as "3.14" do not split a claim in two.
+_CLAIM_BOUNDARY = re.compile(r"[.!?]+(?:\s+|$)")
+
+
+class HallucinationDetector:
+    """Estimate how much of a text is unsupported by evidence documents.
+
+    The text is split into sentence-level claims, and each claim is checked
+    against every evidence document with a natural language inference (NLI)
+    model such as ``facebook/bart-large-mnli``. A claim counts as supported
+    when at least one document entails it.
+    """
+
+    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
+        """Load the NLI classification pipeline.
+
+        Args:
+            model_name: Hugging Face model identifier handed to
+                ``transformers.pipeline``.
+
+        Raises:
+            ImportError: If ``transformers`` is not installed.
+        """
+        if pipeline is None:
+            msg = (
+                "HallucinationDetector requires the `transformers` package. "
+                "Install it with `pip install transformers torch`."
+            )
+            raise ImportError(msg)
+        self.nli_pipeline = pipeline("text-classification", model=model_name)
+
+    def extract_claims(self, text: str) -> List[str]:
+        """Split ``text`` into naive sentence-level claims.
+
+        Args:
+            text: Generated text to break into claims.
+
+        Returns:
+            The non-empty sentences of ``text`` with surrounding whitespace and
+            terminating punctuation removed. Abbreviations such as "Dr. Smith"
+            are still split, because no linguistic analysis is performed.
+        """
+        pieces = (piece.strip() for piece in _CLAIM_BOUNDARY.split(text))
+        return [piece for piece in pieces if piece]
+
+    def verify_claim(self, claim: str, evidence: str) -> bool:
+        """Check whether a single evidence document entails ``claim``.
+
+        Args:
+            claim: Statement to verify (the NLI hypothesis).
+            evidence: Reference text (the NLI premise).
+
+        Returns:
+            ``True`` if the label of the first pipeline result is "entailment"
+            (compared case-insensitively), ``False`` otherwise.
+        """
+        # The default BART MNLI model encodes a pair as "premise </s></s>
+        # hypothesis": the evidence must come first to test that it entails.
+        result = self.nli_pipeline(f"{evidence} </s></s> {claim}")
+        return result[0]["label"].lower() == "entailment"
+
+    def verify_claim_multi(self, claim: str, evidence_docs: List[str]) -> bool:
+        """Check whether any evidence document entails ``claim``.
+
+        Args:
+            claim: Statement to verify.
+            evidence_docs: Reference texts, each checked independently.
+
+        Returns:
+            ``True`` as soon as one document supports the claim; ``False`` if
+            none does, including when ``evidence_docs`` is empty.
+        """
+        return any(self.verify_claim(claim, doc) for doc in evidence_docs)
+
+    def compute_hallucination_rate(
+        self, text: str, evidence_docs: List[str]
+    ) -> Dict[str, float]:
+        """Compute the share of claims in ``text`` that no evidence supports.
+
+        Args:
+            text: Generated text to evaluate.
+            evidence_docs: Reference texts used to verify each claim.
+
+        Returns:
+            A dict with ``total_claims``, ``unsupported_claims`` and
+            ``hallucination_rate`` (unsupported / total). All values are zero
+            when ``text`` contains no claims.
+        """
+        claims = self.extract_claims(text)
+        if not claims:
+            return {
+                "total_claims": 0,
+                "unsupported_claims": 0,
+                "hallucination_rate": 0.0,
+            }
+
+        unsupported = sum(
+            not self.verify_claim_multi(claim, evidence_docs) for claim in claims
+        )
+        return {
+            "total_claims": len(claims),
+            "unsupported_claims": unsupported,
+            "hallucination_rate": unsupported / len(claims),
+        }
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
@@ -39,6 +39,8 @@ together = ["langchain-together"]
 # deepseek = ["langchain-deepseek"]
 # xai = ["langchain-xai"]
 # perplexity = ["langchain-perplexity"]
+# hallucination = ["transformers", "torch"]
+
 
 [project.urls]
 "Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain"
@@ -83,6 +85,8 @@ test_integration = [
     "langchainhub>=0.1.16,<1.0.0",
     "langchain-core",
     "langchain-text-splitters",
+    "transformers>=4.35.0,<5.0.0",
+    "torch>=2.1.0,<3.0.0",
 ]
 lint = [
     "ruff>=0.13.1,<0.14.0",
diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/__init__.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/__init__.py
diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py
@@ -0,0 +1,114 @@
+"""Tests for ``HallucinationDetector``: real-model checks and mocked checks."""
+
+import os
+from unittest.mock import patch
+
+import pytest
+
+from langchain.evaluation.hallucination.detector import HallucinationDetector
+
+TEXT = (
+    "Barack Obama was the 44th President of the United States. "
+    "He was born in Kenya."
+)
+EVIDENCE = [
+    "Barack Obama served as the 44th President of the United States "
+    "from 2009 to 2017.",
+    "Barack Obama was born in Hawaii, not Kenya.",
+]
+RESULT_KEYS = ("total_claims", "unsupported_claims", "hallucination_rate")
+
+# -----------------------------
+# Integration Tests (Real HF model)
+# -----------------------------
+skip_if_no_hf = pytest.mark.skipif(
+    "HF_TOKEN" not in os.environ, reason="Hugging Face token not available"
+)
+
+
+# Marks are deliberately not stacked on this fixture: pytest does not apply
+# marks to fixtures and has deprecated placing them there. The tests that
+# request it carry the skip mark themselves.
+@pytest.fixture(scope="module")
+def detector_real():
+    """Build a detector backed by the real ``facebook/bart-large-mnli`` model."""
+    return HallucinationDetector(model_name="facebook/bart-large-mnli")
+
+
+@skip_if_no_hf
+@pytest.mark.requires("integration")
+def test_extract_claims_integration(detector_real):
+    """The sample text splits into two claims, one about the presidency."""
+    claims = detector_real.extract_claims(TEXT)
+    assert isinstance(claims, list)
+    assert len(claims) == 2
+    assert any("Barack Obama was the 44th President" in c for c in claims)
+
+
+@skip_if_no_hf
+@pytest.mark.requires("integration")
+def test_compute_hallucination_rate_integration(detector_real):
+    """The real model yields a structurally consistent result."""
+    result = detector_real.compute_hallucination_rate(TEXT, EVIDENCE)
+
+    for key in RESULT_KEYS:
+        assert key in result
+
+    total = result["total_claims"]
+    unsupported = result["unsupported_claims"]
+    hallucination_rate = result["hallucination_rate"]
+
+    # Model verdicts may vary, so only check internal consistency.
+    assert total == 2
+    assert 0 <= unsupported <= total
+    assert hallucination_rate == pytest.approx(unsupported / total)
+    assert 0 <= hallucination_rate <= 1
+
+
+# -----------------------------
+# Unit Tests (Mocked)
+# -----------------------------
+def _fake_nli(text):
+    """Entail any input mentioning "President"; contradict everything else."""
+    label = "ENTAILMENT" if "President" in text else "CONTRADICTION"
+    return [{"label": label, "score": 0.9}]
+
+
+@pytest.fixture(scope="module")
+def detector_mock():
+    """Build a detector whose NLI pipeline is a deterministic stub."""
+    target = "langchain.evaluation.hallucination.detector.pipeline"
+    with patch(target) as mock_pipeline:
+        # The constructor calls the patched factory, so no model is loaded.
+        mock_pipeline.return_value = _fake_nli
+        yield HallucinationDetector(model_name="any")
+
+
+def test_extract_claims_mock(detector_mock):
+    """Claim extraction yields two claims for the sample text."""
+    claims = detector_mock.extract_claims(TEXT)
+    assert isinstance(claims, list)
+    assert len(claims) == 2
+
+
+def test_verify_claim_supported_mock(detector_mock):
+    """A claim is supported when the stub reports entailment."""
+    claim = "Barack Obama was the 44th President of the United States"
+    assert detector_mock.verify_claim(claim, EVIDENCE[0]) is True
+
+
+def test_verify_claim_unsupported_mock(detector_mock):
+    """A claim is unsupported when the stub reports contradiction."""
+    claim = "Barack Obama was born in Kenya"
+    assert detector_mock.verify_claim(claim, EVIDENCE[1]) is False
+
+
+def test_compute_hallucination_rate_mock(detector_mock):
+    """The mocked result has the expected keys and consistent values."""
+    result = detector_mock.compute_hallucination_rate(TEXT, EVIDENCE)
+    for key in RESULT_KEYS:
+        assert key in result
+    assert result["total_claims"] == 2
+    assert 0 <= result["unsupported_claims"] <= 2
+    expected_rate = result["unsupported_claims"] / result["total_claims"]
+    assert result["hallucination_rate"] == pytest.approx(expected_rate)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .detector import HallucinationDetector`
	`2`	`+`
	`3`	`+__all__ = ["HallucinationDetector"]`