3 changes: 3 additions & 0 deletions libs/langchain/langchain/evaluation/hallucination/__init__.py
@@ -0,0 +1,3 @@
from .detector import HallucinationDetector

__all__ = ["HallucinationDetector"]
56 changes: 56 additions & 0 deletions libs/langchain/langchain/evaluation/hallucination/detector.py
@@ -0,0 +1,56 @@
from typing import Callable, Optional, Any

# Lazy import for optional transformers dependency
pipeline: Optional[Callable[..., Any]] = None
try:
    from transformers import pipeline as _pipeline
    pipeline = _pipeline
except ImportError:
    pass


class HallucinationDetector:
"""Simple Hallucination Detector using NLI models (e.g., facebook/bart-large-mnli).
- Extract claims (basic sentence split)
- Verify claims against evidence docs using NLI
- Compute hallucination rate
"""

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        if pipeline is None:
            raise ImportError(
                "The 'transformers' package is required for HallucinationDetector. "
                "Install it with `pip install transformers`."
            )
        self.nli_pipeline = pipeline("text-classification", model=model_name)

    def extract_claims(self, text: str) -> list[str]:
        """Naive sentence-based claim extraction."""
        return [c.strip() for c in text.split(".") if c.strip()]

    def verify_claim(self, claim: str, evidence: str) -> bool:
        """Check whether a claim is supported by the given evidence."""
        # NLI convention: the premise (evidence) should entail the hypothesis (claim);
        # "</s></s>" mirrors the sequence-pair separator used by BART-style NLI models.
        result = self.nli_pipeline(f"{evidence} </s></s> {claim}")
        return result[0]["label"].lower() == "entailment"

    def verify_claim_multi(self, claim: str, evidence_docs: list[str]) -> bool:
        """A claim is supported if any evidence doc entails it."""
        return any(self.verify_claim(claim, e) for e in evidence_docs)

    def compute_hallucination_rate(
        self, text: str, evidence_docs: list[str]
    ) -> dict[str, float]:
        claims = self.extract_claims(text)
        if not claims:
            return {
                "total_claims": 0,
                "unsupported_claims": 0,
                "hallucination_rate": 0.0,
            }

        unsupported = sum(
            not self.verify_claim_multi(c, evidence_docs) for c in claims
        )
        return {
            "total_claims": len(claims),
            "unsupported_claims": unsupported,
            "hallucination_rate": unsupported / len(claims),
        }
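For reference, a minimal usage sketch, assuming the optional `transformers` dependency is installed (e.g. via `pip install "langchain[transformers]"` once the extra below is available) and the default `facebook/bart-large-mnli` checkpoint can be downloaded:

from langchain.evaluation.hallucination import HallucinationDetector

# Loads the facebook/bart-large-mnli NLI model on first use.
detector = HallucinationDetector()

answer = (
    "Barack Obama was the 44th President of the United States. "
    "He was born in Kenya."
)
evidence = [
    "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
    "Barack Obama was born in Hawaii, not Kenya.",
]

report = detector.compute_hallucination_rate(answer, evidence)
# Shape: {"total_claims": 2, "unsupported_claims": <int>, "hallucination_rate": <float>}
print(report)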
2 changes: 2 additions & 0 deletions libs/langchain/pyproject.toml
@@ -39,6 +39,7 @@ aws = ["langchain-aws"]
deepseek = ["langchain-deepseek"]
xai = ["langchain-xai"]
perplexity = ["langchain-perplexity"]
transformers = ["transformers>=4.0.0,<5.0.0"]

[project.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain"
@@ -83,6 +84,7 @@ test_integration = [
"langchainhub>=0.1.16,<1.0.0",
"langchain-core",
"langchain-text-splitters",
"transformers>=4.0.0,<5.0.0",
]
lint = [
"ruff>=0.13.1,<0.14.0",
@@ -0,0 +1,139 @@
import os
from collections.abc import Generator
from unittest.mock import patch

import pytest

from langchain.evaluation.hallucination.detector import HallucinationDetector

# -----------------------------
# Integration Tests (Real HF model)
# -----------------------------
skip_if_no_hf = pytest.mark.skipif(
    "HF_TOKEN" not in os.environ,
    reason="Hugging Face token not available",
)


@pytest.fixture(scope="module")
def detector_real() -> HallucinationDetector:
    """Real-model detector; instantiated only by tests gated on the Hugging Face token."""
    return HallucinationDetector(model_name="facebook/bart-large-mnli")


@skip_if_no_hf
@pytest.mark.requires("integration")
def test_extract_claims_integration(detector_real: HallucinationDetector) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    claims = detector_real.extract_claims(text)
    # Check structure and basic logic
    assert isinstance(claims, list)
    assert len(claims) == 2
    # Ensure at least one claim matches expected
    assert any("Barack Obama was the 44th President" in c for c in claims)


@skip_if_no_hf
@pytest.mark.requires("integration")
def test_compute_hallucination_rate_integration(
    detector_real: HallucinationDetector,
) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    evidence = [
        (
            "Barack Obama served as the 44th President of the United States "
            "from 2009 to 2017."
        ),
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    result = detector_real.compute_hallucination_rate(text, evidence)

    # Validate structure
    for key in ["total_claims", "unsupported_claims", "hallucination_rate"]:
        assert key in result

    total = result["total_claims"]
    unsupported = result["unsupported_claims"]
    hallucination_rate = result["hallucination_rate"]

    assert total == 2
    assert 0 <= unsupported <= total
    assert abs(hallucination_rate - unsupported / total) < 1e-6
    assert 0 <= hallucination_rate <= 1


# -----------------------------
# Unit Tests (Mocked)
# -----------------------------
@pytest.fixture(scope="module")
def detector_mock() -> Generator[HallucinationDetector, None, None]:
"""Mock pipeline to make unit tests deterministic."""
with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline:
# Mock NLI behavior
mock_pipeline.return_value = lambda text: [
{"label": "ENTAILMENT", "score": 0.9}
if "President" in text
else {"label": "CONTRADICTION", "score": 0.9}
]
detector = HallucinationDetector(model_name="any") # Model not loaded
yield detector


def test_extract_claims_mock(detector_mock: HallucinationDetector) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    claims = detector_mock.extract_claims(text)
    assert isinstance(claims, list)
    assert len(claims) == 2


def test_verify_claim_supported_mock(detector_mock: HallucinationDetector) -> None:
    claim = "Barack Obama was the 44th President of the United States"
    evidence = (
        "Barack Obama served as the 44th President of the United States "
        "from 2009 to 2017."
    )
    assert detector_mock.verify_claim(claim, evidence) is True


def test_verify_claim_unsupported_mock(detector_mock: HallucinationDetector) -> None:
    claim = "Barack Obama was born in Kenya"
    evidence = "Barack Obama was born in Hawaii, not Kenya."
    assert detector_mock.verify_claim(claim, evidence) is False


def test_compute_hallucination_rate_mock(detector_mock: HallucinationDetector) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    evidence = [
        (
            "Barack Obama served as the 44th President of the United States "
            "from 2009 to 2017."
        ),
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    result = detector_mock.compute_hallucination_rate(text, evidence)
    # Validate structure and logical consistency
    for key in ["total_claims", "unsupported_claims", "hallucination_rate"]:
        assert key in result
    assert result["total_claims"] == 2
    assert 0 <= result["unsupported_claims"] <= 2
    assert (
        abs(
            result["hallucination_rate"]
            - result["unsupported_claims"] / result["total_claims"]
        )
        < 1e-6
    )