3 changes: 3 additions & 0 deletions libs/langchain/langchain/evaluation/hallucination/__init__.py
@@ -0,0 +1,3 @@
from .detector import HallucinationDetector

__all__ = ["HallucinationDetector"]
56 changes: 56 additions & 0 deletions libs/langchain/langchain/evaluation/hallucination/detector.py
@@ -0,0 +1,56 @@
from typing import Callable, Optional, Any

# Lazy import for optional transformers dependency
pipeline: Optional[Callable[..., Any]] = None
try:
    from transformers import pipeline as _pipeline
    pipeline = _pipeline
except ImportError:
    pass


class HallucinationDetector:
"""Simple Hallucination Detector using NLI models (e.g., facebook/bart-large-mnli).
- Extract claims (basic sentence split)
- Verify claims against evidence docs using NLI
- Compute hallucination rate
"""

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        if pipeline is None:
            raise ImportError(
                "The 'transformers' package is required for HallucinationDetector. "
                "Install it with `pip install transformers`."
            )
        self.nli_pipeline = pipeline("text-classification", model=model_name)

    def extract_claims(self, text: str) -> list[str]:
        """Naive sentence-based claim extraction."""
        return [c.strip() for c in text.split(".") if c.strip()]

    def verify_claim(self, claim: str, evidence: str) -> bool:
        """Check whether a claim is supported by the given evidence."""
        # NLI convention: the premise (evidence) should entail the hypothesis (claim);
        # "</s></s>" mirrors the sequence-pair separator used by BART-style NLI models.
        result = self.nli_pipeline(f"{evidence} </s></s> {claim}")
        return result[0]["label"].lower() == "entailment"

    def verify_claim_multi(self, claim: str, evidence_docs: list[str]) -> bool:
        """A claim is supported if any evidence doc entails it."""
        return any(self.verify_claim(claim, e) for e in evidence_docs)

    def compute_hallucination_rate(
        self, text: str, evidence_docs: list[str]
    ) -> dict[str, float]:
        claims = self.extract_claims(text)
        if not claims:
            return {
                "total_claims": 0,
                "unsupported_claims": 0,
                "hallucination_rate": 0.0,
            }

        unsupported = sum(
            not self.verify_claim_multi(c, evidence_docs) for c in claims
        )
        return {
            "total_claims": len(claims),
            "unsupported_claims": unsupported,
            "hallucination_rate": unsupported / len(claims),
        }
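For reference, a minimal usage sketch, assuming the optional `transformers` dependency is installed (e.g. via `pip install "langchain[transformers]"` once the extra below is available) and the default `facebook/bart-large-mnli` checkpoint can be downloaded:

from langchain.evaluation.hallucination import HallucinationDetector

# Loads the facebook/bart-large-mnli NLI model on first use.
detector = HallucinationDetector()

answer = (
    "Barack Obama was the 44th President of the United States. "
    "He was born in Kenya."
)
evidence = [
    "Barack Obama served as the 44th President of the United States from 2009 to 2017.",
    "Barack Obama was born in Hawaii, not Kenya.",
]

report = detector.compute_hallucination_rate(answer, evidence)
# Shape: {"total_claims": 2, "unsupported_claims": <int>, "hallucination_rate": <float>}
print(report)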
2 changes: 2 additions & 0 deletions libs/langchain/pyproject.toml
@@ -39,6 +39,7 @@ aws = ["langchain-aws"]
deepseek = ["langchain-deepseek"]
xai = ["langchain-xai"]
perplexity = ["langchain-perplexity"]
transformers = ["transformers>=4.0.0,<5.0.0"]

[project.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain"
@@ -83,6 +84,7 @@ test_integration = [
"langchainhub>=0.1.16,<1.0.0",
"langchain-core",
"langchain-text-splitters",
"transformers>=4.0.0,<5.0.0",
]
lint = [
"ruff>=0.13.1,<0.14.0",
@@ -0,0 +1,139 @@
import os
from collections.abc import Generator
from unittest.mock import patch

import pytest

from langchain.evaluation.hallucination.detector import HallucinationDetector

# -----------------------------
# Integration Tests (Real HF model)
# -----------------------------
skip_if_no_hf = pytest.mark.skipif(
    "HF_TOKEN" not in os.environ,
    reason="Hugging Face token not available",
)


@pytest.fixture(scope="module")
def detector_real() -> HallucinationDetector:
    """Real-model detector; instantiated only by tests gated on the Hugging Face token."""
    return HallucinationDetector(model_name="facebook/bart-large-mnli")


@skip_if_no_hf
@pytest.mark.requires("integration")
def test_extract_claims_integration(detector_real: HallucinationDetector) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    claims = detector_real.extract_claims(text)
    # Check structure and basic logic
    assert isinstance(claims, list)
    assert len(claims) == 2
    # Ensure at least one claim matches expected
    assert any("Barack Obama was the 44th President" in c for c in claims)


@skip_if_no_hf
@pytest.mark.requires("integration")
def test_compute_hallucination_rate_integration(
    detector_real: HallucinationDetector,
) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    evidence = [
        (
            "Barack Obama served as the 44th President of the United States "
            "from 2009 to 2017."
        ),
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    result = detector_real.compute_hallucination_rate(text, evidence)

    # Validate structure
    for key in ["total_claims", "unsupported_claims", "hallucination_rate"]:
        assert key in result

    total = result["total_claims"]
    unsupported = result["unsupported_claims"]
    hallucination_rate = result["hallucination_rate"]

    assert total == 2
    assert 0 <= unsupported <= total
    assert abs(hallucination_rate - unsupported / total) < 1e-6
    assert 0 <= hallucination_rate <= 1


# -----------------------------
# Unit Tests (Mocked)
# -----------------------------
@pytest.fixture(scope="module")
def detector_mock() -> Generator[HallucinationDetector, None, None]:
"""Mock pipeline to make unit tests deterministic."""
with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline:
# Mock NLI behavior
mock_pipeline.return_value = lambda text: [
{"label": "ENTAILMENT", "score": 0.9}
if "President" in text
else {"label": "CONTRADICTION", "score": 0.9}
]
detector = HallucinationDetector(model_name="any") # Model not loaded
yield detector


def test_extract_claims_mock(detector_mock: HallucinationDetector) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    claims = detector_mock.extract_claims(text)
    assert isinstance(claims, list)
    assert len(claims) == 2


def test_verify_claim_supported_mock(detector_mock: HallucinationDetector) -> None:
    claim = "Barack Obama was the 44th President of the United States"
    evidence = (
        "Barack Obama served as the 44th President of the United States "
        "from 2009 to 2017."
    )
    assert detector_mock.verify_claim(claim, evidence) is True


def test_verify_claim_unsupported_mock(detector_mock: HallucinationDetector) -> None:
    claim = "Barack Obama was born in Kenya"
    evidence = "Barack Obama was born in Hawaii, not Kenya."
    assert detector_mock.verify_claim(claim, evidence) is False


def test_compute_hallucination_rate_mock(detector_mock: HallucinationDetector) -> None:
    text = (
        "Barack Obama was the 44th President of the United States. "
        "He was born in Kenya."
    )
    evidence = [
        (
            "Barack Obama served as the 44th President of the United States "
            "from 2009 to 2017."
        ),
        "Barack Obama was born in Hawaii, not Kenya.",
    ]
    result = detector_mock.compute_hallucination_rate(text, evidence)
    # Validate structure and logical consistency
    for key in ["total_claims", "unsupported_claims", "hallucination_rate"]:
        assert key in result
    assert result["total_claims"] == 2
    assert 0 <= result["unsupported_claims"] <= 2
    assert (
        abs(
            result["hallucination_rate"]
            - result["unsupported_claims"] / result["total_claims"]
        )
        < 1e-6
    )