"""NLI-based hallucination detection for generated text.

``transformers`` is an optional dependency: this module always imports, but
constructing :class:`HallucinationDetector` raises ``ImportError`` without it.
"""

import re
from typing import Any, Callable, Optional

# Optional (not lazy) import: it is attempted once, at module import time, so
# the module stays importable when ``transformers`` is not installed. The
# module-level name ``pipeline`` is what ``__init__`` checks and calls.
pipeline: Optional[Callable[..., Any]] = None
try:
    from transformers import pipeline as _pipeline

    pipeline = _pipeline
except ImportError:
    pass

# A sentence ends at terminal punctuation that is followed by whitespace.
# Requiring the whitespace keeps decimals and dotted tokens ("3.14") intact.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


class HallucinationDetector:
    """Estimate how much of a text is unsupported by evidence documents.

    The detector works in three steps:

    1. Split the text into sentence-level claims.
    2. Classify each claim against each evidence document with a
       ``text-classification`` pipeline (default model:
       ``facebook/bart-large-mnli``).
    3. Report the fraction of claims that no evidence document entails.
    """

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        """Build the classification pipeline for ``model_name``.

        Args:
            model_name: Model identifier passed to ``transformers.pipeline``.

        Raises:
            ImportError: If the ``transformers`` package is not installed.
        """
        if pipeline is None:
            raise ImportError(
                "The 'transformers' package is required for HallucinationDetector. "
                "Install it with `pip install transformers`."
            )
        self.nli_pipeline = pipeline("text-classification", model=model_name)

    def extract_claims(self, text: str) -> list[str]:
        """Split ``text`` into claims, one per sentence.

        Sentences are separated at ``.``, ``!`` or ``?`` followed by
        whitespace. Trailing terminal punctuation is removed from each claim
        and empty fragments are dropped.

        Args:
            text: The text to split.

        Returns:
            The non-empty claims in their original order.
        """
        pieces = _SENTENCE_BOUNDARY.split(text)
        stripped = (piece.strip().rstrip(".!?").rstrip() for piece in pieces)
        return [claim for claim in stripped if claim]

    def verify_claim(self, claim: str, evidence: str) -> bool:
        """Return whether the pipeline labels ``claim`` as entailed.

        Args:
            claim: The statement to check.
            evidence: The document the claim is checked against.

        Returns:
            ``True`` only when the first prediction's label equals
            ``"entailment"`` (case-insensitive). ``False`` when the claim or
            the evidence is blank, or the pipeline returns no prediction.
        """
        # Blank evidence cannot support anything, and a blank claim asserts
        # nothing; skip the model call instead of classifying noise.
        if not claim.strip() or not evidence.strip():
            return False

        # NOTE(review): claim and evidence are sent as one concatenated
        # string. NLI models are normally given a (premise, hypothesis) pair;
        # confirm whether the pipeline's paired text input should be used
        # instead. Left as-is because it changes what the pipeline callable
        # receives.
        result = self.nli_pipeline(f"{claim} {evidence}")

        if isinstance(result, (list, tuple)):
            if not result:
                return False
            result = result[0]
        return str(result.get("label", "")).lower() == "entailment"

    def verify_claim_multi(self, claim: str, evidence_docs: list[str]) -> bool:
        """Return whether at least one evidence document entails ``claim``.

        Evaluation stops at the first supporting document. An empty
        ``evidence_docs`` list yields ``False``.
        """
        return any(self.verify_claim(claim, doc) for doc in evidence_docs)

    def compute_hallucination_rate(
        self, text: str, evidence_docs: list[str]
    ) -> dict[str, float]:
        """Compute the share of claims in ``text`` that lack support.

        Args:
            text: The text whose claims are checked.
            evidence_docs: Documents that may support the claims.

        Returns:
            A dict with ``total_claims``, ``unsupported_claims`` and
            ``hallucination_rate`` (unsupported / total). All three are zero
            when ``text`` contains no claims.
        """
        claims = self.extract_claims(text)
        if not claims:
            # Avoid dividing by zero: no claims means nothing to hallucinate.
            return {
                "total_claims": 0,
                "unsupported_claims": 0,
                "hallucination_rate": 0.0,
            }

        unsupported = sum(
            1 for claim in claims if not self.verify_claim_multi(claim, evidence_docs)
        )
        return {
            "total_claims": len(claims),
            "unsupported_claims": unsupported,
            "hallucination_rate": unsupported / len(claims),
        }
"""Tests for ``HallucinationDetector``.

The integration tests load a real Hugging Face model and are skipped when
``HF_TOKEN`` is not set. The unit tests patch the module-level ``pipeline``
so no model is loaded.
"""

import os
from collections.abc import Generator
from unittest.mock import patch

import pytest

from langchain.evaluation.hallucination.detector import HallucinationDetector

# Sample data shared by the integration and unit tests.
SAMPLE_TEXT = (
    "Barack Obama was the 44th President of the United States. "
    "He was born in Kenya."
)
SAMPLE_EVIDENCE = [
    (
        "Barack Obama served as the 44th President of the United States "
        "from 2009 to 2017."
    ),
    "Barack Obama was born in Hawaii, not Kenya.",
]
RESULT_KEYS = ("total_claims", "unsupported_claims", "hallucination_rate")

# Name patched by the unit tests so the detector never loads a real model.
PIPELINE_TARGET = "langchain.evaluation.hallucination.detector.pipeline"


def _assert_consistent_result(result: dict) -> None:
    """Check the result has every key and that the rate matches the counts."""
    for key in RESULT_KEYS:
        assert key in result

    total = result["total_claims"]
    unsupported = result["unsupported_claims"]
    rate = result["hallucination_rate"]

    assert total == 2
    assert 0 <= unsupported <= total
    assert abs(rate - unsupported / total) < 1e-6
    assert 0 <= rate <= 1


# -----------------------------
# Integration Tests (Real HF model)
# -----------------------------
skip_if_no_hf = pytest.mark.skipif(
    "HF_TOKEN" not in os.environ,
    reason="Hugging Face token not available",
)


# Marks are deliberately NOT applied to this fixture: pytest ignores marks on
# fixtures and deprecates applying them. The tests that request the fixture
# carry the skip marks, so the model is only loaded when they actually run.
@pytest.fixture(scope="module")
def detector_real() -> HallucinationDetector:
    """Detector backed by the real ``facebook/bart-large-mnli`` model."""
    return HallucinationDetector(model_name="facebook/bart-large-mnli")


@skip_if_no_hf
@pytest.mark.requires("integration")
def test_extract_claims_integration(detector_real: HallucinationDetector) -> None:
    claims = detector_real.extract_claims(SAMPLE_TEXT)
    # Check structure and basic logic
    assert isinstance(claims, list)
    assert len(claims) == 2
    # Ensure at least one claim matches expected
    assert any("Barack Obama was the 44th President" in c for c in claims)


@skip_if_no_hf
@pytest.mark.requires("integration")
def test_compute_hallucination_rate_integration(
    detector_real: HallucinationDetector,
) -> None:
    result = detector_real.compute_hallucination_rate(SAMPLE_TEXT, SAMPLE_EVIDENCE)
    _assert_consistent_result(result)


# -----------------------------
# Unit Tests (Mocked)
# -----------------------------
def _fake_nli(text: str) -> list[dict]:
    """Stand-in classifier: entailment iff the input mentions "President"."""
    label = "ENTAILMENT" if "President" in text else "CONTRADICTION"
    return [{"label": label, "score": 0.9}]


@pytest.fixture(scope="module")
def detector_mock() -> Generator[HallucinationDetector, None, None]:
    """Detector whose pipeline is replaced by a deterministic stub."""
    with patch(PIPELINE_TARGET) as mock_pipeline:
        # ``pipeline(...)`` returns the callable the detector stores and calls.
        mock_pipeline.return_value = _fake_nli
        yield HallucinationDetector(model_name="any")  # Model not loaded


def test_extract_claims_mock(detector_mock: HallucinationDetector) -> None:
    claims = detector_mock.extract_claims(SAMPLE_TEXT)
    assert isinstance(claims, list)
    assert len(claims) == 2


def test_verify_claim_supported_mock(detector_mock: HallucinationDetector) -> None:
    claim = "Barack Obama was the 44th President of the United States"
    assert detector_mock.verify_claim(claim, SAMPLE_EVIDENCE[0]) is True


def test_verify_claim_unsupported_mock(detector_mock: HallucinationDetector) -> None:
    claim = "Barack Obama was born in Kenya"
    assert detector_mock.verify_claim(claim, SAMPLE_EVIDENCE[1]) is False


def test_compute_hallucination_rate_mock(detector_mock: HallucinationDetector) -> None:
    result = detector_mock.compute_hallucination_rate(SAMPLE_TEXT, SAMPLE_EVIDENCE)
    # Validate structure and logical consistency
    _assert_consistent_result(result)