From cef02408ddc79ba57dbfa5765e293c2f2f6c1328 Mon Sep 17 00:00:00 2001 From: Vipul Date: Sun, 5 Oct 2025 23:34:35 +0530 Subject: [PATCH 1/9] feat: add HallucinationDetector for evaluating hallucinations (#33191) --- .../evaluation/hallucination/__init__.py | 3 + .../evaluation/hallucination/detector.py | 38 ++++ libs/langchain/pyproject.toml | 4 + .../evaluation/hallucination/__init__.py | 0 .../evaluation/hallucination/test_detector.py | 207 ++++++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 libs/langchain/langchain/evaluation/hallucination/__init__.py create mode 100644 libs/langchain/langchain/evaluation/hallucination/detector.py create mode 100644 libs/langchain/tests/integration_tests/evaluation/hallucination/__init__.py create mode 100644 libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py diff --git a/libs/langchain/langchain/evaluation/hallucination/__init__.py b/libs/langchain/langchain/evaluation/hallucination/__init__.py new file mode 100644 index 0000000000000..dde5106ca1e31 --- /dev/null +++ b/libs/langchain/langchain/evaluation/hallucination/__init__.py @@ -0,0 +1,3 @@ +from .detector import HallucinationDetector + +__all__ = ["HallucinationDetector"] diff --git a/libs/langchain/langchain/evaluation/hallucination/detector.py b/libs/langchain/langchain/evaluation/hallucination/detector.py new file mode 100644 index 0000000000000..6db27161886d4 --- /dev/null +++ b/libs/langchain/langchain/evaluation/hallucination/detector.py @@ -0,0 +1,38 @@ +from typing import List, Dict +from transformers import pipeline + +class HallucinationDetector: + """ + Simple Hallucination Detector using NLI models (e.g., facebook/bart-large-mnli). + - Extract claims (basic sentence split) + - Verify claims against evidence docs using NLI + - Compute hallucination rate + """ + + def __init__(self, model_name: str = "facebook/bart-large-mnli"): + self.nli_pipeline = pipeline("text-classification", model=model_name) + + def extract_claims(self, text: str) -> List[str]: + """Naive sentence-based claim extraction""" + return [c.strip() for c in text.split(".") if c.strip()] + + def verify_claim(self, claim: str, evidence: str) -> bool: + """Check if a claim is supported by given evidence""" + result = self.nli_pipeline(f"{claim} {evidence}") + return result[0]["label"].lower() == "entailment" + + def verify_claim_multi(self, claim: str, evidence_docs: List[str]) -> bool: + """A claim is supported if any evidence doc entails it""" + return any(self.verify_claim(claim, e) for e in evidence_docs) + + def compute_hallucination_rate(self, text: str, evidence_docs: List[str]) -> Dict[str, float]: + claims = self.extract_claims(text) + if not claims: + return {"total_claims": 0, "unsupported_claims": 0, "hallucination_rate": 0.0} + + unsupported = sum(not self.verify_claim_multi(c, evidence_docs) for c in claims) + return { + "total_claims": len(claims), + "unsupported_claims": unsupported, + "hallucination_rate": unsupported / len(claims), + } diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 712f7190edac3..3e4db79137df1 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -39,6 +39,8 @@ together = ["langchain-together"] # deepseek = ["langchain-deepseek"] # xai = ["langchain-xai"] # perplexity = ["langchain-perplexity"] +# hallucination = ["transformers", "torch"] + [project.urls] "Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain" @@ -83,6 +85,8 @@ test_integration = [ "langchainhub>=0.1.16,<1.0.0", "langchain-core", "langchain-text-splitters", + "transformers>=4.35.0,<5.0.0", + "torch>=2.1.0,<3.0.0", ] lint = [ "ruff>=0.13.1,<0.14.0", diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/__init__.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py new file mode 100644 index 0000000000000..9b42e83c7bf43 --- /dev/null +++ b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py @@ -0,0 +1,207 @@ +# import os +# import pytest +# from unittest.mock import patch + +# from langchain.evaluation.hallucination.detector import HallucinationDetector + +# # ----------------------------- +# # Integration Tests (Real HF model) +# # ----------------------------- +# skip_if_no_hf = pytest.mark.skipif( +# "HF_TOKEN" not in os.environ, reason="Hugging Face token not available" +# ) + +# @pytest.fixture(scope="module") +# @skip_if_no_hf +# @pytest.mark.requires("integration") +# def detector_real(): +# # Only runs locally if HF token is available +# return HallucinationDetector(model_name="facebook/bart-large-mnli") + + +# @skip_if_no_hf +# @pytest.mark.requires("integration") +# def test_extract_claims_integration(detector_real): +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." +# claims = detector_real.extract_claims(text) +# assert isinstance(claims, list) +# assert len(claims) == 2 +# assert "Barack Obama was the 44th President of the United States" in claims + + +# @skip_if_no_hf +# @pytest.mark.requires("integration") +# def test_compute_hallucination_rate_integration(detector_real): +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." +# evidence = [ +# "Barack Obama served as the 44th President of the United States from 2009 to 2017.", +# "Barack Obama was born in Hawaii, not Kenya." +# ] +# result = detector_real.compute_hallucination_rate(text, evidence) +# unsupported = result["unsupported_claims"] +# total = result["total_claims"] +# hallucination_rate = result["hallucination_rate"] + +# assert "total_claims" in result +# assert "unsupported_claims" in result +# assert "hallucination_rate" in result +# assert result["total_claims"] == 2 +# assert unsupported in [1, 2] # Accepts both possible outputs +# assert 0 <= hallucination_rate <= 1 # Just check it’s a valid rate + + +# # ----------------------------- +# # Unit Tests (Mocked) +# # ----------------------------- +# # Unit test fixture +# @pytest.fixture(scope="module") +# def detector_mock(): +# with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline: +# # mock NLI results +# mock_pipeline.return_value = lambda text: [ +# {"label": "ENTAILMENT", "score": 0.9} if "President" in text else {"label": "CONTRADICTION", "score": 0.9} +# ] +# # Now constructor won't load HF model +# detector = HallucinationDetector(model_name="any") +# yield detector + + +# def test_extract_claims_mock(detector_mock): +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." +# claims = detector_mock.extract_claims(text) +# assert isinstance(claims, list) +# assert len(claims) == 2 + + +# def test_verify_claim_supported_mock(detector_mock): +# claim = "Barack Obama was the 44th President of the United States" +# evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017." +# result = detector_mock.verify_claim(claim, evidence) +# assert result is True + + +# def test_verify_claim_unsupported_mock(detector_mock): +# claim = "Barack Obama was born in Kenya" +# evidence = "Barack Obama was born in Hawaii, not Kenya." +# result = detector_mock.verify_claim(claim, evidence) +# assert result is False + + +# def test_compute_hallucination_rate_mock(detector_mock): +# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." +# evidence = [ +# "Barack Obama served as the 44th President of the United States from 2009 to 2017.", +# "Barack Obama was born in Hawaii, not Kenya." +# ] +# result = detector_mock.compute_hallucination_rate(text, evidence) +# assert "total_claims" in result +# assert "unsupported_claims" in result +# assert "hallucination_rate" in result + + + +import os +import pytest +from unittest.mock import patch + +from langchain.evaluation.hallucination.detector import HallucinationDetector + +# ----------------------------- +# Integration Tests (Real HF model) +# ----------------------------- +skip_if_no_hf = pytest.mark.skipif( + "HF_TOKEN" not in os.environ, reason="Hugging Face token not available" +) + +@pytest.fixture(scope="module") +@skip_if_no_hf +@pytest.mark.requires("integration") +def detector_real(): + """Runs only if Hugging Face token is available.""" + return HallucinationDetector(model_name="facebook/bart-large-mnli") + + +@skip_if_no_hf +@pytest.mark.requires("integration") +def test_extract_claims_integration(detector_real): + text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + claims = detector_real.extract_claims(text) + # Check structure and basic logic + assert isinstance(claims, list) + assert len(claims) == 2 + # Ensure at least one claim matches expected + assert any("Barack Obama was the 44th President" in c for c in claims) + + +@skip_if_no_hf +@pytest.mark.requires("integration") +def test_compute_hallucination_rate_integration(detector_real): + text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + evidence = [ + "Barack Obama served as the 44th President of the United States from 2009 to 2017.", + "Barack Obama was born in Hawaii, not Kenya." + ] + result = detector_real.compute_hallucination_rate(text, evidence) + + # Validate structure + for key in ["total_claims", "unsupported_claims", "hallucination_rate"]: + assert key in result + + total = result["total_claims"] + unsupported = result["unsupported_claims"] + hallucination_rate = result["hallucination_rate"] + + # Check logical consistency + assert total == 2 + assert 0 <= unsupported <= total # Dynamic check + assert abs(hallucination_rate - unsupported / total) < 1e-6 # Matches formula + assert 0 <= hallucination_rate <= 1 + + +# ----------------------------- +# Unit Tests (Mocked) +# ----------------------------- +@pytest.fixture(scope="module") +def detector_mock(): + """Mock pipeline to make unit tests deterministic.""" + with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline: + # Mock NLI behavior + mock_pipeline.return_value = lambda text: [ + {"label": "ENTAILMENT", "score": 0.9} if "President" in text else {"label": "CONTRADICTION", "score": 0.9} + ] + detector = HallucinationDetector(model_name="any") # Model not loaded + yield detector + + +def test_extract_claims_mock(detector_mock): + text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + claims = detector_mock.extract_claims(text) + assert isinstance(claims, list) + assert len(claims) == 2 + + +def test_verify_claim_supported_mock(detector_mock): + claim = "Barack Obama was the 44th President of the United States" + evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017." + assert detector_mock.verify_claim(claim, evidence) is True + + +def test_verify_claim_unsupported_mock(detector_mock): + claim = "Barack Obama was born in Kenya" + evidence = "Barack Obama was born in Hawaii, not Kenya." + assert detector_mock.verify_claim(claim, evidence) is False + + +def test_compute_hallucination_rate_mock(detector_mock): + text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + evidence = [ + "Barack Obama served as the 44th President of the United States from 2009 to 2017.", + "Barack Obama was born in Hawaii, not Kenya." + ] + result = detector_mock.compute_hallucination_rate(text, evidence) + # Validate structure and logical consistency + for key in ["total_claims", "unsupported_claims", "hallucination_rate"]: + assert key in result + assert result["total_claims"] == 2 + assert 0 <= result["unsupported_claims"] <= 2 + assert abs(result["hallucination_rate"] - result["unsupported_claims"] / result["total_claims"]) < 1e-6 From df5cb4f36974d5aed94f3738977e75e10c078509 Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 21:48:40 +0530 Subject: [PATCH 2/9] pyproject changed --- libs/langchain/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 3e4db79137df1..2f189a3dd9b28 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -75,6 +75,8 @@ test = [ "packaging>=24.2.0,<26.0.0", "numpy>=1.26.4; python_version<'3.13'", "numpy>=2.1.0; python_version>='3.13'", + "transformers>=4.35.0,<5.0.0", + "torch>=2.1.0,<3.0.0", ] test_integration = [ "vcrpy>=7.0.0,<8.0.0", From c1d99ea6fd5c420660d932447f721c1df769e8af Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 21:55:19 +0530 Subject: [PATCH 3/9] commented code has been removed --- .../evaluation/hallucination/test_detector.py | 102 ------------------ 1 file changed, 102 deletions(-) diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py index 9b42e83c7bf43..c23ef146257ac 100644 --- a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py +++ b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py @@ -1,105 +1,3 @@ -# import os -# import pytest -# from unittest.mock import patch - -# from langchain.evaluation.hallucination.detector import HallucinationDetector - -# # ----------------------------- -# # Integration Tests (Real HF model) -# # ----------------------------- -# skip_if_no_hf = pytest.mark.skipif( -# "HF_TOKEN" not in os.environ, reason="Hugging Face token not available" -# ) - -# @pytest.fixture(scope="module") -# @skip_if_no_hf -# @pytest.mark.requires("integration") -# def detector_real(): -# # Only runs locally if HF token is available -# return HallucinationDetector(model_name="facebook/bart-large-mnli") - - -# @skip_if_no_hf -# @pytest.mark.requires("integration") -# def test_extract_claims_integration(detector_real): -# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." -# claims = detector_real.extract_claims(text) -# assert isinstance(claims, list) -# assert len(claims) == 2 -# assert "Barack Obama was the 44th President of the United States" in claims - - -# @skip_if_no_hf -# @pytest.mark.requires("integration") -# def test_compute_hallucination_rate_integration(detector_real): -# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." -# evidence = [ -# "Barack Obama served as the 44th President of the United States from 2009 to 2017.", -# "Barack Obama was born in Hawaii, not Kenya." -# ] -# result = detector_real.compute_hallucination_rate(text, evidence) -# unsupported = result["unsupported_claims"] -# total = result["total_claims"] -# hallucination_rate = result["hallucination_rate"] - -# assert "total_claims" in result -# assert "unsupported_claims" in result -# assert "hallucination_rate" in result -# assert result["total_claims"] == 2 -# assert unsupported in [1, 2] # Accepts both possible outputs -# assert 0 <= hallucination_rate <= 1 # Just check it’s a valid rate - - -# # ----------------------------- -# # Unit Tests (Mocked) -# # ----------------------------- -# # Unit test fixture -# @pytest.fixture(scope="module") -# def detector_mock(): -# with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline: -# # mock NLI results -# mock_pipeline.return_value = lambda text: [ -# {"label": "ENTAILMENT", "score": 0.9} if "President" in text else {"label": "CONTRADICTION", "score": 0.9} -# ] -# # Now constructor won't load HF model -# detector = HallucinationDetector(model_name="any") -# yield detector - - -# def test_extract_claims_mock(detector_mock): -# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." -# claims = detector_mock.extract_claims(text) -# assert isinstance(claims, list) -# assert len(claims) == 2 - - -# def test_verify_claim_supported_mock(detector_mock): -# claim = "Barack Obama was the 44th President of the United States" -# evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017." -# result = detector_mock.verify_claim(claim, evidence) -# assert result is True - - -# def test_verify_claim_unsupported_mock(detector_mock): -# claim = "Barack Obama was born in Kenya" -# evidence = "Barack Obama was born in Hawaii, not Kenya." -# result = detector_mock.verify_claim(claim, evidence) -# assert result is False - - -# def test_compute_hallucination_rate_mock(detector_mock): -# text = "Barack Obama was the 44th President of the United States. He was born in Kenya." -# evidence = [ -# "Barack Obama served as the 44th President of the United States from 2009 to 2017.", -# "Barack Obama was born in Hawaii, not Kenya." -# ] -# result = detector_mock.compute_hallucination_rate(text, evidence) -# assert "total_claims" in result -# assert "unsupported_claims" in result -# assert "hallucination_rate" in result - - - import os import pytest from unittest.mock import patch From 75ce98a3a8589812a26eb2d39cd342012803d544 Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 22:12:56 +0530 Subject: [PATCH 4/9] changes --- libs/langchain/pyproject.toml | 69 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 2f189a3dd9b28..2e97c53c46539 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -1,13 +1,13 @@ [build-system] -requires = ["pdm-backend"] -build-backend = "pdm.backend" +requires = ["hatchling"] +build-backend = "hatchling.build" [project] authors = [] license = { text = "MIT" } requires-python = ">=3.10.0,<4.0.0" dependencies = [ - "langchain-core>=0.3.72,<1.0.0", + "langchain-core>=0.3.72,<2.0.0", "langchain-text-splitters>=0.3.9,<1.0.0", "langsmith>=0.1.17,<1.0.0", "pydantic>=2.7.4,<3.0.0", @@ -16,31 +16,29 @@ dependencies = [ "PyYAML>=5.3.0,<7.0.0", "async-timeout>=4.0.0,<5.0.0; python_version < \"3.11\"", ] -name = "langchain" +name = "langchain-classic" version = "0.3.27" description = "Building applications with LLMs through composability" readme = "README.md" [project.optional-dependencies] -community = ["langchain-community"] +# community = ["langchain-community"] anthropic = ["langchain-anthropic"] openai = ["langchain-openai"] -# azure-ai = ["langchain-azure-ai"] -# cohere = ["langchain-cohere"] +#azure-ai = ["langchain-azure-ai"] +#cohere = ["langchain-cohere"] google-vertexai = ["langchain-google-vertexai"] google-genai = ["langchain-google-genai"] -# fireworks = ["langchain-fireworks"] -# ollama = ["langchain-ollama"] +fireworks = ["langchain-fireworks"] +ollama = ["langchain-ollama"] together = ["langchain-together"] -# mistralai = ["langchain-mistralai"] -# huggingface = ["langchain-huggingface"] -# groq = ["langchain-groq"] -# aws = ["langchain-aws"] -# deepseek = ["langchain-deepseek"] -# xai = ["langchain-xai"] -# perplexity = ["langchain-perplexity"] -# hallucination = ["transformers", "torch"] - +mistralai = ["langchain-mistralai"] +#huggingface = ["langchain-huggingface"] +groq = ["langchain-groq"] +aws = ["langchain-aws"] +deepseek = ["langchain-deepseek"] +xai = ["langchain-xai"] +perplexity = ["langchain-perplexity"] [project.urls] "Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain" @@ -52,31 +50,29 @@ test = [ "pytest>=8.0.0,<9.0.0", "pytest-cov>=4.0.0,<5.0.0", "pytest-dotenv>=0.5.2,<1.0.0", - "duckdb-engine>=0.9.2,<1.0.0", "pytest-watcher>=0.2.6,<1.0.0", + "pytest-asyncio>=0.23.2,<1.0.0", + "pytest-mock>=3.10.0,<4.0.0", + "pytest-socket>=0.6.0,<1.0.0", + "pytest-xdist<4.0.0,>=3.6.1", + "numpy>=1.26.4; python_version<'3.13'", + "numpy>=2.1.0; python_version>='3.13'", + "cffi<1.17.1; python_version < \"3.10\"", + "cffi; python_version >= \"3.10\"", + "duckdb-engine>=0.9.2,<1.0.0", "freezegun>=1.2.2,<2.0.0", "responses>=0.22.0,<1.0.0", - "pytest-asyncio>=0.23.2,<1.0.0", "lark>=1.1.5,<2.0.0", "pandas>=2.0.0,<3.0.0", - "pytest-mock>=3.10.0,<4.0.0", - "pytest-socket>=0.6.0,<1.0.0", "syrupy>=4.0.2,<5.0.0", "requests-mock>=1.11.0,<2.0.0", - "pytest-xdist<4.0.0,>=3.6.1", "blockbuster>=1.5.18,<1.6.0", - "cffi<1.17.1; python_version < \"3.10\"", - "cffi; python_version >= \"3.10\"", + "toml>=0.10.2,<1.0.0", + "packaging>=24.2.0,<26.0.0", "langchain-tests", "langchain-core", "langchain-text-splitters", "langchain-openai", - "toml>=0.10.2,<1.0.0", - "packaging>=24.2.0,<26.0.0", - "numpy>=1.26.4; python_version<'3.13'", - "numpy>=2.1.0; python_version>='3.13'", - "transformers>=4.35.0,<5.0.0", - "torch>=2.1.0,<3.0.0", ] test_integration = [ "vcrpy>=7.0.0,<8.0.0", @@ -87,8 +83,6 @@ test_integration = [ "langchainhub>=0.1.16,<1.0.0", "langchain-core", "langchain-text-splitters", - "transformers>=4.35.0,<5.0.0", - "torch>=2.1.0,<3.0.0", ] lint = [ "ruff>=0.13.1,<0.14.0", @@ -97,17 +91,17 @@ lint = [ ] typing = [ "mypy>=1.15.0,<1.16.0", + "mypy-protobuf>=3.0.0,<4.0.0", "types-pyyaml>=6.0.12.2,<7.0.0.0", "types-requests>=2.28.11.5,<3.0.0.0", "types-toml>=0.10.8.1,<1.0.0.0", "types-redis>=4.3.21.6,<5.0.0.0", "types-pytz>=2023.3.0.0,<2024.0.0.0", "types-chardet>=5.0.4.6,<6.0.0.0", - "mypy-protobuf>=3.0.0,<4.0.0", - "langchain-core", - "langchain-text-splitters", "numpy>=1.26.4; python_version < '3.13'", "numpy>=2.1.0; python_version >= '3.13'", + "langchain-core", + "langchain-text-splitters", ] dev = [ "jupyter>=1.0.0,<2.0.0", @@ -125,7 +119,6 @@ langchain-text-splitters = { path = "../text-splitters", editable = true } langchain-openai = { path = "../partners/openai", editable = true } [tool.ruff] -target-version = "py39" exclude = ["tests/integration_tests/examples/non-utf8-encoding.py"] [tool.mypy] @@ -197,7 +190,7 @@ ignore-var-parameters = true # ignore missing documentation for *args and **kwa "scripts/*.py" = [ "INP001", # Not a package ] -"langchain/chains/constitutional_ai/principles.py" = [ +"langchain_classic/chains/constitutional_ai/principles.py" = [ "E501", # Line too long ] "**/retrievers/*time_weighted_retriever.py" = [ From 6814cc7b95fa94b87ee79ed765bea1d03b51b6f6 Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 22:49:27 +0530 Subject: [PATCH 5/9] code fixes according to linter --- libs/langchain/pyproject.toml | 2 + .../evaluation/hallucination/test_detector.py | 44 ++++++++++++------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 2e97c53c46539..362e413aeec3b 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -39,6 +39,7 @@ aws = ["langchain-aws"] deepseek = ["langchain-deepseek"] xai = ["langchain-xai"] perplexity = ["langchain-perplexity"] +transformers = ["transformers>=4.0.0,<5.0.0"] [project.urls] "Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain" @@ -83,6 +84,7 @@ test_integration = [ "langchainhub>=0.1.16,<1.0.0", "langchain-core", "langchain-text-splitters", + "transformers>=4.0.0,<5.0.0", ] lint = [ "ruff>=0.13.1,<0.14.0", diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py index c23ef146257ac..cd7eed0e190f3 100644 --- a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py +++ b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py @@ -1,7 +1,9 @@ import os -import pytest +from collections.abc import Generator from unittest.mock import patch +import pytest + from langchain.evaluation.hallucination.detector import HallucinationDetector # ----------------------------- @@ -11,17 +13,18 @@ "HF_TOKEN" not in os.environ, reason="Hugging Face token not available" ) + @pytest.fixture(scope="module") @skip_if_no_hf @pytest.mark.requires("integration") -def detector_real(): +def detector_real() -> HallucinationDetector: """Runs only if Hugging Face token is available.""" return HallucinationDetector(model_name="facebook/bart-large-mnli") @skip_if_no_hf @pytest.mark.requires("integration") -def test_extract_claims_integration(detector_real): +def test_extract_claims_integration(detector_real: HallucinationDetector) -> None: text = "Barack Obama was the 44th President of the United States. He was born in Kenya." claims = detector_real.extract_claims(text) # Check structure and basic logic @@ -33,11 +36,13 @@ def test_extract_claims_integration(detector_real): @skip_if_no_hf @pytest.mark.requires("integration") -def test_compute_hallucination_rate_integration(detector_real): +def test_compute_hallucination_rate_integration( + detector_real: HallucinationDetector, +) -> None: text = "Barack Obama was the 44th President of the United States. He was born in Kenya." evidence = [ "Barack Obama served as the 44th President of the United States from 2009 to 2017.", - "Barack Obama was born in Hawaii, not Kenya." + "Barack Obama was born in Hawaii, not Kenya.", ] result = detector_real.compute_hallucination_rate(text, evidence) @@ -49,10 +54,9 @@ def test_compute_hallucination_rate_integration(detector_real): unsupported = result["unsupported_claims"] hallucination_rate = result["hallucination_rate"] - # Check logical consistency assert total == 2 - assert 0 <= unsupported <= total # Dynamic check - assert abs(hallucination_rate - unsupported / total) < 1e-6 # Matches formula + assert 0 <= unsupported <= total + assert abs(hallucination_rate - unsupported / total) < 1e-6 assert 0 <= hallucination_rate <= 1 @@ -60,41 +64,43 @@ def test_compute_hallucination_rate_integration(detector_real): # Unit Tests (Mocked) # ----------------------------- @pytest.fixture(scope="module") -def detector_mock(): +def detector_mock() -> Generator[HallucinationDetector, None, None]: """Mock pipeline to make unit tests deterministic.""" with patch("langchain.evaluation.hallucination.detector.pipeline") as mock_pipeline: # Mock NLI behavior mock_pipeline.return_value = lambda text: [ - {"label": "ENTAILMENT", "score": 0.9} if "President" in text else {"label": "CONTRADICTION", "score": 0.9} + {"label": "ENTAILMENT", "score": 0.9} + if "President" in text + else {"label": "CONTRADICTION", "score": 0.9} ] detector = HallucinationDetector(model_name="any") # Model not loaded yield detector -def test_extract_claims_mock(detector_mock): +def test_extract_claims_mock(detector_mock: HallucinationDetector) -> None: text = "Barack Obama was the 44th President of the United States. He was born in Kenya." claims = detector_mock.extract_claims(text) assert isinstance(claims, list) assert len(claims) == 2 -def test_verify_claim_supported_mock(detector_mock): +def test_verify_claim_supported_mock(detector_mock: HallucinationDetector) -> None: claim = "Barack Obama was the 44th President of the United States" evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017." assert detector_mock.verify_claim(claim, evidence) is True -def test_verify_claim_unsupported_mock(detector_mock): +def test_verify_claim_unsupported_mock(detector_mock: HallucinationDetector) -> None: claim = "Barack Obama was born in Kenya" evidence = "Barack Obama was born in Hawaii, not Kenya." assert detector_mock.verify_claim(claim, evidence) is False -def test_compute_hallucination_rate_mock(detector_mock): +def test_compute_hallucination_rate_mock(detector_mock: HallucinationDetector) -> None: text = "Barack Obama was the 44th President of the United States. He was born in Kenya." evidence = [ "Barack Obama served as the 44th President of the United States from 2009 to 2017.", - "Barack Obama was born in Hawaii, not Kenya." + "Barack Obama was born in Hawaii, not Kenya.", ] result = detector_mock.compute_hallucination_rate(text, evidence) # Validate structure and logical consistency @@ -102,4 +108,10 @@ def test_compute_hallucination_rate_mock(detector_mock): assert key in result assert result["total_claims"] == 2 assert 0 <= result["unsupported_claims"] <= 2 - assert abs(result["hallucination_rate"] - result["unsupported_claims"] / result["total_claims"]) < 1e-6 + assert ( + abs( + result["hallucination_rate"] + - result["unsupported_claims"] / result["total_claims"] + ) + < 1e-6 + ) From 29612a513520f17f54051ddb30677871b5e1266f Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 23:11:56 +0530 Subject: [PATCH 6/9] following pep8 --- .../evaluation/hallucination/test_detector.py | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py index cd7eed0e190f3..b7f7044d8aeca 100644 --- a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py +++ b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py @@ -10,7 +10,8 @@ # Integration Tests (Real HF model) # ----------------------------- skip_if_no_hf = pytest.mark.skipif( - "HF_TOKEN" not in os.environ, reason="Hugging Face token not available" + "HF_TOKEN" not in os.environ, + reason="Hugging Face token not available" ) @@ -25,7 +26,10 @@ def detector_real() -> HallucinationDetector: @skip_if_no_hf @pytest.mark.requires("integration") def test_extract_claims_integration(detector_real: HallucinationDetector) -> None: - text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + text = ( + "Barack Obama was the 44th President of the United States. " + "He was born in Kenya." + ) claims = detector_real.extract_claims(text) # Check structure and basic logic assert isinstance(claims, list) @@ -39,9 +43,15 @@ def test_extract_claims_integration(detector_real: HallucinationDetector) -> Non def test_compute_hallucination_rate_integration( detector_real: HallucinationDetector, ) -> None: - text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + text = ( + "Barack Obama was the 44th President of the United States. " + "He was born in Kenya." + ) evidence = [ - "Barack Obama served as the 44th President of the United States from 2009 to 2017.", + ( + "Barack Obama served as the 44th President of the United States " + "from 2009 to 2017." + ), "Barack Obama was born in Hawaii, not Kenya.", ] result = detector_real.compute_hallucination_rate(text, evidence) @@ -78,7 +88,10 @@ def detector_mock() -> Generator[HallucinationDetector, None, None]: def test_extract_claims_mock(detector_mock: HallucinationDetector) -> None: - text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + text = ( + "Barack Obama was the 44th President of the United States. " + "He was born in Kenya." + ) claims = detector_mock.extract_claims(text) assert isinstance(claims, list) assert len(claims) == 2 @@ -86,7 +99,10 @@ def test_extract_claims_mock(detector_mock: HallucinationDetector) -> None: def test_verify_claim_supported_mock(detector_mock: HallucinationDetector) -> None: claim = "Barack Obama was the 44th President of the United States" - evidence = "Barack Obama served as the 44th President of the United States from 2009 to 2017." + evidence = ( + "Barack Obama served as the 44th President of the United States " + "from 2009 to 2017." + ) assert detector_mock.verify_claim(claim, evidence) is True @@ -97,9 +113,15 @@ def test_verify_claim_unsupported_mock(detector_mock: HallucinationDetector) -> def test_compute_hallucination_rate_mock(detector_mock: HallucinationDetector) -> None: - text = "Barack Obama was the 44th President of the United States. He was born in Kenya." + text = ( + "Barack Obama was the 44th President of the United States. " + "He was born in Kenya." + ) evidence = [ - "Barack Obama served as the 44th President of the United States from 2009 to 2017.", + ( + "Barack Obama served as the 44th President of the United States " + "from 2009 to 2017.", + ), "Barack Obama was born in Hawaii, not Kenya.", ] result = detector_mock.compute_hallucination_rate(text, evidence) From 5109f3249a769fa15fa1a4ec40d94a1b9bc876c0 Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 23:48:08 +0530 Subject: [PATCH 7/9] transformer issues get fixed --- .../evaluation/hallucination/detector.py | 35 ++++++++++++++----- .../evaluation/hallucination/test_detector.py | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/libs/langchain/langchain/evaluation/hallucination/detector.py b/libs/langchain/langchain/evaluation/hallucination/detector.py index 6db27161886d4..f772060be6def 100644 --- a/libs/langchain/langchain/evaluation/hallucination/detector.py +++ b/libs/langchain/langchain/evaluation/hallucination/detector.py @@ -1,18 +1,31 @@ -from typing import List, Dict -from transformers import pipeline +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from transformers import pipeline as PipelineType + +# Lazy import for runtime +try: + from transformers import pipeline +except ImportError: + pipeline = None + class HallucinationDetector: - """ - Simple Hallucination Detector using NLI models (e.g., facebook/bart-large-mnli). + """Simple Hallucination Detector using NLI models (e.g., facebook/bart-large-mnli). - Extract claims (basic sentence split) - Verify claims against evidence docs using NLI - Compute hallucination rate """ def __init__(self, model_name: str = "facebook/bart-large-mnli"): + if pipeline is None: + raise ImportError( + "The 'transformers' package is required for HallucinationDetector. " + "Install it with `pip install transformers`." + ) self.nli_pipeline = pipeline("text-classification", model=model_name) - def extract_claims(self, text: str) -> List[str]: + def extract_claims(self, text: str) -> list[str]: """Naive sentence-based claim extraction""" return [c.strip() for c in text.split(".") if c.strip()] @@ -21,14 +34,20 @@ def verify_claim(self, claim: str, evidence: str) -> bool: result = self.nli_pipeline(f"{claim} {evidence}") return result[0]["label"].lower() == "entailment" - def verify_claim_multi(self, claim: str, evidence_docs: List[str]) -> bool: + def verify_claim_multi(self, claim: str, evidence_docs: list[str]) -> bool: """A claim is supported if any evidence doc entails it""" return any(self.verify_claim(claim, e) for e in evidence_docs) - def compute_hallucination_rate(self, text: str, evidence_docs: List[str]) -> Dict[str, float]: + def compute_hallucination_rate( + self, text: str, evidence_docs: list[str] + ) -> dict[str, float]: claims = self.extract_claims(text) if not claims: - return {"total_claims": 0, "unsupported_claims": 0, "hallucination_rate": 0.0} + return { + "total_claims": 0, + "unsupported_claims": 0, + "hallucination_rate": 0.0, + } unsupported = sum(not self.verify_claim_multi(c, evidence_docs) for c in claims) return { diff --git a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py index b7f7044d8aeca..63a658a1f31db 100644 --- a/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py +++ b/libs/langchain/tests/integration_tests/evaluation/hallucination/test_detector.py @@ -120,7 +120,7 @@ def test_compute_hallucination_rate_mock(detector_mock: HallucinationDetector) - evidence = [ ( "Barack Obama served as the 44th President of the United States " - "from 2009 to 2017.", + "from 2009 to 2017." ), "Barack Obama was born in Hawaii, not Kenya.", ] From 7a0bed5a25e721fda346ec2792f8fd746cbe87f2 Mon Sep 17 00:00:00 2001 From: Vipul Date: Mon, 6 Oct 2025 23:58:11 +0530 Subject: [PATCH 8/9] handling pipeline issue --- libs/langchain/langchain/evaluation/hallucination/detector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/langchain/langchain/evaluation/hallucination/detector.py b/libs/langchain/langchain/evaluation/hallucination/detector.py index f772060be6def..b90da31cc2a8b 100644 --- a/libs/langchain/langchain/evaluation/hallucination/detector.py +++ b/libs/langchain/langchain/evaluation/hallucination/detector.py @@ -3,6 +3,8 @@ if TYPE_CHECKING: from transformers import pipeline as PipelineType + +pipeline: "PipelineType" | None = None # type: ignore # Lazy import for runtime try: from transformers import pipeline From 9adee9e5a7c88013c21dc6f3b12b593aa6df1362 Mon Sep 17 00:00:00 2001 From: Vipul Date: Tue, 7 Oct 2025 00:05:46 +0530 Subject: [PATCH 9/9] added typelinter standard pipeline so won't failed during ci --- .../evaluation/hallucination/detector.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/libs/langchain/langchain/evaluation/hallucination/detector.py b/libs/langchain/langchain/evaluation/hallucination/detector.py index b90da31cc2a8b..19265299c7f40 100644 --- a/libs/langchain/langchain/evaluation/hallucination/detector.py +++ b/libs/langchain/langchain/evaluation/hallucination/detector.py @@ -1,15 +1,12 @@ -from typing import TYPE_CHECKING +from typing import Callable, Optional, Any -if TYPE_CHECKING: - from transformers import pipeline as PipelineType - - -pipeline: "PipelineType" | None = None # type: ignore -# Lazy import for runtime +# Lazy import for optional transformers dependency +pipeline: Optional[Callable[..., Any]] = None try: - from transformers import pipeline + from transformers import pipeline as _pipeline + pipeline = _pipeline except ImportError: - pipeline = None + pass class HallucinationDetector: