Skip to content

Commit ce74658

Browse files
authored
Add tool call handling to TrustworthyRAG (#103)
1 parent cfc6b82 commit ce74658

File tree

6 files changed

+540
-0
lines changed

6 files changed

+540
-0
lines changed

src/cleanlab_tlm/internal/rag.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from __future__ import annotations
2+
3+
from functools import wraps
4+
from typing import TYPE_CHECKING, Any, Callable, TypeVar
5+
6+
from cleanlab_tlm.utils.chat import _TOOL_CALL_TAG_END, _TOOL_CALL_TAG_START
7+
8+
if TYPE_CHECKING:
9+
from collections.abc import Coroutine
10+
11+
# Define type variables for the response types
12+
ResponseT = TypeVar("ResponseT")
13+
14+
15+
def _is_tool_call_response(response_string: str) -> bool:
    """Return True if the response string consists solely of tool call section(s)."""
    text = response_string.strip()

    # Without both the opening and closing tag present, this cannot be a tool call.
    if _TOOL_CALL_TAG_START not in text or _TOOL_CALL_TAG_END not in text:
        return False

    # Strip out every well-formed <start>...<end> tool call section, tags included.
    while True:
        open_idx = text.find(_TOOL_CALL_TAG_START)
        if open_idx == -1:
            break
        close_idx = text.find(_TOOL_CALL_TAG_END, open_idx)
        if close_idx == -1:
            # Unmatched opening tag: stop stripping.
            break
        text = text[:open_idx] + text[close_idx + len(_TOOL_CALL_TAG_END):]

    # A pure tool call response leaves nothing but whitespace behind; any
    # remaining non-whitespace content means regular text was mixed in.
    return not text.strip()
40+
41+
42+
def _handle_tool_call_filtering(
    func: Callable[..., Coroutine[Any, Any, ResponseT]],
) -> Callable[..., Coroutine[Any, Any, ResponseT]]:
    """
    Decorator to handle tool call filtering for scoring methods.

    When a tool call response is detected:
    - Filters out evals whose names are in the instance's exclude set and that
      read the response (these would get None scores)
    - Calls the original method with the remaining evals via a context wrapper
    - Rebuilds the result so every eval appears, with None scores for the
      filtered evals

    This implementation avoids modifying the original instance state to prevent
    race conditions in concurrent async operations.
    """

    # Defined once per decorated method (not per call): a read-only proxy that
    # overrides ``_evals`` without mutating the wrapped instance.
    class _EvalsContextWrapper:
        def __init__(self, original_instance: Any, filtered_evals: list[Any]):
            self._original = original_instance
            self._filtered_evals = filtered_evals

        def __getattr__(self, name: str) -> Any:
            # Only ``_evals`` is overridden; every other attribute is delegated.
            if name == "_evals":
                return self._filtered_evals
            return getattr(self._original, name)

        def __repr__(self) -> str:
            return repr(self._original)

        def __str__(self) -> str:
            return str(self._original)

    @wraps(func)
    async def wrapper(self: Any, **kwargs: Any) -> ResponseT:
        # NOTE(review): assumes ``response`` is a mapping carrying the response
        # text under the "response" key — confirm against callers.
        response = kwargs.get("response", {})
        response_text = response.get("response", "")

        # If not a tool call, just call the original method.
        if not _is_tool_call_response(str(response_text)):
            return await func(self, **kwargs)

        # It's a tool call - determine which evals to process vs. filter.
        # Default behavior (configured at init):
        # - Evals with response_identifier listed in the exclude set get score None
        # - All other evals are still evaluated normally
        exclude_names = set(getattr(self, "_tool_call_eval_exclude_names", None) or ())

        evals_to_process: list[Any] = []
        for eval_obj in self._evals:
            if eval_obj.response_identifier is not None and eval_obj.name in exclude_names:
                continue  # filtered: will be re-added with score None below
            evals_to_process.append(eval_obj)

        # Run the wrapped method against a proxy exposing only the kept evals,
        # leaving the original instance untouched.
        backend_response: ResponseT = await func(
            _EvalsContextWrapper(self, evals_to_process), **kwargs
        )
        # Re-insert the filtered evals (as score None) in canonical eval order.
        return _rebuild_response(backend_response, self._evals)

    return wrapper
113+
114+
115+
def _rebuild_response(backend_response: ResponseT, evals: list[Any]) -> ResponseT:
116+
eval_names = [e.name for e in evals]
117+
ordered = {}
118+
119+
for k, v in backend_response.items(): # type: ignore
120+
if k not in eval_names:
121+
ordered[k] = v
122+
123+
for e in evals:
124+
name = e.name
125+
if name in backend_response: # type: ignore
126+
ordered[name] = backend_response[name] # type: ignore
127+
else:
128+
ordered[name] = {"score": None} # filtered or missing
129+
130+
return ordered # type: ignore

src/cleanlab_tlm/utils/rag.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
_VALID_TLM_QUALITY_PRESETS,
4242
)
4343
from cleanlab_tlm.internal.exception_handling import handle_tlm_exceptions
44+
from cleanlab_tlm.internal.rag import _handle_tool_call_filtering
4445
from cleanlab_tlm.internal.validation import (
4546
_validate_trustworthy_rag_options,
4647
tlm_score_process_response_and_kwargs,
@@ -87,6 +88,10 @@ class TrustworthyRAG(BaseTLM):
8788
To come up with your custom `evals`, we recommend you first run [get_default_evals()](#function-get_default_evals) and then add/remove/modify the returned list.
8889
Each [Eval](#class-eval) in this list provides real-time detection of specific issues in your RAG application based on the user query, retrieved context (documents), and/or LLM-generated response.
8990
Set this to an empty list to only score response trustworthiness without additional evaluations.
91+
92+
Tool call handling: by default, when a tool call response is detected, evaluations that analyze the response content
93+
(those with a `response_identifier`) are assigned `score=None`. You can override this behavior for specific evals via
94+
`_configure_tool_call_eval_overrides()`.
9095
"""
9196

9297
def __init__(
@@ -135,6 +140,35 @@ def __init__(
135140

136141
_validate_trustworthy_rag_options(options=options, initialized_evals=self._evals)
137142

143+
# Optional per-eval tool call overrides
144+
# These are name-based include/exclude sets used only in the _handle_tool_call_filtering decorator
145+
self._configure_tool_call_eval_overrides(exclude_names=[k.name for k in self._evals if k.response_identifier])
146+
147+
def _configure_tool_call_eval_overrides(
148+
self,
149+
*,
150+
exclude_names: Optional[list[str]] = None,
151+
) -> None:
152+
"""Validates and stores tool-call exclusion names.
153+
154+
Only evals that read from the model response (have a non-None `response_identifier`)
155+
are eligible for tool-call filtering. We validate here (configuration boundary) so the
156+
decorator `_handle_tool_call_filtering` can assume a correct set and remain simple.
157+
158+
- If an eval name is in exclude_names, it will be filtered (score=None) during tool call handling.
159+
160+
Args:
161+
exclude_names (list[str] | None): Evaluation names to always filter during tool calls.
162+
"""
163+
names = exclude_names or []
164+
eligible = {e.name for e in self._evals if e.response_identifier is not None}
165+
invalid = [n for n in names if n not in eligible]
166+
if invalid:
167+
raise ValidationError(
168+
f"Invalid eval name(s) for tool-call exclusion (must exist and have response_identifier): {', '.join(invalid)}"
169+
)
170+
self._tool_call_eval_exclude_names = set(names) # membership filter; order/dupes irrelevant
171+
138172
def score(
139173
self,
140174
*,
@@ -434,6 +468,7 @@ async def _batch_async(
434468
await gather_task,
435469
)
436470

471+
@_handle_tool_call_filtering
437472
@handle_tlm_exceptions("TrustworthyRAGResponse")
438473
async def _generate_async(
439474
self,
@@ -480,6 +515,7 @@ async def _generate_async(
480515
),
481516
)
482517

518+
@_handle_tool_call_filtering
483519
@handle_tlm_exceptions("TrustworthyRAGScore")
484520
async def _score_async(
485521
self,

tests/internal/__init__.py

Whitespace-only changes.

tests/internal/test_rag.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
from typing import Any
2+
from unittest import mock
3+
4+
from cleanlab_tlm.utils.rag import TrustworthyRAG
5+
from tests.test_tlm_rag import (
6+
test_context,
7+
test_prompt,
8+
test_query,
9+
test_response,
10+
trustworthy_rag, # noqa: F401
11+
trustworthy_rag_api_key, # noqa: F401
12+
)
13+
14+
15+
def test_decorator_skips_bulk_logic_for_non_tool_calls(trustworthy_rag: TrustworthyRAG) -> None:  # noqa: F811
    """Tests that the _handle_tool_call_filtering decorator skips the bulk of its logic for non-tool calls.

    Expected:
    - When _is_tool_call_response returns False, the decorator should skip eval filtering logic
    - The original _evals should not be modified during execution
    - No None scores should be added for tool call filtered evals
    """
    # Snapshot the evals by value and identity so we can prove they were untouched.
    original_evals = trustworthy_rag._evals.copy()
    original_evals_id = id(trustworthy_rag._evals)

    with mock.patch("cleanlab_tlm.internal.rag._is_tool_call_response", return_value=False) as mock_is_tool_call:
        # Record every assignment to ``_evals`` (none should happen for non-tool calls).
        evals_modifications: list[tuple[str, Any, int]] = []
        original_setattr = object.__setattr__

        def track_evals_setattr(self: Any, name: str, value: Any) -> Any:
            if name == "_evals" and hasattr(self, "_evals"):
                evals_modifications.append((name, value, id(value)))
            return original_setattr(self, name, value)

        with mock.patch.object(type(trustworthy_rag), "__setattr__", track_evals_setattr):
            response = trustworthy_rag.score(
                query=test_query,
                context=test_context,
                response=test_response,
                prompt=test_prompt,
            )

        # The decorator was entered and consulted the tool-call check.
        assert mock_is_tool_call.call_count > 0

        # No temporary reassignment of _evals occurred (bulk logic was skipped):
        # only the original object (by identity) may ever have been assigned.
        evals_temp_modifications = [mod for mod in evals_modifications if mod[2] != original_evals_id]
        assert not evals_temp_modifications, f"Evals were temporarily modified: {evals_temp_modifications}"

        # Evals are unchanged after the call, by value and by identity.
        assert trustworthy_rag._evals == original_evals
        assert id(trustworthy_rag._evals) == original_evals_id

        # Non-tool calls must yield real scores, never the None scores produced
        # by tool call filtering (trustworthiness may legitimately be disabled,
        # so it is skipped rather than guarded with a tautological condition).
        assert isinstance(response, dict)
        for eval_name, eval_data in response.items():
            if eval_name != "trustworthiness":
                assert eval_data["score"] is not None
64+
65+
66+
def test_decorator_calls_api_with_full_evals_for_non_tool_calls(trustworthy_rag_api_key: str) -> None:  # noqa: F811
    """Decorator should pass full evals to API for non-tool-call responses.

    Expected:
    - When _is_tool_call_response returns False, the decorator should call the underlying API
      with the complete _evals parameter (no filtering applied).
    """
    tlm_rag = TrustworthyRAG(api_key=trustworthy_rag_api_key)
    # Keep a copy of the configured evals to compare against the API call.
    expected_evals = tlm_rag._evals.copy()

    with (
        mock.patch("cleanlab_tlm.internal.rag._is_tool_call_response", return_value=False),
        mock.patch("cleanlab_tlm.internal.api.api.tlm_rag_score") as mock_api_score,
    ):
        # Backend returns a plausible score payload for every configured eval.
        mock_api_score.return_value = {eval_name: {"score": 0.8, "reason": "test"} for eval_name in expected_evals}

        response = tlm_rag.score(
            query=test_query,
            context=test_context,
            response=test_response,
        )

    # Exactly one backend call was made, carrying the unfiltered eval list.
    assert mock_api_score.call_count == 1
    call_args = mock_api_score.call_args
    assert call_args is not None
    assert call_args.kwargs.get("evals") == expected_evals

    # A normal, fully scored response comes back.
    assert isinstance(response, dict)
    for eval_dict in response.values():
        assert isinstance(eval_dict["score"], float)
106+
107+
108+
def test_ordering_preserved_for_non_tool_calls(trustworthy_rag_api_key: str) -> None:  # noqa: F811
    """When not a tool call, ordering should match exactly what the mocked api.tlm_rag_score returns."""
    tlm_rag = TrustworthyRAG(api_key=trustworthy_rag_api_key)

    # Backend payload with a deliberately custom insertion order, so we can
    # detect any re-ordering on the way out.
    mocked_backend = {
        "trustworthiness": {"score": 0.91},
        "query_ease": {"score": 0.11},
        "context_sufficiency": {"score": 0.22},
        "response_helpfulness": {"score": 0.33},
        "response_groundedness": {"score": 0.44},
    }

    with (
        mock.patch("cleanlab_tlm.internal.rag._is_tool_call_response", return_value=False),
        mock.patch("cleanlab_tlm.internal.api.api.tlm_rag_score", return_value=mocked_backend),
    ):
        result = tlm_rag.score(
            query=test_query,
            context=test_context,
            response=test_response,
        )

    # Key order must be exactly the backend's insertion order.
    assert isinstance(result, dict)
    assert list(result.keys()) == list(mocked_backend.keys())
134+
135+
136+
def test_ordering_rebuilt_for_tool_calls(trustworthy_rag_api_key: str) -> None:  # noqa: F811
    """For tool calls, non-eval keys keep backend order, then all evals in self._evals order with filtered as None."""
    tlm_rag = TrustworthyRAG(api_key=trustworthy_rag_api_key)

    # Sanity-check the default eval ordering this test relies on.
    eval_order = [e.name for e in tlm_rag._evals]
    assert eval_order == [
        "context_sufficiency",
        "response_groundedness",
        "response_helpfulness",
        "query_ease",
    ]

    # During tool calls the decorator filters response-based evals, so the
    # backend only scores the rest; the processed evals are deliberately put
    # in a non-canonical order to prove the rebuild re-imposes eval_order.
    mocked_backend_processed = {
        "trustworthiness": {"score": 0.9},
        "query_ease": {"score": 0.5},
        "context_sufficiency": {"score": 0.8},
    }

    with (
        mock.patch("cleanlab_tlm.internal.rag._is_tool_call_response", return_value=True),
        mock.patch("cleanlab_tlm.internal.api.api.tlm_rag_score", return_value=mocked_backend_processed),
    ):
        result = tlm_rag.score(
            query=test_query,
            context=test_context,
            response=test_response,
            prompt=test_prompt,
        )

    assert isinstance(result, dict)

    # Non-eval keys (trustworthiness) come first in backend order, then every
    # eval in canonical eval_order.
    assert list(result.keys()) == ["trustworthiness", *eval_order]

    # Filtered response-based evals carry None scores; the rest keep the
    # backend's scores untouched.
    assert result["response_groundedness"]["score"] is None
    assert result["response_helpfulness"]["score"] is None
    assert result["query_ease"]["score"] == mocked_backend_processed["query_ease"]["score"]
    assert result["context_sufficiency"]["score"] == mocked_backend_processed["context_sufficiency"]["score"]

0 commit comments

Comments
 (0)