
Commit 2df7a14

Pouyanpi and RobGeada authored
fix: ensure that stop token is not ignored if llm_params is None (#1529)
* Fix ignored stop token if llm_params is None

Signed-off-by: Rob Geada <[email protected]>

* fix(llm): pass stop tokens to ainvoke or invoke instead of bind

Commit 67de947 ("chore(types): Type-clean /actions") introduced a bug where stop tokens were only passed to the LLM when llm_params was truthy. When llm_params was None or empty, stop tokens were completely ignored.

Fix: Restored the original pattern where stop is passed directly to ainvoke() as a kwarg, and .bind() is only used for llm_params:

- stop is now passed to _invoke_with_string_prompt() and _invoke_with_message_list()
- Those functions pass stop=stop to llm.ainvoke()
- .bind() is only called when llm_params is truthy (no stop in bind)

---------

Signed-off-by: Rob Geada <[email protected]>
Co-authored-by: Rob Geada <[email protected]>
1 parent f716ee0 commit 2df7a14
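The behavioral difference, reduced to a minimal sketch (simplified signature; the real llm_call also wires up callbacks, converts message-list prompts, and wraps errors in LLMCallException):

from typing import List, Optional

async def llm_call_sketch(llm, prompt: str, stop: Optional[List[str]] = None, llm_params: Optional[dict] = None):
    # Before the fix, stop was routed through .bind() together with llm_params:
    #     llm.bind(stop=stop, **llm_params) if llm_params ... else llm
    # so a falsy llm_params (None or {}) silently dropped the stop tokens.
    generation_llm = llm.bind(**llm_params) if llm_params else llm
    # After the fix, stop is always forwarded directly to ainvoke(), independent of llm_params.
    return await generation_llm.ainvoke(prompt, stop=stop)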

File tree

- nemoguardrails/actions/llm/utils.py
- tests/test_actions_llm_utils.py
- tests/test_llm_params_e2e.py
- tests/test_tool_calling_utils.py

4 files changed: +47 −12 lines changed

nemoguardrails/actions/llm/utils.py

Lines changed: 7 additions & 7 deletions

@@ -164,14 +164,12 @@ async def llm_call(
     _setup_llm_call_info(llm, model_name, model_provider)
     all_callbacks = _prepare_callbacks(custom_callback_handlers)

-    generation_llm: Union[BaseLanguageModel, Runnable] = (
-        llm.bind(stop=stop, **llm_params) if llm_params and llm is not None else llm
-    )
+    generation_llm: Union[BaseLanguageModel, Runnable] = llm.bind(**llm_params) if llm_params else llm

     if isinstance(prompt, str):
-        response = await _invoke_with_string_prompt(generation_llm, prompt, all_callbacks)
+        response = await _invoke_with_string_prompt(generation_llm, prompt, all_callbacks, stop)
     else:
-        response = await _invoke_with_message_list(generation_llm, prompt, all_callbacks)
+        response = await _invoke_with_message_list(generation_llm, prompt, all_callbacks, stop)

     _store_reasoning_traces(response)
     _store_tool_calls(response)

@@ -206,10 +204,11 @@ async def _invoke_with_string_prompt(
     llm: Union[BaseLanguageModel, Runnable],
     prompt: str,
     callbacks: BaseCallbackManager,
+    stop: Optional[List[str]],
 ):
     """Invoke LLM with string prompt."""
     try:
-        return await llm.ainvoke(prompt, config=RunnableConfig(callbacks=callbacks))
+        return await llm.ainvoke(prompt, config=RunnableConfig(callbacks=callbacks), stop=stop)
     except Exception as e:
         raise LLMCallException(e)

@@ -218,12 +217,13 @@ async def _invoke_with_message_list(
     llm: Union[BaseLanguageModel, Runnable],
     prompt: List[dict],
     callbacks: BaseCallbackManager,
+    stop: Optional[List[str]],
 ):
     """Invoke LLM with message list after converting to LangChain format."""
     messages = _convert_messages_to_langchain_format(prompt)

     try:
-        return await llm.ainvoke(messages, config=RunnableConfig(callbacks=callbacks))
+        return await llm.ainvoke(messages, config=RunnableConfig(callbacks=callbacks), stop=stop)
     except Exception as e:
         raise LLMCallException(e)
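A short usage sketch of the patched call path, assuming an already-configured LangChain llm (the prompt, stop values, and temperature below are illustrative, not taken from the repo):

from nemoguardrails.actions.llm.utils import llm_call

async def demo(llm):
    # stop reaches llm.ainvoke() even though llm_params is omitted (defaults to None)
    no_params = await llm_call(llm, "User: hi\nAssistant:", stop=["User:"])
    # .bind() receives only temperature; stop is still passed straight to ainvoke()
    with_params = await llm_call(llm, "User: hi\nAssistant:", stop=["User:"], llm_params={"temperature": 0.2})
    return no_params, with_params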

tests/test_actions_llm_utils.py

Lines changed: 16 additions & 0 deletions

@@ -532,3 +532,19 @@ def test_store_tool_calls_with_real_aimessage_multiple_tool_calls():
     assert len(tool_calls) == 2
     assert tool_calls[0]["name"] == "foo"
     assert tool_calls[1]["name"] == "bar"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("llm_params", [None, {}])
+async def test_llm_call_stop_tokens_passed_without_llm_params(llm_params):
+    """Stop tokens must be passed to ainvoke even when llm_params is None or empty."""
+    from unittest.mock import AsyncMock, MagicMock
+
+    from nemoguardrails.actions.llm.utils import llm_call
+
+    mock_llm = AsyncMock()
+    mock_llm.ainvoke.return_value = MagicMock(content="response")
+
+    await llm_call(mock_llm, "prompt", stop=["User:"], llm_params=llm_params)
+
+    assert mock_llm.ainvoke.call_args[1]["stop"] == ["User:"]
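The unit test exercises exactly the two falsy values that triggered the regression: with llm_params=None or llm_params={}, the old code skipped .bind() entirely and the stop tokens were lost. AsyncMock records keyword arguments, so mock_llm.ainvoke.call_args[1]["stop"] is the stop kwarg of the single ainvoke() call, and the assertion fails whenever stop is not forwarded.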

tests/test_llm_params_e2e.py

Lines changed: 22 additions & 3 deletions

@@ -46,8 +46,7 @@ def nim_config_content():
     models:
       - type: main
         engine: nim
-        model: meta/llama-3.1-70b-instruct
-        api_base: https://integrate.api.nvidia.com/v1
+        model: meta/llama-3.3-70b-instruct
     """

@@ -197,6 +196,26 @@ async def test_openai_llm_params_streaming(self, openai_config_path):
         content = response.response[-1]["content"]
         assert "1" in content

+    @pytest.mark.asyncio
+    @pytest.mark.skipif(
+        not os.getenv("OPENAI_API_KEY"),
+        reason="OpenAI API key not available for e2e testing",
+    )
+    async def test_openai_stop_tokens_without_llm_params(self, openai_config_path):
+        """Test stop tokens work without llm_params (regression test for 67de94723)."""
+        config = RailsConfig.from_path(openai_config_path)
+        rails = LLMRails(config, verbose=False)
+
+        response = await llm_call(
+            rails.llm,
+            "Count from 1 to 10, one number per line.",
+            stop=["5"],
+            llm_params=None,
+        )
+
+        assert "4" in response
+        assert "5" not in response
+
     @pytest.mark.skipif(
         not LIVE_TEST_MODE,

@@ -392,7 +411,7 @@ async def test_openai_unsupported_params_error_handling(self, openai_config_path
         models:
           - type: main
             engine: openai
-            model: o1-mini
+            model: o3-mini
         """

         with tempfile.TemporaryDirectory() as temp_dir:
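The live assertions rely on how stop sequences behave: generation halts before the stop string is emitted, so with stop=["5"] a model counting from 1 to 10 should produce "4" but never "5". If the stop token were dropped (the pre-fix behavior with llm_params=None), the full count would come back and the "5" not in response check would fail. Like the other e2e tests in this file, it is skipped unless OPENAI_API_KEY is set.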

tests/test_tool_calling_utils.py

Lines changed: 2 additions & 2 deletions

@@ -249,7 +249,7 @@ async def test_llm_call_with_llm_params():
     result = await llm_call(mock_llm, "Test prompt", llm_params=llm_params)

     assert result == "LLM response with params"
-    mock_llm.bind.assert_called_once_with(stop=None, **llm_params)
+    mock_llm.bind.assert_called_once_with(**llm_params)
     mock_bound_llm.ainvoke.assert_called_once()

@@ -298,7 +298,7 @@ async def test_llm_call_with_llm_params_temperature_max_tokens():
     result = await llm_call(mock_llm, "Test prompt", llm_params=llm_params)

     assert result == "Response with temp and tokens"
-    mock_llm.bind.assert_called_once_with(stop=None, temperature=0.8, max_tokens=50)
+    mock_llm.bind.assert_called_once_with(temperature=0.8, max_tokens=50)
     mock_bound_llm.ainvoke.assert_called_once()
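These two assertion updates follow directly from the utils.py change: .bind() is now called with only the llm_params, so expecting an extra stop=None argument would make the existing tests fail even though the behavior they cover (params bound once, ainvoke called once) is unchanged.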
