Commit bc4136f

[https://nvbugs/5427043][fix] cherrypick: request length exceeds max_num_tokens (#7718)
Signed-off-by: Superjomn <[email protected]>
Parent commit: ce6ebf6

File tree: 3 files changed, +30 −1 lines

tensorrt_llm/executor/worker.py

Lines changed: 3 additions & 1 deletion
@@ -932,7 +932,9 @@ def handle_for_ipc_batched(self, responses: List[tllm.Response]) -> None:
 
         for response in responses:
 
-            if self.worker._has_background_error():
+            if isinstance(response, ErrorResponse):
+                pass  # send ErrorResponse directly
+            elif self.worker._has_background_error():
                 response = self.worker._create_error_response(response)
             elif response.has_error():
                 # Convert to ErrorResponse, because tllm.Response cannot be
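Note: the effect of this hunk is that responses which already arrive as ErrorResponse objects are forwarded to the client unchanged, instead of being re-wrapped by the background-error or has_error branches. Below is a minimal, self-contained sketch of that dispatch pattern; the Response/ErrorResponse stand-ins and the handle_batched helper are illustrative assumptions, not the actual tensorrt_llm.executor classes.

# Sketch of the dispatch pattern above (stand-in types, not the real
# tensorrt_llm.executor classes).
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class Response:
    text: str
    error: Optional[str] = None

    def has_error(self) -> bool:
        return self.error is not None


@dataclass
class ErrorResponse:
    error_msg: str


def handle_batched(responses: List[Union[Response, ErrorResponse]],
                   background_error: Optional[str] = None) -> list:
    out = []
    for response in responses:
        if isinstance(response, ErrorResponse):
            pass  # already an error payload; forward it unchanged
        elif background_error is not None:
            # a worker-level failure overrides an otherwise healthy response
            response = ErrorResponse(error_msg=background_error)
        elif response.has_error():
            # per-request failures are converted into the error form
            response = ErrorResponse(error_msg=response.error)
        out.append(response)
    return out


# Example: the first item passes through as-is, the second stays a Response.
print(handle_batched([ErrorResponse("prompt too long"), Response("ok")]))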

tests/unittest/llmapi/apps/_test_openai_completions.py

Lines changed: 12 additions & 0 deletions
@@ -80,6 +80,18 @@ def test_single_completion(client: openai.OpenAI, model_name):
     assert len(completion.choices[0].text) >= 1
 
 
+def test_single_completion_with_too_long_prompt(client: openai.OpenAI,
+                                                model_name):
+    completion = client.completions.create(
+        model=model_name,
+        prompt="Hello, my name is" * 100,
+        max_tokens=5,
+        temperature=0.0,
+    )
+
+    print(completion)
+
+
 @pytest.mark.asyncio(loop_scope="module")
 @pytest.mark.parametrize("echo", [True, False])
 async def test_completion_streaming(async_client: openai.AsyncOpenAI,
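The new test submits a prompt of "Hello, my name is" * 100, which overruns the server's token budget, and simply prints whatever the endpoint returns; with this fix the worker reports the violation instead of failing. As a usage note, a caller can also guard against this case before submitting. The sketch below is an assumption for illustration only: the tokenizer name and the 100-token budget are placeholders, not values taken from the commit.

# Hypothetical client-side guard: count prompt tokens before calling the
# completions endpoint. Tokenizer name and budget are illustrative only.
from transformers import AutoTokenizer

MAX_NUM_TOKENS = 100  # assumed to match the server's max_num_tokens

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")


def fits_budget(prompt: str, max_tokens: int) -> bool:
    # Leave room for the requested completion tokens.
    return len(tokenizer.encode(prompt)) + max_tokens <= MAX_NUM_TOKENS


prompt = "Hello, my name is" * 100
if not fits_budget(prompt, max_tokens=5):
    print("Prompt exceeds the server's token budget; shorten it first.")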

tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 15 additions & 0 deletions
@@ -1,3 +1,4 @@
+import random
 from contextlib import contextmanager, nullcontext
 
 import pytest

@@ -821,3 +822,17 @@ def test_llm_with_proxy_error():
                        match="Mock GenerationExecutorWorker initialization failed"):
         llm = LLM(model=llama_model_path,
                   kv_cache_config=global_kvcache_config)
+
+
+class TestLlmError:
+
+    def test_max_num_token_check(self):
+        """ LLM should raise error when got prompt length exceed the valid range. """
+        llm = LLM(llama_model_path,
+                  kv_cache_config=global_kvcache_config,
+                  max_num_tokens=100)
+
+        with pytest.raises(ValueError,
+                           match="should not exceed max_num_tokens"):
+            ids = [random.randint(10, 100) for _ in range(101)]
+            llm.generate([ids])
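This test pins down the contract: with max_num_tokens=100, a prompt of 101 token ids must make generate raise a ValueError whose message contains "should not exceed max_num_tokens". A minimal sketch of that kind of length check follows; the check_prompt_length helper is a hypothetical stand-in, while the real validation lives inside TensorRT-LLM's request handling.

# Hypothetical stand-in for the length validation the test exercises.
from typing import List


def check_prompt_length(prompt_token_ids: List[int], max_num_tokens: int) -> None:
    # Reject requests whose prompt alone already exceeds the token budget.
    if len(prompt_token_ids) > max_num_tokens:
        raise ValueError(
            f"prompt length ({len(prompt_token_ids)}) should not exceed "
            f"max_num_tokens ({max_num_tokens})")


# Mirrors the test: 101 ids against a budget of 100 raises ValueError.
try:
    check_prompt_length(list(range(101)), max_num_tokens=100)
except ValueError as err:
    print(err)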
