2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
100644 → 100755
@@ -66,7 +66,7 @@ repos:
additional_dependencies:
- tomli
# add ignore words list
args: ["-L", "Mor,ans,thirdparty"]
args: ["-L", "Mor,ans,thirdparty", "--skip", "security_scanning/*"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:
7 changes: 6 additions & 1 deletion tensorrt_llm/executor/result.py
@@ -481,7 +481,12 @@ def _handle_response(self, response: "GenerationExecutor.Response"):
else:
beam_output.text = self.tokenizer.decode(
beam_output.token_ids, **kwargs)

# Update _last_token_ids_len after detokenization to prevent
# re-decoding the same tokens in subsequent responses when n > 1.
# Without this, outputs that are not updated in _handle_sequence would keep
# a stale _last_token_ids_len, causing token_ids_diff to return tokens that
# were already emitted.
beam_output._last_token_ids_len = len(beam_output.token_ids)

is_generating = not self._done
is_finished_with_stop_or_length = (
beam_output.finish_reason == 'stop'
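A minimal, hypothetical sketch of the bookkeeping this hunk fixes (simplified names; decode stands in for the tokenizer, and none of this is the actual TensorRT-LLM implementation):

class BeamOutput:
    def __init__(self):
        self.token_ids = []           # all tokens produced so far for this beam
        self._last_token_ids_len = 0  # how many tokens were already surfaced
        self.text = ""

    @property
    def token_ids_diff(self):
        # Only the tokens that arrived since the last report.
        return self.token_ids[self._last_token_ids_len:]

def handle_response(beam_outputs, new_tokens_per_beam, decode):
    # Append new tokens, detokenize, then advance the cursor for every beam.
    for beam, new_tokens in zip(beam_outputs, new_tokens_per_beam):
        beam.token_ids.extend(new_tokens)
        beam.text = decode(beam.token_ids)
        # Without this line, a beam that was not touched by _handle_sequence keeps
        # a stale cursor, and token_ids_diff re-reports tokens it already emitted.
        beam._last_token_ids_len = len(beam.token_ids)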
6 changes: 0 additions & 6 deletions tensorrt_llm/serve/chat_utils.py
@@ -197,9 +197,3 @@ def parse_chat_messages_coroutines(

return conversation, mm_data_tracker.retrieve_all_async(
), mm_placeholder_counts


def check_multiple_response(n: int, backend: Optional[str]):
if n > 1 and backend == "pytorch":
raise ValueError(
"Multiple response is not supported in PyTorch workflow")
6 changes: 1 addition & 5 deletions tensorrt_llm/serve/openai_server.py
@@ -31,8 +31,7 @@
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.logger import logger
from tensorrt_llm.metrics.collector import MetricsCollector
from tensorrt_llm.serve.chat_utils import (check_multiple_response,
parse_chat_messages_coroutines)
from tensorrt_llm.serve.chat_utils import parse_chat_messages_coroutines
from tensorrt_llm.serve.metadata_server import create_metadata_server
from tensorrt_llm.serve.openai_protocol import (ChatCompletionRequest,
ChatCompletionResponse,
@@ -417,7 +416,6 @@ async def create_chat_response(
return chat_response

try:
check_multiple_response(request.n, self.llm.args.backend)
conversation: List[ConversationMessage] = []
tool_dicts = None if request.tools is None else [
tool.model_dump() for tool in request.tools
@@ -524,7 +522,6 @@ async def create_mm_embedding_response(promise: RequestOutput):
)

try:
check_multiple_response(request.n, self.llm.args.backend)
conversation: List[ConversationMessage] = []
tool_dicts = None if request.tools is None else [
tool.model_dump() for tool in request.tools
@@ -651,7 +648,6 @@ async def generator_wrapper(generator: AsyncIterator[Any]):
yield "data: [DONE]\n\n"

try:
check_multiple_response(request.n, self.llm.args.backend)
if isinstance(request.prompt, str) or \
(isinstance(request.prompt, list) and isinstance(request.prompt[0], int)):
prompts = [request.prompt]
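With check_multiple_response removed, a chat request with n > 1 is now forwarded to the backend instead of being rejected up front. A hedged usage sketch against a locally served model (the URL, API key, and model name are placeholders, not values from this PR):

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
chat = client.chat.completions.create(
    model="<served-model-name>",
    messages=[{"role": "user", "content": "what is 1+1?"}],
    max_completion_tokens=10,
    n=2,              # would previously have been rejected by the removed guard
    temperature=0.8,  # sampling, so the two choices can differ
)
assert len(chat.choices) == 2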
78 changes: 77 additions & 1 deletion tests/unittest/llmapi/apps/_test_openai_chat.py
@@ -68,6 +68,8 @@ def server(model_name: str, backend: str, extra_llm_api_options: bool,
temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
if backend == "trt":
args.extend(["--max_beam_width", "4"])
if extra_llm_api_options:
@@ -78,11 +80,34 @@
yield remote_server


@pytest.fixture(scope="module")
def server_with_beam_search(model_name: str, backend: str,
extra_llm_api_options: bool,
temp_extra_llm_api_options_file: str,
num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--max_beam_width", "2"])
if extra_llm_api_options:
args.extend(
["--extra_llm_api_options", temp_extra_llm_api_options_file])
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def client(server: RemoteOpenAIServer):
return server.get_client()


@pytest.fixture(scope="module")
def client_with_beam_search(server_with_beam_search: RemoteOpenAIServer):
return server_with_beam_search.get_client()


@pytest.fixture(scope="module")
def async_client(server: RemoteOpenAIServer):
return server.get_async_client()
@@ -180,7 +205,33 @@ def test_multiple_responses(client: openai.OpenAI, model_name: str,
backend: str):
if backend == "pytorch":
pytest.skip(
"Multiple responses are not supported in PyTorch backend yet")
"'n' not allowed with temperature=0 unless TLLM_ALLOW_N_GREEDY_DECODING=1"
)
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test n and best_of
chat_completion = client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
n=2,
temperature=0.0,
extra_body=dict(best_of=4),
)
assert len(chat_completion.choices) == 2


def test_multiple_responses_and_beam_search(client: openai.OpenAI,
model_name: str, backend: str):
if backend == "pytorch":
pytest.skip(
"Mixing beam search and regular requests is not supported in PyTorch backend"
)

messages = [{
"role": "system",
@@ -202,6 +253,7 @@ def test_multiple_responses(client: openai.OpenAI, model_name: str,
assert chat_completion.choices[
0].message.content != chat_completion.choices[
1].message.content, "beam search should be different"

# test n and best_of
chat_completion = client.chat.completions.create(
model=model_name,
@@ -214,6 +266,30 @@ def test_multiple_responses(client: openai.OpenAI, model_name: str,
assert len(chat_completion.choices) == 2


def test_multiple_responses_with_beam_search(
client_with_beam_search: openai.OpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test beam search
chat_completion = client_with_beam_search.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
n=2,
temperature=0.0,
extra_body=dict(use_beam_search=True),
)
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[
1].message.content, "beam search should be different"


@pytest.mark.asyncio(loop_scope="module")
async def test_chat_streaming(async_client: openai.AsyncOpenAI,
model_name: str):
77 changes: 70 additions & 7 deletions tests/unittest/llmapi/apps/_test_openai_completions.py
@@ -1,10 +1,12 @@
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_completion.py

import itertools
from typing import List

import openai
import pytest
from utils.util import similar

from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer
@@ -33,8 +35,21 @@ def num_postprocess_workers(request):
def server(model_name: str, backend: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
if backend == "trt":
args.extend(["--max_beam_width", "4"])
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def server_with_beam_search(model_name: str, backend: str,
num_postprocess_workers: int):
model_path = get_model_path(model_name)
args = ["--backend", f"{backend}"]
args.extend(["--kv_cache_free_gpu_memory_fraction",
"0.2"]) # for co-existence with other servers
args.extend(["--max_beam_width", "2"])
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server
@@ -50,6 +65,11 @@ def async_client(server: RemoteOpenAIServer):
return server.get_async_client()


@pytest.fixture(scope="module")
def async_client_with_beam_search(server_with_beam_search: RemoteOpenAIServer):
return server_with_beam_search.get_async_client()


def test_single_completion(client: openai.OpenAI, model_name):
completion = client.completions.create(
model=model_name,
@@ -146,12 +166,10 @@ async def test_batch_completions(async_client: openai.AsyncOpenAI, model_name,
@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts",
[["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2])
async def test_batch_completions_beam_search(async_client: openai.AsyncOpenAI,
model_name, prompts, backend):
async def test_batch_completions_beam_search(
async_client_with_beam_search: openai.AsyncOpenAI, model_name, prompts):
# test beam search
if backend == 'pytorch':
pytest.skip("Beam search is not supported in PyTorch backend yet")
batch = await async_client.completions.create(
batch = await async_client_with_beam_search.completions.create(
model=model_name,
prompt=prompts,
n=2,
@@ -189,6 +207,51 @@ async def test_batch_completions_streaming(async_client: openai.AsyncOpenAI,
assert texts[0] == texts[1]


@pytest.mark.asyncio(loop_scope="module")
@pytest.mark.parametrize("prompts", [["Hello, my name is"] * 2])
async def test_batch_completions_with_option_n_streaming(
async_client: openai.AsyncOpenAI, model_name, prompts):
# Use non-stream single generation as reference
completion_ref = await async_client.completions.create(
model=model_name,
prompt=prompts[0],
max_tokens=5,
temperature=0.0001,
)

text_ref = completion_ref.choices[0].text

# test n > 1 completions with streaming
batch = await async_client.completions.create(
model=model_name,
prompt=prompts,
n=3, # number of completions to generate for each prompt.
max_tokens=5,
temperature=0.0001,
stream=True,
)
texts = [""] * 6 # 2 prompts × 3 generations per prompt = 6 choices
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text

assert "" not in texts # Assert all the generations are not empty

# Check all pairs within first request are consistent
for i, j in itertools.combinations(texts[:3], 2):
assert similar(i, j, threshold=0.8)

# Check all pairs within second request are consistent
for i, j in itertools.combinations(texts[3:], 2):
assert similar(i, j, threshold=0.8)

# Check all generations are consistent with the reference
for text in texts:
assert similar(text, text_ref, threshold=0.8)


@pytest.mark.asyncio(loop_scope="module")
async def test_completion_stream_options(async_client: openai.AsyncOpenAI,
model_name: str):