Skip to content

Commit 095b686

Browse files
authored
[TRTLLM-8650][fix] beam search request validation (#8433) (#9228)
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
1 parent 8cd3b49 commit 095b686

File tree

4 files changed

+148
-27
lines changed

4 files changed

+148
-27
lines changed

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def _fetch_and_process_requests(
310310
new_requests)
311311

312312
# Validate and filter requests
313-
new_requests = self._validate_and_filter_requests(new_requests)
313+
new_requests = self._handle_special_queue_items(new_requests)
314314

315315
# Attach Python objects to requests
316316
if py_request_objects and (self.dist.tp_size > 1
@@ -482,11 +482,11 @@ def _handle_request_broadcasting(self,
482482

483483
return new_requests, py_request_objects
484484

485-
def _validate_and_filter_requests(
485+
def _handle_special_queue_items(
486486
self,
487487
new_requests: List[RequestQueueItem]) -> List[RequestQueueItem]:
488-
"""Validate and filter requests, handling shutdown signals."""
489-
valid_new_requests = []
488+
"""Handle special signals."""
489+
accepted_new_requests = []
490490
for idx, req_item in enumerate(new_requests):
491491
if req_item.is_shutdown_request:
492492
self.is_shutdown = True
@@ -499,17 +499,9 @@ def _validate_and_filter_requests(
499499
self.request_accumulated.extend(new_requests[idx + 1:])
500500
break
501501
else:
502-
valid_new_requests.append(req_item)
502+
accepted_new_requests.append(req_item)
503503

504-
# Check beam width validation
505-
for req_item in valid_new_requests:
506-
if req_item.request and hasattr(req_item.request,
507-
'sampling_config'):
508-
assert req_item.request.sampling_config.beam_width == self.max_beam_width, \
509-
f"Request beam width {req_item.request.sampling_config.beam_width} " \
510-
f"is not equal to max_beam_width {self.max_beam_width}. This is not supported!"
511-
512-
return valid_new_requests
504+
return accepted_new_requests
513505

514506
def _balance_requests_across_ranks(
515507
self, new_requests: List[RequestQueueItem],

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1607,6 +1607,16 @@ def _forward_step_inter_pp(self, scheduled_batch) -> SampleState:
16071607
)
16081608

16091609
def _validate_request(self, request: LlmRequest):
1610+
# Validate beam width
1611+
sampling_config = request.sampling_config
1612+
if sampling_config is not None:
1613+
if sampling_config.beam_width != self.max_beam_width:
1614+
raise ValueError(
1615+
f"Request beam width {sampling_config.beam_width} "
1616+
f"is not equal to max_beam_width {self.max_beam_width}. This is not supported!"
1617+
)
1618+
1619+
# Check token ID ranges
16101620
if isinstance(self.model_engine.model, DecoderModelForCausalLM):
16111621
# Only skip token‐range checks for Llama4 when the request has multimodal data
16121622
from ..models.modeling_llama import Llama4ForConditionalGeneration

tests/unittest/_torch/executor/test_executor_request_queue.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -475,8 +475,8 @@ def test_get_from_waiting_queue_edge_cases(executor_queue, queue_size,
475475
assert len(executor_queue.waiting_queue) == expected_remaining
476476

477477

478-
def test_validate_and_filter_requests(executor_queue):
479-
"""Test request validation and filtering."""
478+
def test_handle_special_queue_items(executor_queue):
479+
"""Test special queue item handling."""
480480
# Create a mock request without sampling_config to avoid beam validation
481481
mock_request = Mock()
482482
delattr(mock_request, 'sampling_config') if hasattr(
@@ -488,7 +488,7 @@ def test_validate_and_filter_requests(executor_queue):
488488

489489
requests = [normal_req, cancel_req, shutdown_req]
490490

491-
valid_requests = executor_queue._validate_and_filter_requests(requests)
491+
valid_requests = executor_queue._handle_special_queue_items(requests)
492492

493493
assert len(valid_requests) == 1
494494
assert valid_requests[0] == normal_req

tests/unittest/_torch/sampler/test_beam_search.py

Lines changed: 129 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import pytest
1919
import torch
2020
from transformers.configuration_utils import PretrainedConfig
21+
from utils.llm_data import llm_models_root
22+
from utils.util import force_ampere
2123

2224
from tensorrt_llm import LLM, SamplingParams
2325
from tensorrt_llm._torch.attention_backend.interface import AttentionMetadata
@@ -31,6 +33,7 @@
3133
from tensorrt_llm._torch.models.modeling_utils import (
3234
ModelConfig, register_auto_model, register_checkpoint_weight_loader,
3335
register_config_loader)
36+
from tensorrt_llm.executor import RequestError
3437
from tensorrt_llm.executor.result import CompletionOutput, GenerationResult
3538
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig
3639

@@ -263,11 +266,21 @@ def fixed_params():
263266

264267

265268
@pytest.fixture(scope="module")
266-
def llm(fixed_params, input_prompts):
269+
def model_kwargs(fixed_params) -> dict[str, Any]:
267270
assert fixed_params[
268271
"max_beam_width"] == 2, "This test only works for a beam width of 2"
269-
return LLM(
272+
return dict(
270273
model=_pl.Path("dummy_path"),
274+
checkpoint_loader=HfCheckpointLoader(
275+
weight_loader=DummyWeightLoader(),
276+
config_loader=DummyConfigLoader(),
277+
),
278+
)
279+
280+
281+
def _build_llm(fixed_params, input_prompts, model_kwargs):
282+
return LLM(
283+
**model_kwargs,
271284
kv_cache_config=KvCacheConfig(max_tokens=10000),
272285
max_batch_size=fixed_params["max_beam_width"] * len(
273286
input_prompts
@@ -276,16 +289,18 @@ def llm(fixed_params, input_prompts):
276289
max_beam_width=fixed_params["max_beam_width"],
277290
disable_overlap_scheduler=True,
278291
cuda_graph_config=None,
279-
checkpoint_loader=HfCheckpointLoader(weight_loader=DummyWeightLoader(),
280-
config_loader=DummyConfigLoader()))
292+
)
281293

282294

283295
@pytest.fixture(scope="module")
284-
def llm_cuda_graph(fixed_params, input_prompts):
285-
assert fixed_params[
286-
"max_beam_width"] == 2, "This test only works for a beam width of 2"
296+
def llm(fixed_params, input_prompts, model_kwargs):
297+
return _build_llm(fixed_params, input_prompts, model_kwargs)
298+
299+
300+
@pytest.fixture(scope="module")
301+
def llm_cuda_graph(fixed_params, input_prompts, model_kwargs):
287302
return LLM(
288-
model=_pl.Path("dummy_path"),
303+
**model_kwargs,
289304
kv_cache_config=KvCacheConfig(max_tokens=10000),
290305
max_batch_size=fixed_params["max_beam_width"] * len(
291306
input_prompts
@@ -295,8 +310,7 @@ def llm_cuda_graph(fixed_params, input_prompts):
295310
disable_overlap_scheduler=False,
296311
cuda_graph_config=CudaGraphConfig(batch_sizes=[1, 2, 4, 8],
297312
enable_padding=True),
298-
checkpoint_loader=HfCheckpointLoader(weight_loader=DummyWeightLoader(),
299-
config_loader=DummyConfigLoader()))
313+
)
300314

301315

302316
def check_generation_logits(beam: CompletionOutput,
@@ -473,5 +487,110 @@ def test_beam_search_output_shapes_cuda_graph_and_overlap(
473487
sampling_params)
474488

475489

490+
@force_ampere # Save H100 resource
491+
class TestParameterValidation:
492+
"""Ensure that unsupported request parameters do not crash/hang the engine."""
493+
494+
@pytest.fixture(scope="module")
495+
@staticmethod
496+
def fixed_params():
497+
return {"max_tokens": 8, "max_beam_width": 4}
498+
499+
@pytest.fixture(scope="module")
500+
@staticmethod
501+
def model_kwargs() -> dict[str, Any]:
502+
root = llm_models_root()
503+
assert root is not None
504+
return dict(model=root / "llama-models-v2" /
505+
"TinyLlama-1.1B-Chat-v1.0", )
506+
507+
# NB: Class-level fixture overrides do not work without this
508+
@pytest.fixture(scope="module")
509+
@staticmethod
510+
def llm(fixed_params, input_prompts, model_kwargs):
511+
return _build_llm(fixed_params, input_prompts, model_kwargs)
512+
513+
def _check_engine_responds(self, llm: LLM, input_prompts: list[str],
514+
fixed_params: dict):
515+
_ = llm.generate(input_prompts,
516+
sampling_params=SamplingParams(
517+
max_tokens=fixed_params["max_tokens"],
518+
n=1,
519+
best_of=fixed_params["max_beam_width"],
520+
use_beam_search=True,
521+
end_id=-1,
522+
))
523+
524+
@pytest.mark.timeout(120)
525+
@pytest.mark.threadleak(enabled=False)
526+
def test_use_beam_search_false(
527+
self,
528+
llm: LLM,
529+
input_prompts: list[str],
530+
fixed_params: dict,
531+
):
532+
assert fixed_params["max_beam_width"] > 2
533+
with pytest.raises(
534+
ValueError,
535+
match=
536+
".*Greedy decoding in the LLM API does not allow multiple returns.*"
537+
):
538+
_ = llm.generate(input_prompts,
539+
sampling_params=SamplingParams(
540+
max_tokens=fixed_params["max_tokens"],
541+
n=1,
542+
best_of=fixed_params["max_beam_width"],
543+
use_beam_search=False,
544+
end_id=-1,
545+
))
546+
self._check_engine_responds(llm, input_prompts, fixed_params)
547+
548+
@pytest.mark.timeout(120)
549+
@pytest.mark.threadleak(enabled=False)
550+
def test_use_beam_search_omitted(
551+
self,
552+
llm: LLM,
553+
input_prompts: list[str],
554+
fixed_params: dict,
555+
):
556+
assert fixed_params["max_beam_width"] > 2
557+
with pytest.raises(
558+
ValueError,
559+
match=
560+
".*Greedy decoding in the LLM API does not allow multiple returns.*"
561+
):
562+
_ = llm.generate(input_prompts,
563+
sampling_params=SamplingParams(
564+
max_tokens=fixed_params["max_tokens"],
565+
n=1,
566+
best_of=fixed_params["max_beam_width"],
567+
end_id=-1,
568+
))
569+
self._check_engine_responds(llm, input_prompts, fixed_params)
570+
571+
@pytest.mark.timeout(120)
572+
@pytest.mark.threadleak(enabled=False)
573+
def test_smaller_beam_width(
574+
self,
575+
llm: LLM,
576+
input_prompts: list[str],
577+
fixed_params: dict,
578+
):
579+
assert fixed_params["max_beam_width"] > 2
580+
with pytest.raises(
581+
RequestError,
582+
match=".*Request beam width 2 is not equal to max_beam_width 4.*"
583+
):
584+
_ = llm.generate(input_prompts,
585+
sampling_params=SamplingParams(
586+
max_tokens=fixed_params["max_tokens"],
587+
n=1,
588+
best_of=2,
589+
use_beam_search=True,
590+
end_id=-1,
591+
))
592+
self._check_engine_responds(llm, input_prompts, fixed_params)
593+
594+
476595
if __name__ == "__main__":
477596
pytest.main([__file__])

0 commit comments

Comments
 (0)