@@ -6,11 +6,12 @@
 
 # isort: off
 from .lora_test_utils import check_llama_7b_multi_unique_lora_adapters_from_request
-from .test_llm import (
-    get_model_path, global_kvcache_config, llama_model_path,
-    llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts,
-    run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler,
-    tinyllama_logits_processor_test_harness, _test_llm_capture_request_error)
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_logits_processor_test_harness)
 from utils.util import (EnvVarsContextManager, force_ampere,
                         run_function_in_sub_process, similar,
                         skip_gpu_memory_less_than_40gb,
@@ -69,10 +70,6 @@ def test_llm_get_stats_async(return_context_logits, use_overlap, |
                                      enable_iter_req_stats=enable_iter_req_stats)
 
 
-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
-
-
 @force_ampere
 @pytest.mark.parametrize(
     "sampling_params",
|