[None][chore] set the default value of max_num_tokens explicitly (#8208)

QiJune · web-flow · commit 616d1df7a09e · 2025-10-14T23:03:02.000-07:00
Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -248,9 +248,6 @@ def create_py_executor(
         max_batch_size,
     ) = llm_args.get_runtime_sizes()
 
-    if max_num_tokens is None:
-        max_num_tokens = 8192
-
     tokens_per_block = kv_cache_config.tokens_per_block
     if pytorch_backend_config.attn_backend == "VANILLA":
         tokens_per_block = max_num_tokens
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
@@ -1595,7 +1595,7 @@ class BaseLlmArgs(StrictBaseModel):
                                           description="The maximum beam width.")
 
     max_num_tokens: Optional[int] = Field(
-        default=None, description="The maximum number of tokens.")
+        default=8192, description="The maximum number of tokens.")
 
     gather_generation_logits: bool = Field(
         default=False,
@@ -1894,13 +1894,15 @@ def validate_build_config_with_runtime_params(self):
 
         if self.max_batch_size is not None:
             if self.max_batch_size > self.build_config.max_batch_size:
-                raise ValueError(
-                    f"max_batch_size [{self.max_batch_size}] is greater than build_config.max_batch_size [{self.build_config.max_batch_size}] in build_config"
+                self.max_batch_size = self.build_config.max_batch_size
+                logger.warning(
+                    f"max_batch_size [{self.max_batch_size}] is overridden by build_config.max_batch_size [{self.build_config.max_batch_size}] in build_config"
                 )
         if self.max_num_tokens is not None:
             if self.max_num_tokens > self.build_config.max_num_tokens:
-                raise ValueError(
-                    f"max_num_tokens [{self.max_num_tokens}] is greater than build_config.max_num_tokens [{self.build_config.max_num_tokens}] in build_config"
+                self.max_num_tokens = self.build_config.max_num_tokens
+                logger.warning(
+                    f"max_num_tokens [{self.max_num_tokens}] is overridden by build_config.max_num_tokens [{self.build_config.max_num_tokens}] in build_config"
                 )
         if self.max_seq_len is not None:
             if self.max_seq_len != self.build_config.max_seq_len:
diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml
@@ -76,7 +76,7 @@ methods:
         default: null
       max_num_tokens:
         annotation: Optional[int]
-        default: null
+        default: 8192
       # Misc
       load_format:
         annotation: Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat]