@@ -86,17 +86,15 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     enable_chunked_prefill = params.get("enable_chunked_prefill", False)
 
     kv_cache_dtype = "auto"
-    cuda_graph_batch_sizes = None
     if extra_llm_api_options:
         with open(extra_llm_api_options, 'r') as f:
             llm_args_dict = yaml.safe_load(f)
-            if "kv_cache_dtype" in llm_args_dict:
-                kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
-            if "cuda_graph_batch_sizes" in llm_args_dict:
-                cuda_graph_batch_sizes = llm_args_dict["cuda_graph_batch_sizes"]
 
-            enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
-                                                       enable_chunked_prefill)
+        if "kv_cache_dtype" in llm_args_dict:
+            kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
+
+        enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
+                                                   enable_chunked_prefill)
 
     world_config = {
         "pp_size": params.get("pp"),
@@ -152,17 +150,17 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         # Expecting this to be the max of chunk block and max_num_tokens.
         pass
 
+    cuda_graph_config = {
+        "padding_enabled": True,
+        "max_batch_size": max_batch_size
+    }
+
     pyt_options = {
-        "cuda_graph_config": {
-            "padding_enabled":
-            True,
-            "max_batch_size":
-            max_batch_size if cuda_graph_batch_sizes is None else 0,
-        },
+        "cuda_graph_config": cuda_graph_config,
         "kv_cache_dtype": kv_cache_dtype,
     }
-    backend = params.get("backend", "pytorch")
 
+    backend = params.get("backend", "pytorch")
     return {
         "sw_version": version("tensorrt_llm"),
         "model_path": model_path,
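For context, a minimal sketch of how the reshuffled option handling behaves with a sample `extra_llm_api_options` file. The key names and dict shapes come from the diff above; the YAML contents, the `fp8` value, and the `max_batch_size = 256` placeholder are made up for illustration.

```python
import yaml

# Hypothetical extra_llm_api_options contents; only the keys read by
# get_settings in the diff are shown, and the values are illustrative.
example_options = """
kv_cache_dtype: fp8
enable_chunked_prefill: true
"""

llm_args_dict = yaml.safe_load(example_options)

# Same defaults as in get_settings before the YAML overrides are applied.
kv_cache_dtype = "auto"
enable_chunked_prefill = False

if "kv_cache_dtype" in llm_args_dict:
    kv_cache_dtype = llm_args_dict["kv_cache_dtype"]

enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
                                           enable_chunked_prefill)

# cuda_graph_config is now built once up front and referenced from
# pyt_options; 256 stands in for the max_batch_size computed earlier.
max_batch_size = 256
cuda_graph_config = {"padding_enabled": True, "max_batch_size": max_batch_size}
pyt_options = {
    "cuda_graph_config": cuda_graph_config,
    "kv_cache_dtype": kv_cache_dtype,
}
print(pyt_options)
# {'cuda_graph_config': {'padding_enabled': True, 'max_batch_size': 256},
#  'kv_cache_dtype': 'fp8'}
```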