Skip to content

Commit a1ed03f

Browse files
authored
[None][fix] AD test_trtllm_bench to use small model config and skip loading weights (#8149)
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
1 parent fdbeea5 commit a1ed03f

File tree

2 files changed

+12
-14
lines changed

2 files changed

+12
-14
lines changed

tests/unittest/_torch/auto_deploy/_utils_test/_model_test_utils.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -480,6 +480,12 @@ def apply_rotary_pos_emb_ds(q, k, cos, sin, position_ids, unsqueeze_dim=1):
480480
"ssm_state_size": 32,
481481
},
482482
},
483+
"TinyLlama/TinyLlama-1.1B-Chat-v1.0": {
484+
"llm_models_subdir": "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
485+
"model_kwargs": {
486+
"num_hidden_layers": 2,
487+
},
488+
},
483489
}
484490

485491

tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -4,20 +4,13 @@
44

55
import pytest
66
import yaml
7-
from _model_test_utils import _hf_model_dir_or_hub_id
7+
from _model_test_utils import get_small_model_config
88
from click.testing import CliRunner
99
from utils.cpp_paths import llm_root # noqa: F401
1010

1111
from tensorrt_llm.commands.bench import main
1212

1313

14-
def tiny_llama_details():
15-
model_path = "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
16-
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
17-
model_path_or_name = _hf_model_dir_or_hub_id(model_path, model_name)
18-
return model_path_or_name, model_name, model_path
19-
20-
2114
def run_benchmark(model_name: str, dataset_path: str, extra_llm_api_options_path: str):
2215
runner = CliRunner()
2316

@@ -74,20 +67,19 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_path_or_name: str):
7467

7568

7669
@pytest.mark.parametrize("compile_backend", ["torch-compile", "torch-opt", "torch-cudagraph"])
77-
def test_trtllm_bench(llm_root, compile_backend): # noqa: F811
78-
model_path_or_name, model_name, model_path = tiny_llama_details()
70+
@pytest.mark.parametrize("model_name", ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"])
71+
def test_trtllm_bench(llm_root, compile_backend, model_name): # noqa: F811
72+
config = get_small_model_config(model_name)
7973
with tempfile.TemporaryDirectory() as temp_dir:
8074
extra_llm_api_options_path = f"{temp_dir}/extra_llm_api_options.yaml"
8175
with open(extra_llm_api_options_path, "w") as f:
8276
yaml.dump(
8377
{
84-
"model_kwargs": {"num_hidden_layers": 2},
85-
"cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
86-
"max_batch_size": 128,
8778
"compile_backend": compile_backend,
79+
**config["args"],
8880
},
8981
f,
9082
)
9183

92-
dataset_path = prepare_dataset(llm_root, temp_dir, model_path_or_name)
84+
dataset_path = prepare_dataset(llm_root, temp_dir, config["args"]["model"])
9385
run_benchmark(model_name, dataset_path, extra_llm_api_options_path)

0 commit comments

Comments (0)