
Commit d555fe2

ruodil and LarryXFly authored
test: fix for perf test script issue (NVIDIA#4230)
fix for perf test script issue

Signed-off-by: Ruodi <[email protected]>
Co-authored-by: Larry <[email protected]>
1 parent 0cebc16 commit d555fe2

File tree

4 files changed: +17, -17 lines


tests/integration/defs/perf/model_yaml_config.py

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ def get_model_yaml_config(model_label: str) -> dict:
             'print_iter_log': True,
             'use_cuda_graph': True,
             'cuda_graph_padding_enabled': True,
-            'cuda_graph_batch_sizes': [1, 512],
             'cuda_graph_max_batch_size': 4096,
         }
     }
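For reference, the config fragment now relies on cuda_graph_max_batch_size alone. A minimal sketch of the dict after this change (nesting and indentation are assumed from the diff context; the keys themselves come from the hunk above):

# Sketch of the config fragment after this deletion: CUDA graph capture
# sizes are now derived from 'cuda_graph_max_batch_size' rather than an
# explicit 'cuda_graph_batch_sizes' list.
config = {
    'print_iter_log': True,
    'use_cuda_graph': True,
    'cuda_graph_padding_enabled': True,
    'cuda_graph_max_batch_size': 4096,
}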

tests/integration/defs/perf/test_perf.py

Lines changed: 9 additions & 7 deletions
@@ -608,8 +608,8 @@ def validate(self):
         # Validate quantization mode.
         if self.model_name in MODEL_PATH_DICT.keys():
             VALID_QUANTS = [
-                "", "nvfp4", "fp8", "int8_sq", "int4_awq", "w4a8_awq",
-                "w4a16_awq", "int8_wo", "int4_wo", "full_prec"
+                "", "nvfp4", "fp8", "int8", "int4_awq", "w4a8_awq", "w4a16_awq",
+                "int4_wo", "full_prec"
             ]
         else:
             VALID_QUANTS = [
@@ -625,6 +625,8 @@ def validate(self):
                 "int4_weight_only_gptq",
             ]
         assert self.quantization in VALID_QUANTS, f"Invalid quantization {self.quantization}!"
+        if self.backend == "pytorch":
+            assert self.quantization == "", f"Not support passing quantization {self.quantization} for pytorch backend!"
         assert self.num_beams >= 1, f"Invalid num_beams: {self.num_beams}!"
         assert self.num_loras >= 0, f"Invalid num_loras: {self.num_loras}!"
         assert self.num_reqs >= 1, f"Invalid num_reqs: {self.num_reqs}!"
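Taken out of the class, the tightened check reduces to the sketch below. VALID_QUANTS is the MODEL_PATH_DICT branch from the first hunk; the standalone function wrapper is only for illustration, as the real code lives in the validate() method shown above and reads self.quantization and self.backend:

# Standalone sketch of the validation logic added in this hunk.
VALID_QUANTS = [
    "", "nvfp4", "fp8", "int8", "int4_awq", "w4a8_awq", "w4a16_awq",
    "int4_wo", "full_prec"
]

def validate_quantization(quantization: str, backend: str) -> None:
    assert quantization in VALID_QUANTS, f"Invalid quantization {quantization}!"
    # New rule from this commit: the pytorch backend accepts no explicit
    # quantization mode, so anything other than "" is rejected.
    if backend == "pytorch":
        assert quantization == "", (
            f"Not support passing quantization {quantization} for pytorch backend!")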
@@ -803,8 +805,8 @@ def get_convert_weights_command(self, model_dir, engine_dir) -> str:
         if self._config.quantization != "":
             command, checkpoint_dir = quantize_data(
                 llm_venv=None,
-                example_root=os.path.join(get_llm_root(), "examples",
-                                          example_name),
+                example_root=os.path.join(get_llm_root(), "examples", "models",
+                                          "core", example_name),
                 model_dir=model_dir,
                 calib_dataset=os.path.join(llm_models_root(), "datasets",
                                            "cnn_dailymail"),
@@ -816,8 +818,8 @@ def get_convert_weights_command(self, model_dir, engine_dir) -> str:
         else:
             command, checkpoint_dir = convert_weights(
                 llm_venv=None,
-                example_root=os.path.join(get_llm_root(), "examples",
-                                          example_name),
+                example_root=os.path.join(get_llm_root(), "examples", "models",
+                                          "core", example_name),
                 cmodel_dir=engine_dir,
                 model=self._config.model_name,
                 model_path=model_dir,
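Both call sites above make the same change: example roots move from examples/<example_name> to examples/models/core/<example_name>. A tiny illustration of the path construction (the helper is hypothetical; get_llm_root() and example_name are the names used in the diff):

import os

# Hypothetical helper mirroring the new path layout; the diff inlines this
# expression at both the quantize_data and convert_weights call sites.
def example_root(llm_root: str, example_name: str) -> str:
    # old layout: <llm_root>/examples/<example_name>
    return os.path.join(llm_root, "examples", "models", "core", example_name)

# Path components here are illustrative only:
print(example_root("/code/tensorrt_llm", "llama"))
# -> /code/tensorrt_llm/examples/models/core/llama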
@@ -1393,7 +1395,7 @@ def run_metrics(self, llm_venv, gpu_clock_lock, session_data_writer,
         Run through the commands and parse multiple perf metrics from the logs.
         """
         #print info to separate cases
-        print_info(f"Running perf test for case: {self._short_test_name_body}")
+        print_info(f"Running perf test for case: {self._short_test_name}")
         self._current_cmd_idx = 0
         metrics = self._get_metrics()
         outputs = {}

tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml

Lines changed: 3 additions & 4 deletions
@@ -64,8 +64,7 @@ trt_llm_release_perf_sanity_test:
    tests:
    - perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
    - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
-   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_sq-gpus:2]
-   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_wo-gpus:2]
+   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2]
    - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpu:2]
    - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
    - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
@@ -101,8 +100,8 @@ trt_llm_release_perf_sanity_test:
        gte: 4
    tests:
    - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
-   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
-   - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
+   - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128-gpus:4]
+   - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]

    # Tests for systems with 8+ GPUs
    - condition:
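The bracketed IDs in these lists encode one perf case as dash-joined fields: model, runtime/backend, dtype, then key:value options such as quant: and gpus:. A hypothetical helper (not in the repo, inferred from the names above) shows the composition:

# Hypothetical composer for the test IDs used in these YAML lists.
def perf_test_id(*fields: str) -> str:
    return f"perf/test_perf.py::test_perf[{'-'.join(fields)}]"

print(perf_test_id("llama_v3.1_8b_instruct", "bench", "bfloat16",
                   "input_output_len:128,128", "quant:int8", "gpus:2"))
# -> perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2]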

tests/integration/test_lists/qa/trt_llm_release_perf_test.yml

Lines changed: 5 additions & 5 deletions
@@ -64,7 +64,7 @@ trt_llm_release_perf_test:
    - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2]
    - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2]
    - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2]
-   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_sq-gpus:2]
+   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2]
    - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_wo-gpus:2]
    - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-gpus:2]
    - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
@@ -96,9 +96,10 @@ trt_llm_release_perf_test:
    - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:128,128-gpus:4]
    - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:128,128-gpus:4]
    - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,32-gpus:4]
-   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
-   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:512,32-gpus:4]
-   - perf/test_perf.py::test_perf[starcoder_15b-bench-pytorch-float16-input_output_len:512,200-gpus:4]
+   - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4]
+   - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4]
+   - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
+   - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]

    - condition:
      ranges:
@@ -197,7 +198,6 @@ trt_llm_release_perf_test:
    - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4]
    - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:4]
    - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4]
-   - perf/test_perf.py::test_perf[starcoder_15b-bench-pytorch-float16-input_output_len:512,512-quant:fp8-tp:4]

    - condition:
      terms:
