
Commit 4d0e462

ruodil and LarryXFly authored
tests: skip writing prepare_dataset output to logs, and add llama_v3.1_8b_fp8, llama_v3.3_70b_fp8, llama_v3.1_405b_fp4 models (NVIDIA#3864)
* tests: skip writing prepare_dataset output to logs

  Signed-off-by: Ruodi <[email protected]>

* test: add llama_v3.1_8b_fp8 model, llama_v3.1_405b model and llama_nemotron_49b model in perf test, and modify original llama models dtype from float16 to bfloat16 according to README.md

  Signed-off-by: Ruodi <[email protected]>

---------

Signed-off-by: Ruodi <[email protected]>
Signed-off-by: Larry <[email protected]>
Co-authored-by: Larry <[email protected]>
1 parent 0446270 commit 4d0e462

File tree

6 files changed: +110 -84 lines changed

tests/integration/defs/perf/test_perf.py

Lines changed: 10 additions & 4 deletions
@@ -44,16 +44,22 @@
     "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf", # not safetensors repo
     "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
     "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
+    "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
     "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
+    "llama_v3.3_70b_instruct_fp8":
+    "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
+    "llama_v3.1_405b_instruct_fp4":
+    "llm-models/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
     "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
     "llama_v3.2_11b": "llama-3.2-models/Llama-3.2-11B-Vision",
+    "llama_v3.3_nemotron_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1/",
     "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
     # "llama_30b": "llama-models/llama-30b-hf",
     "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
     "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
     "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1",
     "mistral_7b_v0.1": "mistral-7b-v0.1",
-    "deepseek_r1": "DeepSeek-R1/DeepSeek-R1",
+    "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
     "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
     "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
@@ -596,7 +602,7 @@ def validate(self):
         assert self.mode in VALID_MODES, f"Invalid mode {self.mode}!"

         # Validate dtype.
-        VALID_DTYPES = ["float32", "float16", "bfloat16"]
+        VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"]
         assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!"

         # Validate quantization mode.
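
The widened VALID_DTYPES list lets test names carry float8 and float4 tokens for pre-quantized checkpoints (e.g. llama_v3.3_70b_instruct_fp8) instead of pairing float16 with a -quant: flag. A trimmed, self-contained sketch of the check; the class name and constructor are stand-ins, only the two lines inside validate() come from the diff:

class PerfConfigSketch:
    def __init__(self, data_type="bfloat16"):
        self.data_type = data_type

    def validate(self):
        # The two lines below are verbatim from the diff; the surrounding
        # class is a stand-in for the real config object in test_perf.py.
        VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"]
        assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!"

PerfConfigSketch("float8").validate()  # accepted after this change
PerfConfigSketch("float4").validate()  # accepted after this change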
@@ -978,8 +984,8 @@ def get_prepare_data_command(self, engine_dir, input_len,
         nloras = self._config.num_loras
         lora_data = os.path.join(engine_dir,
                                  f"token-norm-dist-lora-{nloras}.json")
-        with open(lora_data, 'w') as file:
-            pass
+        # with open(lora_data, 'w') as file:
+        #     pass
         data_cmd += [
             "python3", prepare_data_script, f"--output={lora_data}",
             f"--rand-task-id 0 {nloras-1}", f"--tokenizer={tokenizer_dir}",

tests/integration/defs/perf/utils.py

Lines changed: 3 additions & 1 deletion
@@ -425,7 +425,9 @@ def run_ex(self,
                 print(collect_and_clean_myelin_time(output))

             # Print the output log to stdout and cache it.
-            print(buf.getvalue())
+            # skip the output log for prepare dataset command
+            if 'prepare_dataset' not in commands.get_cmd_str(cmd_idx):
+                print(buf.getvalue())
             outputs[cmd_idx] = buf.getvalue()
         else:
             print_info(f"Reusing cached logs for command index {cmd_idx}.")
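
The new guard keeps the verbose dataset-generation output out of the printed log while still caching it in outputs[cmd_idx] for later steps. A self-contained sketch of the pattern, assuming a simplified stand-in for the real command container in utils.py:

import io

class CommandListSketch:
    """Stand-in for the command container that run_ex() consults."""
    def __init__(self, cmds):
        self._cmds = cmds

    def get_cmd_str(self, idx):
        return self._cmds[idx]

commands = CommandListSketch([
    "python3 prepare_dataset.py --output=token-norm-dist.json",
    "trtllm-bench throughput",
])
outputs = {}
for cmd_idx in range(2):
    buf = io.StringIO()
    buf.write(f"captured output of command {cmd_idx}\n")
    # Mirror of the new guard: print everything except prepare_dataset noise,
    # but always cache the log so later code can still inspect it.
    if 'prepare_dataset' not in commands.get_cmd_str(cmd_idx):
        print(buf.getvalue())
    outputs[cmd_idx] = buf.getvalue()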

tests/integration/test_lists/qa/trt_llm_release_perf_cluster_test.yml

Lines changed: 24 additions & 22 deletions
@@ -5,16 +5,23 @@ trt_llm_release_perf_cluster_test:
       system_gpu_count:
         gte: 1
   tests:
-  - perf/test_perf.py::test_perf[llama_v3_8b_instruct-bench-float16-input_output_len:128,128]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:128,128-quant:fp8]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:512,32-quant:fp8]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:nvfp4-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:256-input_output_len:128,128-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:256-input_output_len:512,32-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3_8b_instruct-bench-bfloat16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8]
   - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8]
   - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 2
+  tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]

 # Tests for systems with 4+ GPUs
@@ -27,28 +34,23 @@ trt_llm_release_perf_cluster_test:
   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:512,32-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4]

 # Tests for systems with 8+ GPUs
 - condition:
     ranges:
       system_gpu_count:
         gte: 8
   tests:
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
-
-  # GB chip specific tests with high memory
-  - condition:
-      wildcards:
-        gpu:
-        - '*b100*'
-        linux_distribution_name: '*'
-    tests:
-      - perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:fp8-ep:8-tp:8-gpus:8]
-      - perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-tp:8-gpus:8] #min latency test
-      - perf/test_perf.py::test_perf[deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:fp8-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:512-input_output_len:128,128-quant:nvfp4-ep:8-tp:8-gpus:8]
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float16-maxbs:512-input_output_len:1000,1000-quant:nvfp4-con:4096-ep:8-tp:8-gpus:8]
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8] #min latency test
-      - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:1000,1000-con:4096-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
+  - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
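
Each bracketed test ID in these lists packs the whole perf configuration into dash-separated tokens: the model label, flag tokens such as bench/pytorch/streaming and the dtype, and key:value knobs such as maxbs, input_output_len, quant, and gpus. A rough illustration of how such an ID could be split apart; the parser below is an assumption for readability, not the parsing code test_perf.py actually uses:

def split_perf_test_id(test_id):
    # e.g. "llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8"
    tokens = test_id.split("-")
    parsed = {"model": tokens[0], "flags": [], "options": {}}
    for tok in tokens[1:]:
        if ":" in tok:
            key, value = tok.split(":", 1)
            parsed["options"][key] = value  # e.g. maxbs, gpus, input_output_len
        else:
            parsed["flags"].append(tok)  # e.g. bench, pytorch, streaming, float4
    return parsed

print(split_perf_test_id(
    "llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8"))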

tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml

Lines changed: 18 additions & 15 deletions
@@ -24,7 +24,9 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128+512,32]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]

 # Test list validation
 - test_list_validation.py::test_list_validation
@@ -41,7 +43,7 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-static_batching-plugin_ifb-float16-bs:8+64-input_output_len:128,128+512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.0-input_output_len:128,128+512,32]

-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
   - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-float16-input_output_len:128,128]
@@ -51,8 +53,8 @@ trt_llm_release_perf_sanity_test:
     terms:
       supports_fp8: true
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-quant:fp8]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:512,32-quant:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32-quant:fp8]

 # Tests for systems with 2+ GPUs
 - condition:
@@ -62,10 +64,10 @@ trt_llm_release_perf_sanity_test:
   tests:
   - perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
   - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-quant:int8_sq-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-quant:int8_wo-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-maxbs:256-input_output_len:128,128-gpu:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-float16-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_sq-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8_wo-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpu:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.2_11b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]

 # FP8 tests for systems with 2+ GPUs
@@ -77,7 +79,7 @@ trt_llm_release_perf_sanity_test:
         gte: 2
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128-quant:fp8-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.2_11b-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]

@@ -89,7 +91,7 @@ trt_llm_release_perf_sanity_test:
       gpu_memory:
         gt: 80000
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2]

 # Tests for systems with 4+ GPUs
@@ -98,7 +100,7 @@ trt_llm_release_perf_sanity_test:
       system_gpu_count:
         gte: 4
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
   - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]

@@ -108,8 +110,8 @@ trt_llm_release_perf_sanity_test:
       system_gpu_count:
         gte: 8
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]

 # Tests for systems with 8+ GPUs and high memory
 - condition:
@@ -129,5 +131,6 @@ trt_llm_release_perf_sanity_test:
       system_gpu_count:
         gte: 8
   tests:
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-float16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
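
The condition blocks in these YAML lists gate each test group on machine properties: system_gpu_count ranges, gpu_memory ranges, and terms such as supports_fp8. A hedged sketch of how a gte/gt range could be evaluated; the function name and the dict-based spec are illustrative, not the harness's real scheduler code:

def range_matches(system_value, spec):
    # spec mirrors a YAML range block such as {"gte": 8} or {"gt": 80000}.
    ops = {"gte": lambda v, b: v >= b, "gt": lambda v, b: v > b}
    return all(ops[op](system_value, bound) for op, bound in spec.items())

print(range_matches(8, {"gte": 8}))         # True: an 8-GPU node runs the gte: 8 group
print(range_matches(4, {"gte": 8}))         # False: a 4-GPU node skips it
print(range_matches(96000, {"gt": 80000}))  # True: satisfies the high-memory group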
