From 83a72437e0e3bb568dedb92da12772c99aca3da9 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Tue, 1 Jul 2025 08:50:10 +0000 Subject: [PATCH 01/14] update NVILA-15B-FP16 match keywords --- tests/integration/defs/test_e2e.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 145c069fd93..48c95f636a3 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2012,6 +2012,20 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, ], ], }, + "NVILA-15B-FP16": { + "image": [ + ["stormy", "ocean", "waves", "clouds", "gray", "sky"], + ["rock", "formation", "sunny", "sky", "clouds"], + ["road", "busy", "car", "black", "blue"], + ], + "video": [ + ["woman", "street", "night", "walking", "camera"], + [ + "stunning", "earth", "space", "planet", "curvature", "dark", + "bright", "contrast", "illuminate" + ], + ], + }, "llava-v1.6-mistral-7b": { "image": [ [ From f997eb2cee71093eeb4e820e935c17c2a23bd8df Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Wed, 2 Jul 2025 01:00:33 +0000 Subject: [PATCH 02/14] update perf case - only support pytorch backend --- tests/integration/test_lists/qa/llm_release_digits_perf.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index a216f04c302..d4aef72bff7 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,7 +1,3 @@ -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] 
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] From f038bf87c3d69143f85d53c7fd040561402d355a Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Wed, 2 Jul 2025 01:45:02 +0000 Subject: [PATCH 03/14] add model Mixtral-7B-Instruct-v0.3 --- tests/integration/defs/test_e2e.py | 3 +++ tests/integration/test_lists/qa/llm_release_digits_func.txt | 1 + 2 files changed, 4 insertions(+) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 48c95f636a3..d883beb6412 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1607,6 +1607,9 @@ def test_ptp_quickstart(llm_root, llm_venv): pytest.param('Mixtral-8x7B-BF16', 'Mixtral-8x7B-Instruct-v0.1', marks=skip_pre_blackwell), + pytest.param('Mixtral-7B-Instruct-v0.3', + 'Mistral-7B-Instruct-v0.3', + marks=skip_pre_blackwell), pytest.param('Mistral-Nemo-12b-Base', 'Mistral-Nemo-Base-2407', marks=skip_pre_blackwell), diff --git a/tests/integration/test_lists/qa/llm_release_digits_func.txt b/tests/integration/test_lists/qa/llm_release_digits_func.txt index 00d0bac895e..6ea117597e7 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_func.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_func.txt @@ -12,6 +12,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP8-modelopt-hf-model-hub test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP4-modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4] test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-BF16-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1] 
test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-BF16-Mixtral-8x7B-Instruct-v0.1] +test_e2e.py::test_ptp_quickstart_advanced[Mixtral-7B-Instruct-v0.3-Mixtral-7B-Instruct-v0.3] test_e2e.py::test_ptp_quickstart_advanced[Mistral-Nemo-12b-Base-Mistral-Nemo-Base-2407] test_e2e.py::test_ptp_quickstart_advanced[DeepSeek-R1-Distill-Qwen-32B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B] From 5cd1740428391a409616e8acac8ddf36666269f6 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Wed, 2 Jul 2025 06:55:01 +0000 Subject: [PATCH 04/14] add perf test cases --- tests/integration/defs/perf/test_perf.py | 3 +- .../test_lists/qa/llm_release_digits_perf.txt | 44 ++++++++++++++----- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index cdbeea45971..28c38b25f1c 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -82,6 +82,7 @@ "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1", "mistral_7b_v0.1": "mistral-7b-v0.1", + "mistral_7b_v0.3": "Mistral-7B-Instruct-v0.3", "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", @@ -1379,7 +1380,7 @@ def get_commands(self): data_cmd = self.get_prepare_data_command( engine_dir, input_len, output_len) data_cmds.append(data_cmd) - + print(f"================= data_cmd: {data_cmd}") # Construct MPI command. 
mpi_cmd = [] if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench": diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index d4aef72bff7..6936171a5cc 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,24 +1,44 @@ -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] # passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2]#passed 
+perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1]#passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1]#passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128] +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:128,128]#passed +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:512,32]#passed + +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128] #passed perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32] #passed -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] + +perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] +perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] + +# FP4 cases failed +# ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting. 
+ -perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-input_output_len:128,128] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-input_output_len:128,128] From 063e5860282c0f219d02e3d29b1bbad441bb4be6 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 3 Jul 2025 01:37:16 +0000 Subject: [PATCH 05/14] update func test --- tests/integration/defs/test_e2e.py | 2 +- tests/integration/test_lists/qa/llm_release_digits_func.txt | 2 +- tests/integration/test_lists/qa/llm_release_digits_perf.txt | 4 ---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index d883beb6412..4a8265dbad5 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1607,7 +1607,7 @@ def test_ptp_quickstart(llm_root, llm_venv): pytest.param('Mixtral-8x7B-BF16', 'Mixtral-8x7B-Instruct-v0.1', marks=skip_pre_blackwell), - pytest.param('Mixtral-7B-Instruct-v0.3', + pytest.param('Mistral-7B-Instruct-v0.3', 'Mistral-7B-Instruct-v0.3', marks=skip_pre_blackwell), pytest.param('Mistral-Nemo-12b-Base', diff --git a/tests/integration/test_lists/qa/llm_release_digits_func.txt b/tests/integration/test_lists/qa/llm_release_digits_func.txt index 6ea117597e7..7a0c0611bde 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_func.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_func.txt @@ -12,7 +12,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP8-modelopt-hf-model-hub test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP4-modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4] test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-BF16-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1] test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-BF16-Mixtral-8x7B-Instruct-v0.1] -test_e2e.py::test_ptp_quickstart_advanced[Mixtral-7B-Instruct-v0.3-Mixtral-7B-Instruct-v0.3] 
+test_e2e.py::test_ptp_quickstart_advanced[Mistral-7B-Instruct-v0.3-Mistral-7B-Instruct-v0.3] test_e2e.py::test_ptp_quickstart_advanced[Mistral-Nemo-12b-Base-Mistral-Nemo-Base-2407] test_e2e.py::test_ptp_quickstart_advanced[DeepSeek-R1-Distill-Qwen-32B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B] diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 6936171a5cc..94641a3b128 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -38,7 +38,3 @@ perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4 perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -# FP4 cases failed -# ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting. 
- - From dc95d34afc903ec9378ee12e4f82a25671ac6eed Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 3 Jul 2025 01:59:23 +0000 Subject: [PATCH 06/14] delete blanks --- tests/integration/test_lists/qa/llm_release_digits_perf.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 94641a3b128..1c0649db97b 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -36,5 +36,4 @@ perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4 perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] - +perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] \ No newline at end of file From e6b226f6d549ed5566e5452cd8737e964f386877 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 3 Jul 2025 02:29:42 +0000 Subject: [PATCH 07/14] Fix whitespace and end-of-file issues in test list Signed-off-by: Jenny Liu --- .../test_lists/qa/llm_release_digits_perf.txt | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 1c0649db97b..cef63909bd8 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,39 +1,34 @@ 
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] # passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128] #passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] #passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] #passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] #passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2]#passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1]#passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1]#passed -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128] 
+perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048] - -perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:128,128]#passed -perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:512,32]#passed - +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:128,128] +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:512,32] perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128] #passed +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128] 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32] #passed - +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] - perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] \ No newline at end of file 
+perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] From 30f33971d5da5fd45edc5fbf38ecdf50a375a8b0 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 3 Jul 2025 05:49:24 +0000 Subject: [PATCH 08/14] Update performance test configuration Signed-off-by: Jenny Liu --- tests/integration/defs/perf/test_perf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 28c38b25f1c..7dd0f85d777 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -1380,7 +1380,6 @@ def get_commands(self): data_cmd = self.get_prepare_data_command( engine_dir, input_len, output_len) data_cmds.append(data_cmd) - print(f"================= data_cmd: {data_cmd}") # Construct MPI command. mpi_cmd = [] if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench": From 589085e6f2429ebd536f36511480918a06ff41ce Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 10 Jul 2025 05:38:15 +0000 Subject: [PATCH 09/14] Add performance test configurations with maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1 parameters - Added 12 model variants with different precision configurations - Includes LLaMA 3.1 8B, LLaMA 3.3 Nemotron Super 49B, LLaMA 3.3 70B, Mixtral 8x7B variants - Added fp8, fp4, float16, and bfloat16 precision variants - All configurations use PyTorch backend with specified performance parameters --- .../test_lists/qa/llm_release_digits_perf.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index cef63909bd8..91245e35e96 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,3 +1,19 @@ +# Added configurations with 
the requested parameters (maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1) +# for each model with different precision variants +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] 
+perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,128] @@ -8,7 +24,6 @@ perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-inp perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] @@ -31,4 +46,4 @@ perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4 perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] 
+perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] \ No newline at end of file From aadd70e277b892c7d6a884128ec809e328664f08 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 10 Jul 2025 05:54:23 +0000 Subject: [PATCH 10/14] change perf test order --- .../test_lists/qa/llm_release_digits_perf.txt | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 91245e35e96..55fa2d6d97b 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,33 +1,26 @@ # Added configurations with the requested parameters (maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1) # for each model with different precision variants +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2] 
+perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] 
perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048] perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:128,128] perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:512,32] 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] @@ -35,15 +28,22 @@ perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-flo perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] +perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] +perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] 
+perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] -perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] \ No newline at end of file +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] From b6db2e09dac65eecc1e514d1b1bd52237725729d Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 10 Jul 2025 05:55:54 +0000 Subject: [PATCH 11/14] change perf test order --- tests/integration/test_lists/qa/llm_release_digits_perf.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 55fa2d6d97b..e14c596e595 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ 
-34,13 +34,13 @@ perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-ma perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] From 4cb725a0c04837d83b5781da98897882dcffa75b Mon Sep 17 
00:00:00 2001 From: Jenny Liu Date: Thu, 10 Jul 2025 05:59:19 +0000 Subject: [PATCH 12/14] change perf test order Signed-off-by: Jenny Liu --- tests/integration/test_lists/qa/llm_release_digits_perf.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index e14c596e595..5b8052880e3 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -35,8 +35,8 @@ perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8 perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] From 5d774f5b48a6c8eea447e32c8a7680cd15a39476 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Thu, 10 Jul 2025 07:58:14 +0000 Subject: [PATCH 13/14] delete some bigger input cases Signed-off-by: Jenny 
Liu --- .../test_lists/qa/llm_release_digits_perf.txt | 47 +++++++------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 5b8052880e3..2ee7a0fc9ab 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,5 +1,20 @@ -# Added configurations with the requested parameters (maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1) -# for each model with different precision variants +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] 
+perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,128] @@ -7,43 +22,15 @@ perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bflo perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-reqs:100-con:2] 
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:128,128] perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-input_output_len:512,32] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:512,128] perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] 
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] -perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128] From 2f794ef4d6db9cca38cb0a2efcb48f01db345102 Mon Sep 17 00:00:00 2001 From: Jenny Liu Date: Fri, 11 Jul 2025 07:16:00 +0000 Subject: [PATCH 14/14] waive some cases by bug Signed-off-by: Jenny Liu --- tests/integration/defs/perf/test_perf.py | 2 ++ .../test_lists/qa/llm_release_digits_perf.txt | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 7dd0f85d777..16f9d2776e1 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -138,6 +138,7 @@ "mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1", "flan_t5_base_hf": "google/flan-t5-small", "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", + "nvila_15b": "nvidia/NVILA-15B", } LORA_MODEL_PATH = { "llama_v2_13b": "llama-models-v2/chinese-llama-2-lora-13b", @@ -1381,6 +1382,7 @@ def get_commands(self): engine_dir, input_len, output_len) data_cmds.append(data_cmd) # Construct MPI command. 
+ print(f"data_cmd: {data_cmd}") mpi_cmd = [] if num_gpus > 1 and num_gpus <= 8 and not self._config.runtime == "bench": if cpu_socket_count_gt_1(): diff --git a/tests/integration/test_lists/qa/llm_release_digits_perf.txt b/tests/integration/test_lists/qa/llm_release_digits_perf.txt index 2ee7a0fc9ab..e7292fa8202 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_perf.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_perf.txt @@ -1,10 +1,10 @@ -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] -perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] #passed +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] #passed +perf/test_perf.py::test_perf[mistral_7b_v0.3-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] #passed
+perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5387445) +perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5387445) +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5387445) +perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5387445) perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-maxbs:1-maxnt:320-input_output_len:128,128-reqs:32-con:1] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] @@ -34,3 +34,5 @@ perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-inp perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8] perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] +perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-maxbs:1-maxnt:3000-input_output_len:2048,128-reqs:32-con:1] +perf/test_perf.py::test_perf[nvila_15b-bench-pytorch-bfloat16-input_output_len:128,128]