Skip to content

Commit e6073b3

Browse files
ruodil and LarryXFly authored
[None][test] add gpt oss model for trtllm perf test (#7328)
Signed-off-by: Ruodi Lu <[email protected]> Signed-off-by: Ruodi Lu <[email protected]> Co-authored-by: Ruodi Lu <[email protected]> Co-authored-by: Larry <[email protected]>
1 parent 7801d09 commit e6073b3

File tree

4 files changed

+54
-2
lines changed

4 files changed

+54
-2
lines changed

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,46 @@ def get_model_yaml_config(model_label: str,
166166
]
167167
}
168168
}
169+
},
170+
# GPT-OSS 120B max throughput test
171+
{
172+
'patterns': [
173+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256',
174+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512',
175+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024',
176+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096'
177+
],
178+
'config': {
179+
'enable_attention_dp': True,
180+
'cuda_graph_config': {
181+
'enable_padding': True,
182+
'max_batch_size': 720,
183+
},
184+
'moe_config': {
185+
'backend': 'CUTLASS'
186+
},
187+
'stream_interval': 10,
188+
'num_postprocess_workers': 4
189+
}
190+
},
191+
# GPT-OSS 120B min latency test
192+
{
193+
'patterns': [
194+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1',
195+
'gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32'
196+
],
197+
'config': {
198+
'enable_attention_dp': False,
199+
'cuda_graph_config': {
200+
'enable_padding': True,
201+
'max_batch_size': 720,
202+
},
203+
'moe_config': {
204+
'backend': 'TRTLLM'
205+
},
206+
'stream_interval': 10,
207+
'num_postprocess_workers': 4
208+
}
169209
}
170210
]
171211

tests/integration/defs/perf/test_perf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
"bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct",
131131
"bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8",
132132
"mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503",
133+
"gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
133134
}
134135
# Model PATH of HuggingFace
135136
HF_MODEL_PATH = {

tests/integration/test_lists/qa/llm_perf_cluster.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,12 @@ llm_perf_cluster:
144144
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40)
145145
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8]
146146
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8]
147+
#gpt_oss_120b
148+
# max throughput test
149+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8]
150+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8]
151+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120)
152+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180)
153+
# min latency test
154+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8]
155+
- perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8]

tests/integration/test_lists/qa/llm_perf_sanity.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,8 @@ llm_perf_sanity:
172172
#pytorch backend
173173
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
174174
- perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
175-
- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
176-
- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
175+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
176+
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
177177

178178
# FP8 tests for systems with 8+ GPUs
179179
- condition:
@@ -215,6 +215,8 @@ llm_perf_sanity:
215215
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
216216
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:128,128]
217217
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500]
218+
# for chunked prefill cases
219+
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200]
218220
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
219221
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8]
220222
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]

0 commit comments

Comments (0)