Add dsr1 and gpt-oss test cases

chenfeiz0326 · chenfeiz0326 · commit 129290db8fe8 · 2025-11-04T00:52:34.000-08:00
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
@@ -2731,7 +2731,6 @@ def launchTestJobs(pipeline, testFilter)
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-trtllm", "perf_sanity_l0_gb200_multi_gpus", 1, 1, 4],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
@@ -686,8 +686,10 @@ def generate_extra_llm_api_config(self) -> str:
             if self.max_draft_len > 0:
                 config_lines.append(f"  max_draft_len: {self.max_draft_len}")
             if self.speculative_model_dir:
+                spec_model_dir = os.path.join(llm_models_root(),
+                                              self.speculative_model_dir)
                 config_lines.append(
-                    f"  speculative_model_dir: {self.speculative_model_dir}")
+                    f"  speculative_model_dir: {spec_model_dir}")
 
         return "\n".join(config_lines)
 
diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml
@@ -3,8 +3,8 @@ perf_sanity_l0_dgx_b200:
 - condition:
     ranges:
       system_gpu_count:
-        gte: 4
-        lte: 4
+        gte: 8
+        lte: 8
     wildcards:
       gpu:
       - '*b200*'
@@ -15,13 +15,13 @@ perf_sanity_l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_dep4,r1_fp4_tep4,r1_fp4_v2_dep4_mtp1,r1_fp4_v2_tep4_mtp3,gpt_oss_dep4,gpt_oss_tep4]
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep4_mtp1,r1_fp8_tep4_mtp3,gpt_oss_fp4_eagle3_tp8]
 
 - condition:
     ranges:
       system_gpu_count:
-        gte: 4
-        lte: 4
+        gte: 8
+        lte: 8
     wildcards:
       gpu:
       - '*b200*'
@@ -32,13 +32,13 @@ perf_sanity_l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_dep4,r1_fp4_tep4,r1_fp4_v2_dep4_mtp1,r1_fp4_v2_tep4_mtp3,gpt_oss_dep4,gpt_oss_tep4]
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep4_mtp1,r1_fp8_tep4_mtp3,gpt_oss_fp4_eagle3_tp8]
 
 - condition:
     ranges:
       system_gpu_count:
-        gte: 8
-        lte: 8
+        gte: 4
+        lte: 4
     wildcards:
       gpu:
       - '*b200*'
@@ -49,13 +49,13 @@ perf_sanity_l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep4_mtp1,r1_fp8_tep4_mtp3]
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1,r1_fp4_v2_tep4_mtp3,gpt_oss_fp4_dep2,gpt_oss_fp4_dep4]
 
 - condition:
     ranges:
       system_gpu_count:
-        gte: 8
-        lte: 8
+        gte: 4
+        lte: 4
     wildcards:
       gpu:
       - '*b200*'
@@ -66,4 +66,4 @@ perf_sanity_l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep4_mtp1,r1_fp8_tep4_mtp3]
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1,r1_fp4_v2_tep4_mtp3,gpt_oss_fp4_dep2,gpt_oss_fp4_dep4]
diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_gb200_multi_gpus.yml
diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml
@@ -1,61 +1,60 @@
 server_configs:
-  - name: "r1_fp4_dep4"
-    model_name: "deepseek_r1_0528_fp4"
-    gpus: 4
-    tp: 4
-    ep: 4
+  - name: "r1_fp8_dep4_mtp1"
+    model_name: "deepseek_r1_0528_fp8"
+    gpus: 8
+    tp: 8
+    ep: 8
     pp: 1
     attention_backend: "TRTLLM"
-    moe_backend: "CUTLASS"
+    moe_backend: "DEEPGEMM"
     enable_attention_dp: true
-    enable_chunked_prefill: false
-    max_num_tokens: 2176
+    batching_wait_iter: 0
+    enable_balance: true
+    timeout_iters: 60
+    max_batch_size: 512
+    max_num_tokens: 2112
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    max_batch_size: 256
+    cuda_graph_max_batch_size: 512
     enable_padding: true
+    spec_decoding_type: "MTP"
+    num_nextn_predict_layers: 1
     client_configs:
-      - name: "con1_iter1_1024_1024"
-        concurrency: 1
-        iterations: 1
-        isl: 1024
-        osl: 1024
-        random_range_ratio: 0.0
-      - name: "con8_iter1_1024_1024"
-        concurrency: 8
-        iterations: 1
+      - name: "con1024_iter10_1k1k"
+        concurrency: 1024
+        iterations: 10
         isl: 1024
         osl: 1024
-        random_range_ratio: 0.0
+        random_range_ratio: 1.0
+        backend: "openai"
+        use_chat_template: true
 
-  - name: "r1_fp4_tep4"
-    model_name: "deepseek_r1_0528_fp4"
-    gpus: 4
-    tp: 4
-    ep: 4
+  - name: "r1_fp8_tep4_mtp3"
+    model_name: "deepseek_r1_0528_fp8"
+    gpus: 8
+    tp: 8
+    ep: 1
     pp: 1
     attention_backend: "TRTLLM"
-    moe_backend: "CUTLASS"
+    moe_backend: "TRTLLM"
     enable_attention_dp: false
-    enable_chunked_prefill: false
-    max_num_tokens: 2176
+    max_batch_size: 32
+    max_num_tokens: 3136
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    max_batch_size: 256
+    cuda_graph_max_batch_size: 32
     enable_padding: true
+    spec_decoding_type: "MTP"
+    num_nextn_predict_layers: 3
     client_configs:
-      - name: "con1_iter1_1024_1024"
-        concurrency: 1
-        iterations: 1
-        isl: 1024
-        osl: 1024
-        random_range_ratio: 0.0
-      - name: "con8_iter1_1024_1024"
-        concurrency: 8
-        iterations: 1
+      - name: "con32_iter10_1k1k"
+        concurrency: 32
+        iterations: 10
         isl: 1024
         osl: 1024
-        random_range_ratio: 0.0
+        random_range_ratio: 1.0
+        backend: "openai"
+        use_chat_template: true
 
   - name: "r1_fp4_v2_dep4_mtp1"
     model_name: "deepseek_r1_0528_fp4_v2"
@@ -114,78 +113,52 @@ server_configs:
         backend: "openai"
         use_chat_template: true
 
-  - name: "r1_fp8_dep4_mtp1"
-    model_name: "deepseek_r1_0528_fp8"
-    gpus: 8
-    tp: 8
-    ep: 8
-    pp: 1
-    attention_backend: "TRTLLM"
-    moe_backend: "DEEPGEMM"
-    enable_attention_dp: true
-    batching_wait_iter: 0
-    enable_balance: true
-    timeout_iters: 60
-    max_batch_size: 512
-    max_num_tokens: 2112
-    kv_cache_dtype: "fp8"
-    free_gpu_memory_fraction: 0.8
-    cuda_graph_max_batch_size: 512
-    enable_padding: true
-    spec_decoding_type: "MTP"
-    num_nextn_predict_layers: 1
-    client_configs:
-      - name: "con1024_iter10_1k1k"
-        concurrency: 1024
-        iterations: 10
-        isl: 1024
-        osl: 1024
-        random_range_ratio: 1.0
-        backend: "openai"
-        use_chat_template: true
-
-  - name: "r1_fp8_tep4_mtp3"
-    model_name: "deepseek_r1_0528_fp8"
+  - name: "gpt_oss_fp4_eagle3_tp8"
+    model_name: "gpt_oss_120b_fp4"
     gpus: 8
     tp: 8
     ep: 1
     pp: 1
     attention_backend: "TRTLLM"
     moe_backend: "TRTLLM"
     enable_attention_dp: false
-    max_batch_size: 32
-    max_num_tokens: 3136
+    max_batch_size: 1
+    max_num_tokens: 20000
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    cuda_graph_max_batch_size: 32
+    cuda_graph_max_batch_size: 1
     enable_padding: true
-    spec_decoding_type: "MTP"
-    num_nextn_predict_layers: 3
+    num_postprocess_workers: 4
+    stream_interval: 20
+    spec_decoding_type: "Eagle"
+    eagle3_layers_to_capture: 1
+    max_draft_len: 3
+    speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3"
     client_configs:
-      - name: "con32_iter10_1k1k"
-        concurrency: 32
-        iterations: 10
+      - name: "con1_iter32_1k1k"
+        concurrency: 1
+        iterations: 32
         isl: 1024
         osl: 1024
-        random_range_ratio: 1.0
+        random_range_ratio: 0.8
         backend: "openai"
         use_chat_template: true
 
-  - name: "gpt_oss_dep4"
+  - name: "gpt_oss_fp4_dep2"
     model_name: "gpt_oss_120b_fp4"
-    gpus: 4
-    tp: 4
-    ep: 4
+    gpus: 2
+    tp: 2
+    ep: 2
     pp: 1
     attention_backend: "TRTLLM"
     moe_backend: "TRTLLM"
     enable_attention_dp: true
     enable_balance: true
-    max_batch_size: 512
+    max_batch_size: 1024
     max_num_tokens: 20000
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    cuda_graph_max_batch_size: 512
+    cuda_graph_max_batch_size: 1024
     enable_padding: true
     num_postprocess_workers: 4
     stream_interval: 20
@@ -199,27 +172,28 @@ server_configs:
         backend: "openai"
         use_chat_template: true
 
-  - name: "gpt_oss_tep4"
+  - name: "gpt_oss_fp4_dep4"
     model_name: "gpt_oss_120b_fp4"
     gpus: 4
     tp: 4
-    ep: 1
+    ep: 4
     pp: 1
     attention_backend: "TRTLLM"
     moe_backend: "TRTLLM"
-    enable_attention_dp: false
-    max_batch_size: 64
+    enable_attention_dp: true
+    enable_balance: true
+    max_batch_size: 512
     max_num_tokens: 20000
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    cuda_graph_max_batch_size: 64
+    cuda_graph_max_batch_size: 512
     enable_padding: true
     num_postprocess_workers: 4
     stream_interval: 20
     client_configs:
-      - name: "con64_iter3_1k1k"
-        concurrency: 64
-        iterations: 3
+      - name: "con2048_iter5_1k1k"
+        concurrency: 2048
+        iterations: 5
         isl: 1024
         osl: 1024
         random_range_ratio: 1.0
diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml

Original file line number	Diff line number	Diff line change
`@@ -2731,7 +2731,6 @@ def launchTestJobs(pipeline, testFilter)`
`2731`	`2731`	`// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],`
`2732`	`2732`	`"GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],`
`2733`	`2733`	`"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],`
`2734`		`- "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-trtllm", "perf_sanity_l0_gb200_multi_gpus", 1, 1, 4],`
`2735`	`2734`	`// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],`
`2736`	`2735`	`]`
`2737`	`2736`	`fullSet += SBSASlurmTestConfigs.keySet()`