
Commit 3e1a9b5

mgoin authored and npanpaliya committed
[CI] Initial tests for SM100 Blackwell runner (vllm-project#21877)
Signed-off-by: mgoin <[email protected]>
1 parent: c4364fb · commit: 3e1a9b5

File tree: 3 files changed (+30 −14 lines)

.buildkite/test-pipeline.yaml

Lines changed: 21 additions & 3 deletions
@@ -647,13 +647,31 @@ steps:
 - label: Blackwell Test
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true
+  # optional: true
   source_file_dependencies:
-  - csrc/
-  - vllm/
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/fusion.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
+  # Attention
+  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py
+  - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+  # Quantization
+  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+  # Fusion
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
 
 ##### 1 GPU test #####
 ##### multi gpus test #####

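Note: the pipeline pins these kernel suites to a b200 agent (gpu: b200) rather than guarding them inside the tests themselves. For local runs on mixed hardware, a compute-capability skip gives a similar effect. The sketch below is illustrative only, not a helper from the vLLM tree; it assumes torch and pytest are installed and that SM100 parts report compute capability 10.x.

# Illustrative sketch (hypothetical test, not part of this commit): skip unless
# the visible GPU is an SM100 (Blackwell, e.g. B200) device.
import pytest
import torch


def is_sm100() -> bool:
    # get_device_capability() returns (major, minor); Blackwell SM100 is 10.x.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major == 10


@pytest.mark.skipif(not is_sm100(), reason="requires an SM100 (B200) GPU")
def test_requires_blackwell():
    # Placeholder body; the real suites above exercise FP4, MLA and FlashInfer kernels.
    assert torch.cuda.get_device_capability()[0] == 10
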
tests/compile/test_fusion_all_reduce.py

Lines changed: 9 additions & 6 deletions
@@ -136,12 +136,15 @@ def ops_in_model_before(self):
 
 
 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("test_model", [
-    TestAllReduceRMSNormModel,
-    TestAllReduceFusedAddRMSNormModel,
-    TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
-    TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
-])
+@pytest.mark.parametrize(
+    "test_model",
+    [
+        TestAllReduceRMSNormModel,
+        TestAllReduceFusedAddRMSNormModel,
+        TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
+        # TODO: Enable with torch==2.8.0
+        # TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
+    ])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [8])
 @pytest.mark.parametrize("hidden_size", [16])

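Note on the reformatted decorator: stacked pytest.mark.parametrize decorators expand into the cross-product of their argument lists, so commenting out TestAllReduceFusedAddRMSNormStaticQuantFP4Model (pending torch==2.8.0) simply drops that slice of the matrix. A minimal, self-contained illustration with placeholder classes (not the real models from this file):

# Illustrative only: ModelA/ModelB are placeholders, not the TestAllReduce* models.
import pytest


class ModelA:
    pass


class ModelB:
    pass


@pytest.mark.parametrize(
    "test_model",
    [
        ModelA,
        ModelB,
        # A temporarily disabled case can be commented out here, as the FP4 model is above.
    ])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [8])
def test_matrix(test_model, batch_size, seq_len):
    # pytest collects 2 models x 1 batch_size x 1 seq_len = 2 cases.
    assert batch_size == 8 and seq_len == 8
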
tests/kernels/quantization/test_cutlass_scaled_mm.py

Lines changed: 0 additions & 5 deletions
@@ -559,8 +559,6 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
     m_a_scales = m_g if per_act_token else 1
     n_b_scales = n_g if per_out_ch else 1
 
-    print("shape:", m_g, n_g, k_g)
-
     # Create group-specific A and B (FP8) and output (FP16/FP32)
     a_g = to_fp8(torch.randn((m_g, k_g), device=device))
     b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())

@@ -639,7 +637,4 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
     for g in range(num_experts):
         baseline = baseline_tensors[g]
         c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
-        print(baseline)
-        print(c)
-        print("*")
         torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)

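Note: the deleted lines were debug prints; torch.testing.assert_close already reports the mismatch count and the largest absolute/relative differences on failure, so the prints added noise without information. A small self-contained example of the same comparison pattern (tensor values are made up):

# Illustrative only: made-up tensors, same tolerances as the test above.
import torch

baseline = torch.tensor([1.000, 2.000, 3.000])
c = torch.tensor([1.001, 2.001, 3.001])

# Passes: each element is within atol=5e-4 + rtol=1e-2 * |baseline| of its target.
torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)

# On mismatch, assert_close raises AssertionError listing the number of differing
# elements and the greatest absolute and relative differences.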