@@ -288,6 +288,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
288
288
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
289
289
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
290
290
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
291
+ "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
291
292
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
292
293
"csrc/cutlass_extensions/common.cpp"
293
294
"csrc/attention/mla/cutlass_mla_entry.cu" )
@@ -495,7 +496,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
495
496
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
496
497
set (SRCS
497
498
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
498
- "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" )
499
+ "csrc/quantization/fp4/nvfp4_experts_quant.cu"
500
+ "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
501
+ "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" )
499
502
set_gencode_flags_for_srcs(
500
503
SRCS "${SRCS} "
501
504
CUDA_ARCHS "${FP4_ARCHS} " )
@@ -533,7 +536,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
533
536
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
534
537
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
535
538
# to compile MoE kernels that use its output.
536
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS} " )
539
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a " "${CUDA_ARCHS} " )
537
540
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
538
541
set (SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
539
542
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu" )
0 commit comments