Skip to content

Commit 0c0fdae

Browse files
authored
[Hardware/NVIDIA/Kernel] Enable nvidia/DeepSeek-R1-FP4 Model (#16362)
1 parent 3b602cd commit 0c0fdae

File tree

16 files changed

+1994
-112
lines changed

16 files changed

+1994
-112
lines changed

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
288288
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
289289
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
290290
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
291+
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
291292
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
292293
"csrc/cutlass_extensions/common.cpp"
293294
"csrc/attention/mla/cutlass_mla_entry.cu")
@@ -495,7 +496,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
495496
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
496497
set(SRCS
497498
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
498-
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
499+
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
500+
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
501+
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
499502
set_gencode_flags_for_srcs(
500503
SRCS "${SRCS}"
501504
CUDA_ARCHS "${FP4_ARCHS}")
@@ -533,7 +536,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
533536
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
534537
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
535538
# to compile MoE kernels that use its output.
536-
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
539+
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
537540
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
538541
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
539542
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")

0 commit comments

Comments
 (0)