@@ -86,6 +86,9 @@ find_package(Torch REQUIRED)
86
86
# Supported NVIDIA architectures.
87
87
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
88
88
if (DEFINED CMAKE_CUDA_COMPILER_VERSION AND
89
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
90
+ set (CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0" )
91
+ elseif (DEFINED CMAKE_CUDA_COMPILER_VERSION AND
89
92
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
90
93
set (CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" )
91
94
else ()
@@ -175,6 +178,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
175
178
# Append nvcc's `--threads` option, built from NVCC_THREADS, to the GPU flags.
# The string must not contain trailing whitespace: it is stored as one list
# element and would otherwise be passed along with the space attached.
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
176
179
endif ()
177
180
181
#
# CUDA toolkits at version 13.0 or newer: add `--compress-mode=size` to the
# GPU compile flags. Nothing is appended for other GPU languages or when the
# CUDA compiler version is not defined.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND DEFINED CMAKE_CUDA_COMPILER_VERSION)
  if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0)
    list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
  endif()
endif()
189
+
178
190
#
179
191
# Set CUDA include flags for CXX compiler.
180
192
#
@@ -270,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
270
282
# Cache option: enable only CUTLASS's header library.
set(CUTLASS_ENABLE_HEADERS_ONLY
    ON
    CACHE BOOL "Enable only the header library")
271
283
272
284
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
273
- set (CUTLASS_REVISION "v4.0.0 " CACHE STRING "CUTLASS revision to use" )
285
# Default CUTLASS revision (user-overridable cache entry). The value is kept
# free of trailing whitespace so the cached string is exactly `v4.2.1`.
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
274
286
275
287
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
276
288
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -305,7 +317,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
305
317
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
306
318
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
307
319
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
308
- "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
309
320
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
310
321
"csrc/cutlass_extensions/common.cpp"
311
322
"csrc/quantization/fp8/per_token_group_quant.cu" )
@@ -440,7 +451,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
440
451
441
452
# The cutlass_scaled_mm kernels for GeForce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
442
453
# CUDA 12.8 or later
443
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS} " )
454
# Compute SCALED_MM_ARCHS for the SM120 kernels: with CUDA >= 13.0 the
# `f`-suffixed target is intersected with CUDA_ARCHS, otherwise the
# `a`-suffixed one. The version variable is tested by name behind DEFINED so
# an unset value falls through to the `else()` branch instead of producing a
# malformed if() expression.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
444
459
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
445
460
set (SRCS
446
461
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
@@ -470,7 +485,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
470
485
471
486
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
472
487
# require CUDA 12.8 or later
473
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS} " )
488
# Compute SCALED_MM_ARCHS for the SM100 scaled_mm kernels: CUDA >= 13.0 uses
# the `f`-suffixed targets, older toolkits the `a`-suffixed ones. CUDA_ARCHS
# is passed without padding so its last element is not altered, and the
# version variable is tested by name behind DEFINED so an unset value selects
# the `else()` branch.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
474
493
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
475
494
set (SRCS
476
495
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@@ -550,7 +569,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
550
569
551
570
# The nvfp4_scaled_mm_sm120 kernels for GeForce Blackwell SM120 require
552
571
# CUDA 12.8 or later
553
- cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS} " )
572
# Compute FP4_ARCHS for the SM120 NVFP4 kernels: CUDA >= 13.0 uses the
# `f`-suffixed target, older toolkits the `a`-suffixed one. CUDA_ARCHS is
# passed without padding, and the version variable is tested by name behind
# DEFINED so an unset value selects the `else()` branch.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
554
577
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
555
578
set (SRCS
556
579
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -569,7 +592,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
569
592
endif ()
570
593
571
594
# FP4 Archs and flags
572
- cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS} " )
595
# Compute FP4_ARCHS: CUDA >= 13.0 uses the `f`-suffixed targets, older
# toolkits the `a`-suffixed ones. CUDA_ARCHS is passed without padding, and
# the version variable is tested by name behind DEFINED so an unset value
# selects the `else()` branch.
# NOTE(review): the pre-13.0 list has no 10.3a entry, while the other
# pre-13.0 lists in this file that start at 10.0a;10.1a include it — confirm
# the omission is intentional.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
573
600
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
574
601
set (SRCS
575
602
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -591,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
591
618
endif ()
592
619
593
620
# CUTLASS MLA Archs and flags
594
- cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS} " )
621
# Compute MLA_ARCHS for the CUTLASS MLA kernels: CUDA >= 13.0 uses the
# `f`-suffixed targets, older toolkits the `a`-suffixed ones. CUDA_ARCHS is
# passed without padding, and the version variable is tested by name behind
# DEFINED so an unset value selects the `else()` branch.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
595
626
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
596
627
set (SRCS
597
628
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu" )
@@ -635,7 +666,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
635
666
endif ()
636
667
endif ()
637
668
638
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS} " )
669
# Recompute SCALED_MM_ARCHS restricted to the 10.0 target: `10.0f` with
# CUDA >= 13.0, `10.0a` otherwise. CUDA_ARCHS is passed without padding, and
# the version variable is tested by name behind DEFINED so an unset value
# selects the `else()` branch.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
639
674
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
640
675
set (SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu" )
641
676
set_gencode_flags_for_srcs(
@@ -656,7 +691,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
656
691
endif ()
657
692
658
693
# moe_data.cu is used by all CUTLASS MoE kernels.
659
- cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS} " )
694
# Compute CUTLASS_MOE_DATA_ARCHS. `9.0a` is a candidate in both branches; the
# remaining candidates are `f`-suffixed with CUDA >= 13.0 and `a`-suffixed
# otherwise. CUDA_ARCHS is passed without padding, and the version variable is
# tested by name behind DEFINED so an unset value selects the `else()` branch.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
660
699
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
661
700
set (SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu" )
662
701
set_gencode_flags_for_srcs(
@@ -675,7 +714,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
675
714
endif ()
676
715
endif ()
677
716
678
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS} " )
717
# Recompute SCALED_MM_ARCHS: CUDA >= 13.0 uses the `f`-suffixed targets, older
# toolkits the `a`-suffixed ones. CUDA_ARCHS is passed without padding, and
# the version variable is tested by name behind DEFINED so an unset value
# selects the `else()` branch.
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
679
722
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
680
723
set (SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu" )
681
724
set_gencode_flags_for_srcs(
0 commit comments