@@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
1515
1616# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
1717set (VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM" )
18-
1918message (STATUS "Build type: ${CMAKE_BUILD_TYPE} " )
2019message (STATUS "Target device: ${VLLM_TARGET_DEVICE} " )
2120
@@ -251,9 +250,8 @@ set(VLLM_EXT_SRC
251250if (VLLM_GPU_LANG STREQUAL "CUDA" )
252251 SET (CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library" )
253252
254- # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
255- # Please keep this in sync with FetchContent_Declare line below.
256- set (CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use" )
253+ # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
254+ set (CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use" )
257255
258256 # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
259257 if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -271,7 +269,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
271269 cutlass
272270 GIT_REPOSITORY https://github.com/nvidia/cutlass.git
273271 # The tag is taken from the CUTLASS_REVISION cache variable set above.
274- GIT_TAG v3.9.0
272+ GIT_TAG ${CUTLASS_REVISION}
275273 GIT_PROGRESS TRUE
276274
277275 # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -304,8 +302,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
304302 # are not supported by Machete yet.
305303 cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS} " )
306304 if (MARLIN_ARCHS)
305+
#
# Marlin kernel source generation.
#
# For the Marlin kernels we automatically generate sources for various
# preselected input type pairs and schedules. The generator script is only
# re-run when its MD5 differs from the hash recorded in the CMake cache by
# the last successful run.
#
set(MARLIN_GEN_SCRIPT
    "${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py")
file(MD5 "${MARLIN_GEN_SCRIPT}" MARLIN_GEN_SCRIPT_HASH)

message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")

# Both STREQUAL operands are quoted so that an empty cache entry still yields
# a well-formed comparison.
if(NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
   OR NOT "$CACHE{MARLIN_GEN_SCRIPT_HASH}" STREQUAL "${MARLIN_GEN_SCRIPT_HASH}")
  # execute_process() does not go through a shell, so a bare `$PYTHONPATH`
  # would be handed to the child verbatim. `$ENV{PYTHONPATH}` forwards the
  # configure-time value instead; it is quoted so a ';'-separated value stays
  # a single argument.
  #
  # The exit code and the captured output use distinct variables so that the
  # success check below always looks at the exit code.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E env
            "PYTHONPATH=$ENV{PYTHONPATH}"
            ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
    RESULT_VARIABLE marlin_generation_result
    OUTPUT_VARIABLE marlin_generation_output
    OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
    ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
  )

  if(NOT marlin_generation_result EQUAL 0)
    message(FATAL_ERROR "Marlin generation failed."
                        " Result: \"${marlin_generation_result}\""
                        "\nCheck the log for details: "
                        "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
  else()
    # Record the hash of the script that just ran so later configures can
    # skip generation while the script is unchanged. FORCE is needed here
    # because this entry is bookkeeping that must be overwritten on each run.
    set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
        CACHE STRING "Last run Marlin generate script hash" FORCE)
    message(STATUS "Marlin generation completed successfully.")
  endif()
else()
  message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()

# Collect the kernel_*.cu sources, restrict them to the Marlin-capable
# architectures and add them to the main extension's source list.
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
set_gencode_flags_for_srcs(
  SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
  CUDA_ARCHS "${MARLIN_ARCHS}")

list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
349+
307350 set (MARLIN_SRCS
308- "csrc/quantization/fp8/fp8_marlin.cu"
309351 "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
310352 "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
311353 "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -647,7 +689,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
647689 OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH} )
648690 execute_process (
649691 COMMAND ${CMAKE_COMMAND} -E env
650- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR} /csrc/cutlass_extensions/: ${CUTLASS_DIR} /python/: ${VLLM_PYTHON_PATH} :$ PYTHONPATH
692+ PYTHONPATH=$PYTHONPATH
651693 ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
652694 RESULT_VARIABLE moe_marlin_generation_result
653695 OUTPUT_VARIABLE moe_marlin_generation_output
@@ -683,6 +725,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
683725 endif ()
684726endif ()
685727
# MoE permute/unpermute kernels: CUDA-only sources added to the _moe_C
# extension's source list.
if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(MOE_PERMUTE_SRC
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
      "csrc/moe/moe_permute_unpermute_op.cu")

  # Pass the list defined just above. The previous spelling,
  # MARLIN_PERMUTE_SRC, is never set, so the call received no sources.
  # NOTE(review): MOE_PERMUTE_ARCHS is not assigned in the visible part of
  # this file; confirm it is populated before this point, otherwise no
  # architectures are passed to this call.
  set_gencode_flags_for_srcs(
    SRCS "${MOE_PERMUTE_SRC}"
    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
686739message (STATUS "Enabling moe extension." )
687740define_gpu_extension_target(
688741 _moe_C
@@ -691,6 +744,8 @@ define_gpu_extension_target(
691744 SOURCES ${VLLM_MOE_EXT_SRC}
692745 COMPILE_FLAGS ${VLLM_GPU_FLAGS}
693746 ARCHITECTURES ${VLLM_GPU_ARCHES}
747+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
748+ INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
694749 USE_SABI 3
695750 WITH_SOABI)
696751
0 commit comments