cmake_minimum_required(VERSION 3.26)

+ # When building directly using CMake, make sure you run the install step
+ # (it places the .so files in the correct location).
+ #
+ # Example:
+ # mkdir build && cd build
+ # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
+ # cmake --build . --target install
+ #
+ # If you want to only build one target, make sure to install it manually:
+ # cmake --build . --target _C
+ # cmake --install . --component _C
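+ # The same two-step pattern should work for the other extension targets
+ # defined below (e.g. _moe_C), assuming each registers an install component
+ # named after the target:
+ # cmake --build . --target _moe_C
+ # cmake --install . --component _moe_C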
project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

+ # Prevent installation of dependencies (cutlass) by default.
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
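+ # (CMAKE_INSTALL_LOCAL_ONLY makes `cmake --install` skip install rules coming
+ # from subdirectories, which is where FetchContent-provided dependencies live;
+ # ALL_COMPONENTS applies this to every install component.)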
+
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
find_package(Torch REQUIRED)

#
- # Add the `default` target which detects which extensions should be
- # built based on platform/architecture. This is the same logic that
- # setup.py uses to select which extensions should be built and should
- # be kept in sync.
- #
- # The `default` target makes direct use of cmake easier since knowledge
- # of which extensions are supported has been factored in, e.g.
- #
- # mkdir build && cd build
- # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
- # cmake --build . --target default
- #
- add_custom_target(default)
message(STATUS "Enabling core extension.")

# Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

- add_dependencies(default _core_C)
-
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -173,6 +172,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

+ include(FetchContent)
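+ # (FetchContent is now included once at the top level, since it is used below
+ # for both cutlass and vllm-flash-attn.)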
+
#
# Set rocm version dev int.
#
@@ -203,8 +204,11 @@ set(VLLM_EXT_SRC
  "csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
-   include(FetchContent)
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+
+   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
+   set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+
  FetchContent_Declare(
    cutlass
    GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@@ -301,6 +305,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    "csrc/custom_all_reduce.cu")
endif()

+ message(STATUS "Enabling C extension.")
define_gpu_extension_target(
  _C
  DESTINATION vllm
@@ -331,6 +336,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/moe/marlin_moe_ops.cu")
endif()

+ message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@@ -341,7 +347,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

-
if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # _rocm_C extension
@@ -364,16 +369,66 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    WITH_SOABI)
endif()

+ # vllm-flash-attn currently only supported on CUDA
+ if(NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+   return()
+ endif()
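+ # (return() stops processing the rest of this file, so everything below runs
+ # only for CUDA builds.)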

- if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-   message(STATUS "Enabling C extension.")
-   add_dependencies(default _C)
+ #
+ # Build vLLM flash attention from source
+ #
+ # IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+ # Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
+ # They should be identical but if they aren't, this is a massive footgun.
+ #
+ # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+ # To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+ # If no component is specified, vllm-flash-attn is still installed.
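+ # For example, after building:
+ #   cmake --install . --component vllm_flash_attn_c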

-   message(STATUS "Enabling moe extension.")
-   add_dependencies(default _moe_C)
+ # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of being downloaded.
+ # This is to enable local development of vllm-flash-attn within vLLM.
+ # It can be set as an environment variable or passed as a cmake argument.
+ # The environment variable takes precedence.
+ if(DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+   set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
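+ # For example (the path is illustrative):
+ #   VLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention cmake ...
+ # or:
+ #   cmake -DVLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention ...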

- if(VLLM_GPU_LANG STREQUAL "HIP")
-   message(STATUS "Enabling rocm extension.")
-   add_dependencies(default _rocm_C)
+ if(VLLM_FLASH_ATTN_SRC_DIR)
+   FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+ else()
+   FetchContent_Declare(
+     vllm-flash-attn
+     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+     GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+     GIT_PROGRESS TRUE
+   )
endif()
+
+ # Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+ set(VLLM_PARENT_BUILD ON)
+
+ # Ensure the vllm/vllm_flash_attn directory exists before installation
+ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
+
+ # Make sure vllm-flash-attn install rules are nested under vllm/
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+ install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+ install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+
+ # Fetch the vllm-flash-attn library
+ FetchContent_MakeAvailable(vllm-flash-attn)
+ message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+
+ # Restore the install prefix
+ install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+
+ # Copy over the vllm-flash-attn python files
+ install(
+   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+   DESTINATION vllm/vllm_flash_attn
+   COMPONENT vllm_flash_attn_c
+   FILES_MATCHING PATTERN "*.py"
+ )
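+ # (Only the Python files are copied here; the compiled vllm-flash-attn library
+ # is presumably installed by the nested install rules set up above.)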
+
+ # Nothing after vllm-flash-attn, see comment about macros above