
Commit 57ea101

Merge pull request #203 from ROCm/upstream_merge_24_9_23
Upstream merge 24 9 23
2 parents 1f0d319 + cebe70c · commit 57ea101

287 files changed: +9576 additions, -4448 deletions


.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@ steps:
           containers:
           - image: badouralix/curl-jq
             command:
-            - sh
-            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
   - label: "A100"
     agents:

.buildkite/nightly-benchmarks/scripts/wait-for-image.sh

Lines changed: 3 additions & 1 deletion
@@ -2,9 +2,11 @@
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
 URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
 
+TIMEOUT_SECONDS=10
+
 retries=0
 while [ $retries -lt 1000 ]; do
-    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+    if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
         exit 0
     fi
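The new --max-time flag bounds each poll attempt so a hung registry connection cannot stall the retry loop. A minimal sketch of the behaviour (not part of the commit; the slow endpoint httpbin.org/delay is used only for illustration):

    TIMEOUT_SECONDS=10
    # On timeout curl exits non-zero and -w prints 000, so the loop's "-eq 200" test
    # fails cleanly and the next retry proceeds instead of hanging on a dead connection.
    code=$(curl -s --max-time $TIMEOUT_SECONDS -o /dev/null -w "%{http_code}" https://httpbin.org/delay/30)
    echo "HTTP code: ${code}"   # prints 000 after ~10s rather than waiting 30s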

.buildkite/run-amd-test.sh

Lines changed: 11 additions & 0 deletions
@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_encoder_decoder_attn.py \
   --ignore=kernels/test_flash_attn.py \
   --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_gguf.py \
   --ignore=kernels/test_int8_quant.py \
   --ignore=kernels/test_machete_gemm.py \
   --ignore=kernels/test_mamba_ssm.py \
@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_sampler.py"
 fi
 
+#ignore certain Entrypoints tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
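The splice above relies on bash's ${var//pattern/replacement} expansion, which rewrites every occurrence of the quoted pattern inside the command string. A standalone illustration (the command string here is made up for the example, not taken from CI):

    commands="pytest -v -s entrypoints/openai "
    # Replace the suite name with itself plus an extra --ignore flag.
    commands=${commands//" entrypoints/openai "/" entrypoints/openai --ignore=entrypoints/openai/test_audio.py "}
    echo "$commands"
    # pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_audio.py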

.buildkite/test-pipeline.yaml

Lines changed: 14 additions & 9 deletions
@@ -43,13 +43,15 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
@@ -82,7 +84,7 @@ steps:
 - label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   commands:
@@ -163,13 +165,6 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
 
-- label: torch compile integration test
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s ./compile/test_full_graph.py
-  - pytest -v -s ./compile/test_wrapper.py
-
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -259,6 +254,13 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
+- label: Encoder Decoder tests # 5min
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+  - pytest -v -s encoder_decoder
+
 - label: OpenAI-Compatible Tool Use # 20 min
   fast_check: false
   mirror_hardwares: [ amd ]
@@ -348,7 +350,10 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - vllm/compilation
   commands:
+  - pytest -v -s ./compile/test_full_graph.py
+  - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
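For reference, the new and relocated test groups run the same commands locally; a sketch assuming a working vLLM development install and the repository's tests/ directory as the working directory:

    cd tests
    pytest -v -s mq_llm_engine                  # new MQLLMEngine group
    pytest -v -s encoder_decoder                # new Encoder Decoder group
    pytest -v -s ./compile/test_full_graph.py   # compile tests, now run in the step above
    pytest -v -s ./compile/test_wrapper.py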

.github/workflows/ruff.yml

Lines changed: 2 additions & 2 deletions
@@ -25,10 +25,10 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+        pip install -r requirements-lint.txt
     - name: Analysing the code with ruff
       run: |
-        ruff .
+        ruff check .
     - name: Spelling check with codespell
       run: |
         codespell --toml pyproject.toml
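The same checks can be reproduced locally; a sketch assuming the repository root and that requirements-lint.txt pins the lint toolchain, as this change implies:

    python -m pip install --upgrade pip
    pip install -r requirements-lint.txt
    ruff check .                      # newer ruff versions lint via the "check" subcommand
    codespell --toml pyproject.toml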

.github/workflows/scripts/build.sh

Lines changed: 1 addition & 0 deletions
@@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
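When building a wheel outside this script, the same environment variables control which GPU architectures the CUDA kernels and the bundled vllm-flash-attn are compiled for. A hedged sketch of a manual wheel build using the values above:

    export MAX_JOBS=1
    export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
    export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"   # narrower list only for flash-attn, keeps the binary small
    python3 setup.py bdist_wheel --dist-dir=dist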

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,9 @@
 # vllm commit id, generated by setup.py
 vllm/commit_id.py
 
+# vllm-flash-attn built from source
+vllm/vllm_flash_attn/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -12,6 +15,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+cmake-build-*/
+CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/

CMakeLists.txt

Lines changed: 80 additions & 25 deletions
@@ -1,5 +1,16 @@
 cmake_minimum_required(VERSION 3.26)
 
+# When building directly using CMake, make sure you run the install step
+# (it places the .so files in the correct location).
+#
+# Example:
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
+# cmake --build . --target install
+#
+# If you want to only build one target, make sure to install it manually:
+# cmake --build . --target _C
+# cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
 find_package(Torch REQUIRED)
 
 #
-# Add the `default` target which detects which extensions should be
-# built based on platform/architecture. This is the same logic that
-# setup.py uses to select which extensions should be built and should
-# be kept in sync.
-#
-# The `default` target makes direct use of cmake easier since knowledge
-# of which extensions are supported has been factored in, e.g.
-#
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
-# cmake --build . --target default
-#
-add_custom_target(default)
 message(STATUS "Enabling core extension.")
 
 # Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-add_dependencies(default _core_C)
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -173,6 +172,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+include(FetchContent)
+
 #
 # Set rocm version dev int.
 #
@@ -203,8 +204,11 @@ set(VLLM_EXT_SRC
   "csrc/torch_bindings.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  include(FetchContent)
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+
+  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
+  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@@ -301,6 +305,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     "csrc/custom_all_reduce.cu")
 endif()
 
+message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
   _C
   DESTINATION vllm
@@ -331,6 +336,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/moe/marlin_moe_ops.cu")
 endif()
 
+message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
   DESTINATION vllm
@@ -341,7 +347,6 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-
 if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   # _rocm_C extension
@@ -364,16 +369,66 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   WITH_SOABI)
 endif()
 
+# vllm-flash-attn currently only supported on CUDA
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+  return()
+endif ()
 
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-  message(STATUS "Enabling C extension.")
-  add_dependencies(default _C)
+#
+# Build vLLM flash attention from source
+#
+# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
+# They should be identical but if they aren't, this is a massive footgun.
+#
+# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# If no component is specified, vllm-flash-attn is still installed.
 
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
+# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
+# This is to enable local development of vllm-flash-attn within vLLM.
+# It can be set as an environment variable or passed as a cmake argument.
+# The environment variable takes precedence.
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()
 
-if(VLLM_GPU_LANG STREQUAL "HIP")
-  message(STATUS "Enabling rocm extension.")
-  add_dependencies(default _rocm_C)
+if(VLLM_FLASH_ATTN_SRC_DIR)
+  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+else()
+  FetchContent_Declare(
+        vllm-flash-attn
+        GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+        GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+        GIT_PROGRESS TRUE
+  )
 endif()
+
+# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+set(VLLM_PARENT_BUILD ON)
+
+# Ensure the vllm/vllm_flash_attn directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
+
+# Make sure vllm-flash-attn install rules are nested under vllm/
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+
+# Fetch the vllm-flash-attn library
+FetchContent_MakeAvailable(vllm-flash-attn)
+message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+
+# Copy over the vllm-flash-attn python files
+install(
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm/vllm_flash_attn
+  COMPONENT vllm_flash_attn_c
+  FILES_MATCHING PATTERN "*.py"
+)
+
+# Nothing after vllm-flash-attn, see comment about macros above
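Taken together, the new comments and install rules describe a direct-CMake workflow roughly like the following. This is a sketch assembled from the comments above; the flash-attention checkout path is a hypothetical example:

    mkdir build && cd build
    cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=.. ..
    cmake --build . --target install              # build and install all extensions

    # Build and install a single extension instead:
    cmake --build . --target _C
    cmake --install . --component _C

    # Install only the nested vllm-flash-attn component:
    cmake --install . --component vllm_flash_attn_c

    # Local vllm-flash-attn development: point the build at a checkout instead of
    # the pinned GitHub tag (the environment variable takes precedence over -D).
    export VLLM_FLASH_ATTN_SRC_DIR=$HOME/src/flash-attention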

Dockerfile

Lines changed: 5 additions & 4 deletions
@@ -48,6 +48,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+# Override the arch list for flash-attn to reduce the binary size
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################
 
 #################### WHEEL BUILD IMAGE ####################
@@ -82,6 +85,7 @@ ENV BUILDKITE_COMMIT=${buildkite_commit}
 ARG USE_SCCACHE
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$USE_SCCACHE" = "1" ]; then \
@@ -92,6 +96,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
         && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
         && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
+        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
         && sccache --show-stats \
@@ -180,10 +185,6 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 
 # install development dependencies (for testing)
-# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
-# This installation must complete before the test dependencies are collected and installed.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install "setuptools>=74.1.1"
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-dev.txt
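The new ARGs can be overridden at image build time. A hedged sketch (the image tag is illustrative; SCCACHE_S3_NO_CREDENTIALS=1 is the sccache setting that allows reading a public cache bucket without AWS credentials):

    docker build \
      --build-arg vllm_fa_cmake_gpu_arches='80-real;90-real' \
      --build-arg USE_SCCACHE=1 \
      --build-arg SCCACHE_S3_NO_CREDENTIALS=1 \
      -t vllm-cuda:dev .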

Dockerfile.cpu

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
 
 RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
 
+WORKDIR /workspace
+
 ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
