This repository was archived by the owner on Sep 4, 2025. It is now read-only.

Commit f25aa53

Merge remote-tracking branch 'IBM/main' into sync-with-ibm

2 parents: 255735f + 066041a


299 files changed: +16,421 / -4,789 lines changed

.buildkite/check-wheel-size.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 import os
 import zipfile
 
-MAX_SIZE_MB = 100
+MAX_SIZE_MB = 150
 
 
 def print_top_10_largest_files(zip_file):

.buildkite/run-amd-test.sh

Lines changed: 7 additions & 6 deletions
@@ -1,4 +1,4 @@
-# This script build the ROCm docker image and runs test inside it.
+# This script runs test inside the corresponding ROCm docker container.
 set -ex
 
 # Print ROCm version
@@ -19,15 +19,16 @@ done
 
 echo "--- Building container"
 sha=$(git rev-parse --short HEAD)
-container_name=rocm_${sha}
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
 docker build \
-       -t ${container_name} \
+       -t ${image_name} \
        -f Dockerfile.rocm \
        --progress plain \
        .
 
 remove_docker_container() {
-   docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
 }
 trap remove_docker_container EXIT
 
@@ -39,6 +40,6 @@ docker run \
        --rm \
        -e HF_TOKEN \
        --name ${container_name} \
-       ${container_name} \
-       /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
+       ${image_name} \
+       /bin/bash -c "${@}"
 
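
Note on the last hunk: the script used to receive the whole test command as a single-quoted string and strip the quotes with sed; it now forwards its arguments to the container verbatim. A minimal sketch of the new calling convention (the command string is illustrative; the quoting matches the updated .buildkite/test-template.j2 below):

# The caller passes the test command as one ordinary shell argument:
bash .buildkite/run-amd-test.sh "cd /vllm-workspace/tests ; pytest -v -s core"

# Inside the script, "${@}" expands to the arguments exactly as given,
# so no sed-based quote stripping is needed before bash -c runs them:
docker run --rm --name "${container_name}" "${image_name}" /bin/bash -c "${@}"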

.buildkite/run-benchmarks.sh

Lines changed: 4 additions & 3 deletions
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
   exit $bench_serving_exit_code
 fi
 
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"
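
Note: both Python benchmarks now emit machine-readable results via --output-json, and the upload glob was widened from openai-*.json to "*.json". The ShareGPT dataset file is deleted first, presumably so the wildcard does not sweep up the large dataset download along with the results. A local approximation of the new flow (the jq pretty-print at the end is only an illustration, not part of the script):

python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 \
    --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
rm ShareGPT_V3_unfiltered_cleaned_split.json   # keep the "*.json" upload limited to result files
jq . latency_results.json throughput_results.json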

.buildkite/test-pipeline.yaml

Lines changed: 49 additions & 18 deletions
@@ -5,13 +5,16 @@
 
 steps:
 - label: Regression Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: AsyncEngine Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s async_engine
 
 - label: Basic Correctness Test
+  mirror_hardwares: [amd]
   commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -24,34 +27,40 @@ steps:
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
-  command: pytest -v -s test_comm_ops.py
-  working_dir: "/vllm-workspace/tests/distributed"
+  #mirror_hardwares: [amd]
+  command: pytest -v -s distributed/test_comm_ops.py
+  working_dir: "/vllm-workspace/tests"
   num_gpus: 2
 
 - label: Distributed Tests
-  working_dir: "/vllm-workspace/tests/distributed"
-
-  num_gpus: 2 # only support 1 or 2 for now.
   mirror_hardwares: [amd]
-
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
   commands:
-  - pytest -v -s test_pynccl_library.py
-  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
+  - pytest -v -s distributed/test_pynccl_library.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py
 
 - label: Distributed Tests (Multiple Groups)
-  working_dir: "/vllm-workspace/tests/distributed"
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
-  - pytest -v -s test_pynccl.py
+  - pytest -v -s distributed/test_pynccl.py
 
 - label: Engine Test
   mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
+  #mirror_hardwares: [amd]
   commands:
   # these tests have to be separated, because each one will allocate all posible GPU memory
   - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
@@ -62,21 +71,24 @@
   mirror_hardwares: [amd]
   commands:
   # install aws cli for llava_example.py
-  - pip install awscli
+  # install tensorizer for tensorize_vllm_model.py
+  - pip install awscli tensorizer
   - python3 offline_inference.py
   - python3 offline_inference_with_prefix.py
   - python3 llm_engine_example.py
   - python3 llava_example.py
+  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 
 - label: Kernels Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Models Test
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
-  - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
+  - pytest -v -s models --ignore=models/test_llava.py
 
 - label: Llava Test
   mirror_hardwares: [amd]
@@ -90,6 +102,7 @@
   - pytest -v -s prefix_caching
 
 - label: Samplers Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
@@ -101,20 +114,38 @@
   command: pytest -v -s worker
 
 - label: Speculative decoding tests
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  #mirror_hardwares: [amd]
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4
 
+- label: LoRA Long Context (Distributed)
+  #mirror_hardwares: [amd]
+  num_gpus: 4
+  # This test runs llama 13B, so it is required to run on 4 GPUs.
+  commands:
+  # Temporarily run this way because we cannot clean up GPU mem usage
+  # for multi GPU tests.
+  # TODO(sang): Fix it.
+  - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
+  - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
+  - pytest -v -s lora/test_long_context.py::test_self_consistency
+  - pytest -v -s lora/test_long_context.py::test_quality
+  - pytest -v -s lora/test_long_context.py::test_max_len
+
 - label: Tensorizer Test
+  #mirror_hardwares: [amd]
   command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
 
 - label: Metrics Test
+  mirror_hardwares: [amd]
   command: pytest -v -s metrics
 
 - label: Quantization Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s quantization
 
 - label: Benchmarks
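
Note: the distributed correctness and chunked-prefill tests are now parameterized over both the ray and mp executor backends via the DISTRIBUTED_EXECUTOR_BACKEND environment variable. One of the new entries can be reproduced by hand roughly as follows (a sketch assuming a checkout with the tests directory and at least 2 GPUs; /vllm-workspace/tests is the path used in the CI container):

cd /vllm-workspace/tests
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray \
    pytest -v -s distributed/test_basic_distributed_correctness.py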

.buildkite/test-template.j2

Lines changed: 6 additions & 3 deletions
@@ -3,9 +3,8 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
-
 - label: ":docker: build image"
-  commands:
+  commands:
   - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
   - "docker push {{ docker_image }}"
   env:
@@ -14,6 +13,8 @@ steps:
       automatic:
         - exit_status: -1  # Agent was lost
           limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
 - wait
 
 - group: "AMD Tests"
@@ -24,7 +25,7 @@ steps:
   - label: "AMD: {{ step.label }}"
     agents:
       queue: amd
-    command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+    command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
     env:
       DOCKER_BUILDKIT: "1"
 {% endif %}
@@ -53,6 +54,8 @@ steps:
       automatic:
         - exit_status: -1  # Agent was lost
           limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
   plugins:
   - kubernetes:
       podSpec:

CMakeLists.txt

Lines changed: 39 additions & 9 deletions
@@ -167,19 +167,44 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/fp8/fp8_cuda_kernels.cu"
+  "csrc/quantization/fp8/common.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
   "csrc/pybind.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  include(FetchContent)
+  SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
+  FetchContent_Declare(
+        cutlass
+        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+        # CUTLASS 3.5.0
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+  )
+  FetchContent_MakeAvailable(cutlass)
+
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-    "csrc/custom_all_reduce.cu")
+    "csrc/custom_all_reduce.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+  #
+  # The CUTLASS kernels for Hopper require sm90a to be enabled.
+  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+  set_source_files_properties(
+        "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+        PROPERTIES
+        COMPILE_FLAGS
+        "-gencode arch=compute_90a,code=sm_90a")
+
 endif()
 
 define_gpu_extension_target(
@@ -189,6 +214,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   WITH_SOABI)
 
 #
@@ -219,7 +245,8 @@ set(VLLM_PUNICA_EXT_SRC
   "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
   "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
   "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
-  "csrc/punica/punica_ops.cc")
+  "csrc/punica/punica_ops.cu"
+  "csrc/punica/punica_pybind.cpp")
 
 #
 # Copy GPU compilation flags+update for punica
@@ -243,6 +270,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
     endif()
   endforeach()
   message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
 endif()
 
 if (VLLM_PUNICA_GPU_ARCHES)
@@ -277,11 +307,6 @@ add_custom_target(default)
 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
-endif()
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
 
   # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
   # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
@@ -292,3 +317,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     add_dependencies(default _punica_C)
   endif()
 endif()
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  message(STATUS "Enabling moe extension.")
+  add_dependencies(default _moe_C)
+endif()

Dockerfile

Lines changed: 0 additions & 21 deletions
@@ -87,23 +87,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip cache remove vllm_nccl*
 #################### EXTENSION Build IMAGE ####################
 
-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# flash attention version
-ARG flash_attn_version=v2.5.8
-ENV FLASH_ATTN_VERSION=${flash_attn_version}
-
-WORKDIR /usr/src/flash-attention-v2
-
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
-    --no-build-isolation --no-deps --no-cache-dir
-
-#################### FLASH_ATTENTION Build IMAGE ####################
-
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
@@ -122,10 +105,6 @@ RUN ldconfig /usr/local/cuda-12.4/compat/
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     pip install dist/*.whl --verbose
-
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################

Dockerfile.rocm

Lines changed: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ COPY . .
 
 RUN python3 -m pip install --upgrade pip numba
 
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
     && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
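
Note: setting VLLM_INSTALL_PUNICA_KERNELS=1 in the image matches the CMake logic above, which enables the _punica_C extension when that variable is set in the environment. Outside of Docker, the same toggle would look roughly like this for a source build (a sketch; only the environment variable comes from this commit, the editable install command is an assumption):

export VLLM_INSTALL_PUNICA_KERNELS=1   # ask the CMake build to include the punica (LoRA) kernels
pip install -e .                       # hypothetical from-source install of the checkout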
