
Commit 49cdc3d

Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/dbo-full-cudagraphs
2 parents: ba00047 + 3724107

141 files changed: +5050 additions, −1262 deletions


.buildkite/check-wheel-size.py

Lines changed: 4 additions & 4 deletions

@@ -5,11 +5,11 @@
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


 def print_top_10_largest_files(zip_file):
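The size check reads its limit from the `VLLM_MAX_SIZE_MB` environment variable, falling back to the new default of 450. A minimal sketch of overriding it for a local run; the positional wheel-directory argument is an assumption about how the script is invoked, not something shown in this hunk:

```bash
# Hypothetical local invocation: raise the limit to 500 MB for one run.
# The "dist" positional argument is assumed; check the script's actual CLI.
VLLM_MAX_SIZE_MB=500 python3 .buildkite/check-wheel-size.py dist
```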

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 5 additions & 4 deletions

@@ -30,10 +30,11 @@ docker run \
 bash -c '
 set -e
 echo $ZE_AFFINITY_MASK
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 cd tests
 pytest -v -s v1/core
 pytest -v -s v1/engine
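The dropped `VLLM_USE_V1=1` prefixes reflect the V1 engine being the default, and the new last command pins the attention backend through `VLLM_ATTENTION_BACKEND`. The same switch works outside the CI container; a minimal sketch assuming the standard `vllm serve` entrypoint, with an illustrative model choice:

```bash
# Select the Triton attention backend for an ad-hoc server run
# (entrypoint and model are illustrative, not taken from this script).
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 vllm serve facebook/opt-125m --enforce-eager
```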

README.md

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ Easy, fast, and cheap LLM serving for everyone

 *Latest News* 🔥

+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
 - [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).

benchmarks/auto_tune/README.md

Lines changed: 6 additions & 0 deletions

@@ -31,6 +31,12 @@ cd vllm

 You must set the following variables at the top of the script before execution.

+Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |

benchmarks/auto_tune/auto_tune.sh

Lines changed: 31 additions & 15 deletions

@@ -5,25 +5,41 @@

 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"

-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNEPARAMETERS ===================="

 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH

@@ -213,7 +229,7 @@ run_benchmark() {

 pkill -if vllm
 sleep 10
-printf '=%.0s' $(seq 1 20)
+echo "===================="
 return 0
 }
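The rewrite switches every tunable to Bash's `${VAR:-default}` expansion, so a value exported by the caller wins and the script's default applies otherwise. A minimal standalone sketch of the same pattern, reusing two of the script's variable names (this demo file is not part of auto_tune.sh):

```bash
#!/usr/bin/env bash
# demo.sh: defaults apply only when the caller has not set the variable.
INPUT_LEN=${INPUT_LEN:-4000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
echo "INPUT_LEN=$INPUT_LEN NUM_SEQS_LIST=$NUM_SEQS_LIST"

# bash demo.sh                 -> INPUT_LEN=4000 NUM_SEQS_LIST=128 256
# INPUT_LEN=128 bash demo.sh   -> INPUT_LEN=128  NUM_SEQS_LIST=128 256
```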

benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh

Lines changed: 2 additions & 2 deletions

@@ -62,7 +62,7 @@ benchmark() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


 CUDA_VISIBLE_DEVICES=1 python3 \

@@ -72,7 +72,7 @@ benchmark() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 wait_for_server 8100
 wait_for_server 8200

benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh

Lines changed: 2 additions & 2 deletions

@@ -69,7 +69,7 @@ launch_disagg_prefill() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 CUDA_VISIBLE_DEVICES=1 python3 \
 -m vllm.entrypoints.openai.api_server \

@@ -78,7 +78,7 @@ launch_disagg_prefill() {
 --max-model-len 10000 \
 --gpu-memory-utilization 0.6 \
 --kv-transfer-config \
-'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 wait_for_server 8100
 wait_for_server 8200
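Both benchmark scripts switch the KV-transfer connector from PyNcclConnector to P2pNcclConnector while keeping the producer/consumer split across two GPUs. A minimal sketch of that launch pattern; the model placeholder and ports mirror the hunks above, but the surrounding flags are illustrative assumptions rather than the full scripts:

```bash
MODEL="meta-llama/Llama-3.1-8B-Instruct"  # placeholder; the scripts set their own model

# Prefill ("producer") instance on GPU 0, port 8100.
CUDA_VISIBLE_DEVICES=0 python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

# Decode ("consumer") instance on GPU 1, port 8200.
CUDA_VISIBLE_DEVICES=1 python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

# The real scripts then poll both ports with their wait_for_server helper.
```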

cmake/cpu_extension.cmake

Lines changed: 2 additions & 1 deletion

@@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)

 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
 message(STATUS "Apple Silicon Detected")
+set(APPLE_SILICON_FOUND TRUE)
 set(ENABLE_NUMA OFF)
 check_sysctl(hw.optional.neon ASIMD_FOUND)
 check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)

@@ -189,7 +190,7 @@ else()
 set(USE_ACL OFF)
 endif()

-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
 FetchContent_Declare(
 oneDNN
 GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
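The new APPLE_SILICON_FOUND flag keeps the oneDNN fetch out of Apple Silicon builds, so macOS arm64 CPU builds skip that dependency. For anyone reproducing that path, a hedged sketch of a CPU-only source build; the VLLM_TARGET_DEVICE=cpu switch and the requirements file path are assumptions drawn from vLLM's CPU build documentation, not from this diff:

```bash
# Hypothetical CPU-only build on an Apple Silicon Mac (paths and flags assumed).
git clone https://github.com/vllm-project/vllm.git && cd vllm
pip install -r requirements/cpu.txt
VLLM_TARGET_DEVICE=cpu pip install -e . --no-build-isolation
```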

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ else()
 FetchContent_Declare(
 vllm-flash-attn
 GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
 GIT_PROGRESS TRUE
 # Don't share the vllm-flash-attn build between build types
 BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

csrc/attention/mla/sm100_cutlass_mla_kernel.cu

Lines changed: 8 additions & 8 deletions

@@ -64,11 +64,11 @@ struct IsPersistent {
 static const bool value = v;
 };

-template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+template <typename T, typename TOut, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
 struct MlaSm100 {
 using Element = T;
 using ElementAcc = float;
-using ElementOut = T;
+using ElementOut = TOut;

 using TileShape = Shape<_128, _128, Shape<_512, _64>>;
 using TileShapeH = cute::tuple_element_t<0, TileShape>;

@@ -178,7 +178,7 @@ typename T::Fmha::Arguments args_from_options(
 return arguments;
 }

-template <typename Element, bool IsPaged128, typename PersistenceOption>
+template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
 void runMla(
 at::Tensor const& out,
 at::Tensor const& q_nope,

@@ -190,7 +190,7 @@ void runMla(
 double sm_scale,
 int64_t num_kv_splits,
 cudaStream_t stream) {
-using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
+using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
 typename MlaSm100Type::Fmha fmha;
 auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);

@@ -233,13 +233,13 @@ void sm100_cutlass_mla_decode(
 DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
 DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
 if (in_dtype == at::ScalarType::Half) {
-runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
 out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
 } else if (in_dtype == at::ScalarType::BFloat16) {
-runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
 out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
 } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
 out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
 } else {
 TORCH_CHECK(false, "Unsupported input data type of MLA");

@@ -253,7 +253,7 @@ void sm100_cutlass_mla_decode(
 int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
 // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
 // which are float, so Element type here doesn't matter.
-using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
+using MlaSm100Type = MlaSm100<cutlass::half_t, cutlass::half_t, true>;

 // Get split kv. Requires problem shape and sm_count only.
 typename MlaSm100Type::Fmha::Arguments arguments;
