
Commit 29bef2c

Merge remote-tracking branch 'upstream/main' into upstream_merge_2025_06_04
2 parents 8377189 + 8f4ffbd

100 files changed: 2053 additions, 463 deletions

Note: large commits have some content hidden by default, so only a subset of the 100 changed files is shown below.

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 5 additions & 8 deletions
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
   # Run AWQ test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    export VLLM_CPU_KVCACHE_SPACE=10
-    export VLLM_CPU_OMP_THREADS_BIND=$1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
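
For context, the two --env values the updated docker run lines now forward are standard vLLM CPU-backend environment variables. A minimal sketch of what they control, set from Python purely for illustration (the values and model are taken from the script above; the snippet itself is not part of the commit):

import os

# Illustration only: the documented vLLM CPU-backend knobs that the script
# above passes into the test container via --env.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "4"         # KV-cache space, in GiB
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "48-95"  # cores to pin the OpenMP workers to

from vllm import LLM  # import after setting the env vars so they take effect

llm = LLM(model="facebook/opt-125m", dtype="half")  # same model the script serves
print(llm.generate("Hello, my name is")[0].outputs[0].text)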

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'"
 run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions
@@ -145,6 +145,7 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -154,6 +155,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -318,6 +320,7 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental, amdproduction]

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"

benchmarks/benchmark_dataset.py

Lines changed: 9 additions & 1 deletion
@@ -865,7 +865,15 @@ def sample(
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
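
The new template step uses the standard Hugging Face apply_chat_template API rather than a vLLM-specific helper. A self-contained sketch of the same call (the model name is an arbitrary chat-tuned example, not one the benchmark prescribes):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative choice

raw = "print('hi')\n\nRewrite this as a one-liner. Just output the code, do not include any explanation."
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": raw}],
    add_generation_prompt=True,  # append the assistant header so the model starts answering
    tokenize=False,              # return the formatted string rather than token ids
)
prompt_len = len(tokenizer(prompt).input_ids)  # same length accounting as in the diff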

csrc/attention/mla/cutlass_mla_kernels.cu

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
       {static_cast<ElementOut*>(out.data_ptr()), stride_O,
        static_cast<ElementAcc*>(nullptr), stride_LSE},
       hw_info,
-      -1,       // split_kv
+      1,        // split_kv
       nullptr,  // is_var_split_kv
   };
   // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

csrc/ops.h

Lines changed: 5 additions & 0 deletions
@@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void apply_repetition_penalties_(torch::Tensor& logits,
+                                 const torch::Tensor& prompt_mask,
+                                 const torch::Tensor& output_mask,
+                                 const torch::Tensor& repetition_penalties);
+
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
                                double epsilon);
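
This declaration pairs with the new csrc/sampler.cu entry added to CMakeLists.txt above. As a hedged reference for what a fused op with this signature plausibly computes, here is vLLM's usual Python-side repetition-penalty rule expressed in PyTorch; the shapes and broadcasting are assumptions about the kernel, not taken from it:

import torch

def apply_repetition_penalties_ref(logits: torch.Tensor,
                                   prompt_mask: torch.Tensor,
                                   output_mask: torch.Tensor,
                                   repetition_penalties: torch.Tensor) -> None:
    # Assumed shapes: logits [num_seqs, vocab_size]; boolean masks of the same
    # shape marking tokens seen in the prompt / generated output; one penalty
    # per sequence in repetition_penalties [num_seqs].
    seen = prompt_mask | output_mask
    p = repetition_penalties.unsqueeze(dim=1)  # broadcast over the vocab dim
    penalized = torch.where(logits > 0, logits / p, logits * p)
    logits[:] = torch.where(seen, penalized, logits)  # in-place, per the trailing underscore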

csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu

Lines changed: 0 additions & 4 deletions
@@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                            torch::Tensor const& b,
                                            torch::Tensor const& a_scales,
                                            torch::Tensor const& b_scales) {
-  TORCH_CHECK(
-      a.size(0) % 4 == 0,
-      "Input tensor must have a number of rows that is a multiple of 4. ",
-      "but got: ", a.size(0), " rows.");
   if (out.dtype() == torch::kBFloat16) {
     cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
         out, a, b, a_scales, b_scales);
