
Commit bcbb7a6

Merge remote-tracking branch 'upstream/main' into upstream_merge_2025_06_04
2 parents: ccfa3b8 + 9bc8bb0


52 files changed: +1286 additions, −335 deletions

.buildkite/release-pipeline.yaml

Lines changed: 18 additions & 1 deletion
@@ -1,5 +1,6 @@
 steps:
 - label: "Build wheel - CUDA 12.8"
+  id: build-wheel-cuda-12-8
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -11,6 +12,7 @@ steps:
     DOCKER_BUILDKIT: "1"

 - label: "Build wheel - CUDA 12.6"
+  id: build-wheel-cuda-12-6
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -28,6 +30,7 @@ steps:

 - label: "Build wheel - CUDA 11.8"
   # depends_on: block-build-cu118-wheel
+  id: build-wheel-cuda-11-8
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -44,13 +47,26 @@ steps:

 - label: "Build release image"
   depends_on: block-release-image-build
+  id: build-release-image
   agents:
     queue: cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

+- label: "Annotate release workflow"
+  depends_on:
+    - build-release-image
+    - build-wheel-cuda-12-8
+    - build-wheel-cuda-12-6
+    - build-wheel-cuda-11-8
+  id: annotate-release-workflow
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "bash .buildkite/scripts/annotate-release.sh"
+
 - label: "Build and publish TPU release image"
   depends_on: ~
   if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
     DOCKER_BUILDKIT: "1"

 - input: "Provide Release version here"
+  id: input-release-version
   fields:
     - text: "What is the release version?"
-      key: "release-version"
+      key: release-version

 - block: "Build CPU release image"
   key: block-cpu-release-image-build
.buildkite/scripts/annotate-release.sh

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -ex
+
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+
+if [ -z "$RELEASE_VERSION" ]; then
+  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+  exit 1
+fi
+
+buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+To download the wheel:
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
+\`\`\`
+EOF
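
The script's version handling is easy to check outside CI. Below is a minimal Python sketch of the same normalization step (the `sed 's/^v//'` plus the empty check), with hypothetical inputs, for verifying the behavior locally:

import re

def normalize_release_version(raw: str) -> str:
    # Mirror of the script's sed 's/^v//': drop one leading 'v' if present.
    version = re.sub(r"^v", "", raw.strip())
    if not version:
        # Mirror of the script's empty check: fail loudly instead of annotating.
        raise ValueError("release-version metadata is empty or missing")
    return version

# Hypothetical inputs; both forms are accepted since the leading 'v' is stripped.
assert normalize_release_version("v0.9.1") == "0.9.1"
assert normalize_release_version("0.9.1") == "0.9.1"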

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'"
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ repos:
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.11.7
   hooks:

benchmarks/benchmark_utils.py

Lines changed: 6 additions & 1 deletion
@@ -66,4 +66,9 @@ def iterencode(self, o: Any, *args, **kwargs) -> Any:

 def write_to_json(filename: str, records: list) -> None:
     with open(filename, "w") as f:
-        json.dump(records, f, cls=InfEncoder)
+        json.dump(
+            records,
+            f,
+            cls=InfEncoder,
+            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
+        )
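
For context on this change: `json.dump` raises `TypeError` on any object its encoder cannot serialize, and the `default=` callable is the standard-library escape hatch, invoked only for such objects. A minimal sketch of the fallback in action, using a simplified stand-in for the benchmark's `InfEncoder`:

import json

class InfEncoder(json.JSONEncoder):
    # Simplified stand-in for the benchmark's encoder.
    pass

records = [{"latency_s": 1.25, "raw": object()}]  # object() is not JSON serializable

# Without default=, this raises TypeError on object(); with it, the offending
# value is stringified instead, so the benchmark run still produces a file.
print(json.dumps(
    records,
    cls=InfEncoder,
    default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
))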

csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh

Lines changed: 51 additions & 2 deletions
@@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
+  // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
                             KernelSchedule, EpilogueSchedule>;
 };

+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _64>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M64 {
+  // M in [1, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
@@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
                                         Epilogue>::Cutlass3xGemm;
-  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-      out, a, b, std::forward<EpilogueArgs>(args)...);
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_gemm_caller<Cutlass3xGemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller<Cutlass3xGemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
 }

 template <template <typename, typename, typename> typename Epilogue,
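
The dispatch above picks a tile/cluster configuration by rounding M (the row count of `a`) up to the next power of two, clamping the result below at 64, and bucketing it. A small Python sketch of that selection rule, handy for sanity-checking the bucket boundaries:

def next_pow_2(x: int) -> int:
    # Smallest power of two >= x, for x >= 1.
    return 1 if x <= 1 else 1 << (x - 1).bit_length()

def select_sm100_fp8_config(m: int) -> str:
    mp2 = max(64, next_pow_2(m))  # clamp the rounded-up M at 64
    if mp2 <= 64:
        return "sm100_fp8_config_M64"      # M in [1, 64]
    if mp2 <= 128:
        return "sm100_fp8_config_M128"     # M in (64, 128]
    return "sm100_fp8_config_default"      # M in (128, inf)

assert select_sm100_fp8_config(1) == "sm100_fp8_config_M64"
assert select_sm100_fp8_config(64) == "sm100_fp8_config_M64"
assert select_sm100_fp8_config(65) == "sm100_fp8_config_M128"
assert select_sm100_fp8_config(128) == "sm100_fp8_config_M128"
assert select_sm100_fp8_config(129) == "sm100_fp8_config_default"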

docker/Dockerfile.nightly_torch

Lines changed: 3 additions & 0 deletions
@@ -312,4 +312,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Logging to confirm the torch versions
 RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'

+# Logging to confirm all the packages are installed
+RUN pip freeze
+
 #################### UNITTEST IMAGE #############################

docs/models/extensions/tensorizer.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor

 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html).

 !!! note
     Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
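
For readers landing here from the diff, a minimal sketch of the loading side that the linked example script covers, assuming a model already serialized to a hypothetical S3 URI and the `TensorizerConfig` / `load_format="tensorizer"` pattern from that script:

from vllm import LLM
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

# Hypothetical URI: point this at wherever the serialized tensors were written.
config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/vllm/facebook/opt-125m/v1/model.tensors")

llm = LLM(
    model="facebook/opt-125m",
    load_format="tensorizer",
    model_loader_extra_config=config,
)
print(llm.generate("Hello, my name is")[0].outputs[0].text)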
examples/online_serving/multi_instance_data_parallel.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+from typing import Optional
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+
+"""
+To run this example, run the following commands simultaneously with
+different CUDA_VISIBLE_DEVICES:
+    python examples/online_serving/multi_instance_data_parallel.py
+
+    vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
+        --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
+        --data-parallel-size-local 1 --enforce-eager --headless
+
+Once both instances have completed the handshake, this example will
+send a request to the instance with DP rank 1.
+"""
+
+
+async def main():
+    engine_args = AsyncEngineArgs(
+        model="ibm-research/PowerMoE-3b",
+        data_parallel_size=2,
+        dtype="auto",
+        max_model_len=2048,
+        data_parallel_address="127.0.0.1",
+        data_parallel_rpc_port=62300,
+        data_parallel_size_local=1,
+        enforce_eager=True,
+    )
+
+    engine_client = AsyncLLMEngine.from_engine_args(engine_args)
+
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        top_p=0.9,
+        max_tokens=100,
+    )
+
+    prompt = "Who won the 2004 World Series?"
+    final_output: Optional[RequestOutput] = None
+    async for output in engine_client.generate(
+        prompt=prompt,
+        sampling_params=sampling_params,
+        request_id="abcdef",
+        data_parallel_rank=1,
+    ):
+        final_output = output
+    if final_output:
+        print(final_output.outputs[0].text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

requirements/nightly_torch_test.txt

Lines changed: 7 additions & 2 deletions
@@ -9,7 +9,9 @@ pytest-shard
 pytest-timeout

 librosa # required by audio tests in entrypoints/openai
-sentence-transformers
+sentence-transformers # required for embedding tests
+transformers==4.51.3
+transformers_stream_generator # required for qwen-vl test
 numba == 0.61.2; python_version > '3.9'
 # testing utils
 boto3
@@ -38,4 +40,7 @@ matplotlib # required for qwen-vl test
 # required for Multi-Modal Models Test (Standard)
 num2words # required for smolvlm test
 pqdm
-timm # required for internvl test
+timm # required for internvl test
+
+schemathesis>=3.39.15 # Required for openai schema test.
+mteb>=1.38.11, <2 # required for mteb test
