Skip to content

Commit f176443

Browse files
tjtanaa authored and khluu committed
[Release] [CI] Optim release pipeline (#33156)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com> (cherry picked from commit f9d0359)
1 parent fe18ce4 commit f176443

File tree

5 files changed

+399
-24
lines changed

5 files changed

+399
-24
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,9 +638,93 @@ steps:
638638
depends_on:
639639
- step: upload-rocm-wheels
640640
allow_failure: true
641+
- step: input-release-version
642+
allow_failure: true
641643
agents:
642644
queue: cpu_queue_postmerge
643645
commands:
644646
- "bash .buildkite/scripts/annotate-rocm-release.sh"
645647
env:
646648
S3_BUCKET: "vllm-wheels"
649+
650+
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
651+
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
652+
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
653+
- block: "Generate Root Index for ROCm Wheels for Release"
654+
key: block-generate-root-index-rocm-wheels
655+
depends_on: upload-rocm-wheels
656+
657+
- label: ":package: Generate Root Index for ROCm Wheels for Release"
658+
depends_on: block-generate-root-index-rocm-wheels
659+
id: generate-root-index-rocm-wheels
660+
agents:
661+
queue: cpu_queue_postmerge
662+
commands:
663+
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
664+
env:
665+
S3_BUCKET: "vllm-wheels"
666+
VARIANT: "rocm700"
667+
668+
# ROCm Job 6: Build ROCm Release Docker Image
669+
- label: ":rocm: :docker: Build ROCm Release Docker Image"
670+
id: build-rocm-release-image
671+
depends_on:
672+
- step: build-rocm-base-wheels
673+
allow_failure: false
674+
agents:
675+
queue: cpu_queue_postmerge
676+
timeout_in_minutes: 60
677+
commands:
678+
- |
679+
set -euo pipefail
680+
681+
# Login to ECR
682+
aws ecr-public get-login-password --region us-east-1 | \
683+
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
684+
685+
# Download Docker image from S3 (set by build-rocm-base-wheels)
686+
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
687+
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
688+
echo "ERROR: rocm-docker-image-s3-path metadata not found"
689+
exit 1
690+
fi
691+
692+
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
693+
mkdir -p artifacts/rocm-docker-image
694+
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
695+
696+
# Load base Docker image
697+
echo "Loading base Docker image..."
698+
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
699+
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
700+
echo "Loaded base image: $${BASE_IMAGE_TAG}"
701+
702+
# Tag and push the base image to ECR
703+
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
704+
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
705+
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
706+
707+
# Get GPU architectures from meta-data
708+
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
709+
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
710+
711+
# Build vLLM ROCm release image using cached base
712+
DOCKER_BUILDKIT=1 docker build \
713+
--build-arg max_jobs=16 \
714+
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
715+
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
716+
--build-arg USE_SCCACHE=1 \
717+
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
718+
--build-arg SCCACHE_REGION_NAME=us-west-2 \
719+
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
720+
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
721+
--target vllm-openai \
722+
--progress plain \
723+
-f docker/Dockerfile.rocm .
724+
725+
# Push to ECR
726+
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
727+
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
728+
env:
729+
DOCKER_BUILDKIT: "1"
730+
S3_BUCKET: "vllm-wheels"

.buildkite/scripts/annotate-release.sh

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ To download and upload the image:
3232
\`\`\`
3333
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
3434
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
35+
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
3536
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
3637
3738
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
@@ -46,11 +47,17 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
4647
docker push vllm/vllm-openai:latest-aarch64
4748
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
4849
49-
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
50-
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
51-
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
52-
docker push vllm/vllm-openai:latest-rocm
53-
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
50+
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
51+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
52+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
53+
docker push vllm/vllm-openai-rocm:latest-base
54+
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
55+
56+
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
57+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
58+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
59+
docker push vllm/vllm-openai-rocm:latest
60+
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
5461
5562
docker manifest rm vllm/vllm-openai:latest
5663
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64

.buildkite/scripts/annotate-rocm-release.sh

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,32 @@
33
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
44
#
55
# Generate Buildkite annotation for ROCm wheel release
6-
76
set -ex
87

98
# Get build configuration from meta-data
109
# Extract ROCm version dynamically from Dockerfile.rocm_base
11-
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
10+
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
1211
# Print the "major.minor" ROCm version taken from the first
# `ARG BASE_IMAGE=<image>:<major>.<minor>...` line of a Dockerfile.
# Arguments: $1 - path to the Dockerfile to inspect
# Outputs:   the version (e.g. "7.0") on stdout; nothing when the file is
#            missing or no BASE_IMAGE line carries a <major>.<minor> tag
extract_rocm_version() {
  # -n with the /p flag prints only lines where the substitution matched, so a
  # BASE_IMAGE line without a version tag is dropped instead of leaking through.
  sed -nE 's/^ARG BASE_IMAGE=.*:([0-9]+\.[0-9]+).*/\1/p' "$1" 2>/dev/null \
    | head -n 1
}
# The fallback is applied to the captured value rather than with `|| echo`:
# a pipeline's exit status is that of its last command, so `grep | sed || echo`
# never reaches the echo when grep finds nothing.
ROCM_VERSION=$(extract_rocm_version docker/Dockerfile.rocm_base) || true
ROCM_VERSION="${ROCM_VERSION:-unknown}"
1312
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
1413
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
1514

15+
# TODO: Enable the nightly build for ROCm
16+
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
17+
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
18+
if [ -z "${RELEASE_VERSION}" ]; then
19+
RELEASE_VERSION="1.0.0.dev"
20+
fi
21+
1622
# S3 URLs
1723
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
1824
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
19-
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
20-
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
25+
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
2126

27+
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
28+
# Map a ROCm version string to its wheel-variant directory name.
# Arguments: $1 - ROCm version, normally "major.minor" (e.g. "7.1")
# Outputs:   "rocm" followed by the version digits on stdout. A plain
#            major.minor version gets a trailing patch digit of 0
#            ("7.1" -> "rocm710", "7.0" -> "rocm700") so the result matches the
#            three-digit variant names used elsewhere in the release pipeline
#            (e.g. VARIANT "rocm700"). Any other input only has its dots removed.
rocm_version_path() {
  local version=$1
  local digits=${version//./}
  if [[ "${version}" =~ ^[0-9]+\.[0-9]+$ ]]; then
    digits="${digits}0"
  fi
  printf 'rocm%s' "${digits}"
}
ROCM_VERSION_PATH="$(rocm_version_path "${ROCM_VERSION:-}")"
29+
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
2230
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
23-
## :rocm: ROCm Wheel Release
24-
31+
## ROCm Wheel and Docker Image Releases
2532
### Build Configuration
2633
| Setting | Value |
2734
|---------|-------|
@@ -34,41 +41,72 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
3441
### :package: Installation
3542
3643
**Install from this build (by commit):**
44+
3745
\`\`\`bash
38-
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
46+
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
3947
40-
# Example:
41-
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
48+
# Example for ROCm ${ROCM_VERSION}:
49+
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
4250
\`\`\`
4351
4452
**Install from nightly (if published):**
53+
4554
\`\`\`bash
46-
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
55+
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
4756
\`\`\`
4857
4958
### :floppy_disk: Download Wheels Directly
5059
5160
\`\`\`bash
5261
# List all ROCm wheels
53-
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
54-
62+
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
5563
# Download specific wheels
56-
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
57-
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
58-
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
59-
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
60-
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
64+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
65+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
66+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
67+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
68+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
69+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
70+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
71+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
72+
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
6173
\`\`\`
6274
6375
### :gear: Included Packages
6476
- **vllm**: vLLM with ROCm support
6577
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
66-
- **triton_rocm**: Triton built for ROCm
78+
- **triton**: Triton
79+
- **triton-kernels**: Triton kernels
6780
- **torchvision**: TorchVision for ROCm PyTorch
81+
- **torchaudio**: Torchaudio for ROCm PyTorch
6882
- **amdsmi**: AMD SMI Python bindings
83+
- **aiter**: Aiter for ROCm
84+
- **flash-attn**: Flash Attention for ROCm
6985
7086
### :warning: Notes
7187
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
7288
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
7389
- Platform: Linux x86_64 only
90+
91+
### :package: Docker Image Release
92+
93+
To download and upload the image:
94+
95+
\`\`\`
96+
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
97+
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
98+
99+
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
100+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
101+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
102+
docker push vllm/vllm-openai-rocm:latest-base
103+
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
104+
105+
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
106+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
107+
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
108+
docker push vllm/vllm-openai-rocm:latest
109+
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
110+
\`\`\`
111+
74112
EOF

docker/Dockerfile.rocm

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
227227
# This ensures setuptools_scm sees clean repo state for version detection
228228
RUN --mount=type=bind,source=.git,target=vllm/.git \
229229
cd vllm \
230-
&& pip install setuptools_scm \
230+
&& pip install setuptools_scm regex \
231231
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
232232
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
233233
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
@@ -342,6 +342,19 @@ RUN mkdir src && mv vllm src/vllm
342342
FROM base AS final
343343

344344
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
345+
346+
# Clean up sccache from release image (not needed at runtime)
347+
# This removes the binary and wrappers that may have been installed during build
348+
RUN rm -f /usr/bin/sccache || true \
349+
&& rm -rf /opt/sccache-wrappers || true
350+
351+
# Clear sccache environment variables for the release image (ENV cannot unset a variable; each is overridden with an empty value)
352+
# This prevents S3 bucket config from leaking into production images
353+
ENV SCCACHE_BUCKET=
354+
ENV SCCACHE_REGION=
355+
ENV SCCACHE_S3_NO_CREDENTIALS=
356+
ENV SCCACHE_IDLE_TIMEOUT=
357+
345358
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
346359
# Manually remove it so that later steps of numpy upgrade can continue
347360
RUN case "$(which python3)" in \

0 commit comments

Comments
 (0)