Skip to content

Commit 93d7a4d

Browse files
authored
Merge pull request ROCm#575 from ROCm/upstream_merge_2025_06_09
Upstream merge 2025 06 09
2 parents cdfe72b + 6ec2533 commit 93d7a4d

File tree

155 files changed

+6430
-1766
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

155 files changed

+6430
-1766
lines changed

.buildkite/scripts/ci-clean-log.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# Usage: ./ci_clean_log.sh ci.log
3+
# This script strips timestamps and color codes from CI log files.
4+
5+
# Check if argument is given
6+
if [ $# -lt 1 ]; then
7+
echo "Usage: $0 ci.log"
8+
exit 1
9+
fi
10+
11+
INPUT_FILE="$1"
12+
13+
# Strip timestamps
14+
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
15+
16+
# Strip colorization
17+
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ set -ex
77
# Setup cleanup
88
remove_docker_container() {
99
if [[ -n "$container_id" ]]; then
10+
podman stop --all -t0
1011
podman rm -f "$container_id" || true
1112
fi
1213
podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
3738
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
3839
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
3940
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
40-
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
41+
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
4142
}
4243

4344
# All of CPU tests are expected to be finished less than 40 mins.

.buildkite/scripts/rerun-test.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# Repeatedly run a single pytest target until it fails, announcing each rerun.
# Useful for reproducing flaky tests.
#
# Usage: ./rerun_test.sh path/to/test.py::test_name

# Require the test-selector argument.
if [ $# -lt 1 ]; then
    echo "Usage: $0 path/to/test.py::test_name"
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

TEST=$1
COUNT=1

# Keep rerunning while pytest succeeds; the loop ends on the first failure.
while pytest -sv "$TEST"; do
    COUNT=$(( COUNT + 1 ))
    echo "RUN NUMBER ${COUNT}"
done
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
#!/bin/bash

set -euo pipefail

# Locate Docker's storage root so we can check the right filesystem.
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"

# Current usage (percent, numeric) of the filesystem holding the Docker root.
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

# Cleanup trigger threshold, in percent.
threshold=70

if [ "$disk_usage" -gt "$threshold" ]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Environment config
2+
TEST_NAME=llama8b
3+
CONTAINER_NAME=vllm-tpu
4+
5+
# vllm config
6+
MODEL=meta-llama/Llama-3.1-8B-Instruct
7+
MAX_NUM_SEQS=512
8+
MAX_NUM_BATCHED_TOKENS=512
9+
TENSOR_PARALLEL_SIZE=1
10+
MAX_MODEL_LEN=2048
11+
DOWNLOAD_DIR=/mnt/disks/persist
12+
EXPECTED_THROUGHPUT=8.0
13+
INPUT_LEN=1800
14+
OUTPUT_LEN=128
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
#!/bin/bash
# Build the vllm TPU image, run the benchmark inside a container configured by
# the given env file, copy the logs back, and assert the measured throughput
# meets EXPECTED_THROUGHPUT.
#
# Usage: <script> path/to/env_file

if [ ! -f "$1" ]; then
  echo "Error: The env file '$1' does not exist."
  exit 1 # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# Quoted to survive paths containing spaces.
source "$ENV_FILE"

# Remove any containers left over from a previous run (best-effort).
remove_docker_container() {
  docker rm -f tpu-test || true;
  docker rm -f vllm-tpu || true;
  docker rm -f "$CONTAINER_NAME" || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
 --build-arg max_jobs=16 \
 --build-arg USE_SCCACHE=1 \
 --build-arg GIT_REPO_CHECK=0 \
 --tag vllm/vllm-tpu-bm \
 --progress plain -f docker/Dockerfile.tpu .

LOG_ROOT=$(mktemp -d)
# `set -e` is not enabled in this script, so check mktemp's result explicitly.
if [ -z "$LOG_ROOT" ]; then
  echo "Error: failed to create a temporary log directory."
  exit 1
fi
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
  echo "Error: HF_TOKEN is not set or is empty."
  exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If no mounted drive, just create a folder."
  exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
# Expansions are quoted so paths/names with spaces don't word-split.
docker run \
 -v "$DOWNLOAD_DIR:$DOWNLOAD_DIR" \
 --env-file "$ENV_FILE" \
 -e HF_TOKEN="$HF_TOKEN" \
 -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
 -e MODEL="$MODEL" \
 -e WORKSPACE=/workspace \
 --name "$CONTAINER_NAME" \
 -d \
 --privileged \
 --network host \
 -v /dev/shm:/dev/shm \
 vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

# Extract the numeric throughput from the benchmark log line
# "Request throughput (req/s): <value>".
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "$VLLM_LOG"
  buildkite-agent artifact upload "$BM_LOG"
else
  echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput"
  exit 1
fi

if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
  exit 1
fi

.buildkite/scripts/tpu/run_bm.sh

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/bin/bash
# Run the serving benchmark inside the container: start `vllm serve` in the
# background, wait until it reports startup (or fail on error/timeout), then
# run benchmark_serving.py and report the measured throughput.

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

# When a target commit is provided, refuse to benchmark a different checkout.
if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
mkdir "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x (the sonnet dataset concatenated four times)
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "launching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve "$MODEL" \
 --seed 42 \
 --disable-log-requests \
 --max-num-seqs "$MAX_NUM_SEQS" \
 --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
 --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
 --no-enable-prefix-caching \
 --download_dir "$DOWNLOAD_DIR" \
 --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &

# Poll the server log: up to 120 x 10s (20 minutes) for startup, failing fast
# on a logged RuntimeError.
started=0
for _ in {1..120}; do
  # TODO: detect other type of errors.
  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
    echo "Detected RuntimeError, exiting."
    exit 1
  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
    echo "Application started"
    started=1
    break
  else
    echo "wait for 10 seconds..."
    sleep 10
  fi
done

# Previously the script fell through and ran the benchmark even when the
# server never came up; fail explicitly instead.
if [ "$started" -ne 1 ]; then
  echo "Error: vllm did not start within the timeout; see $VLLM_LOG"
  exit 1
fi

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
 --backend vllm \
 --model "$MODEL" \
 --dataset-name sonnet \
 --dataset-path benchmarks/sonnet_4x.txt \
 --sonnet-input-len "$INPUT_LEN" \
 --sonnet-output-len "$OUTPUT_LEN" \
 --ignore-eos > "$BM_LOG"

echo "completed..."
echo

# Extract the numeric throughput from the benchmark log line
# "Request throughput (req/s): <value>".
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,9 @@ steps:
424424
- vllm/model_executor/layers/quantization
425425
- tests/quantization
426426
commands:
427+
# temporary install here since we need nightly, will move to requirements/test.in
428+
# after torchao 0.12 release
429+
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
427430
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
428431

429432
- label: LM Eval Small Models # 53min

.github/mergify.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,20 @@ pull_request_rules:
3636
add:
3737
- frontend
3838

39+
- name: label-llama
40+
description: Automatically apply llama label
41+
conditions:
42+
- or:
43+
- files~=^examples/.*llama.*\.py
44+
- files~=^tests/.*llama.*\.py
45+
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
46+
- files~=^vllm/model_executor/models/.*llama.*\.py
47+
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
48+
actions:
49+
label:
50+
add:
51+
- llama
52+
3953
- name: label-multi-modality
4054
description: Automatically apply multi-modality label
4155
conditions:

CMakeLists.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
308308
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
309309
# are not supported by Machete yet.
310310
# 9.0 for latest bf16 atomicAdd PTX
311-
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
311+
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
312312
if (MARLIN_ARCHS)
313313

314314
#
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
454454
# kernels for the remaining archs that are not already built for 3x.
455455
# (Build 8.9 for FP8)
456456
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
457-
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
457+
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
458458
# subtract out the archs that are already built for 3x
459459
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
460460
if (SCALED_MM_2X_ARCHS)
@@ -543,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
543543
# CUTLASS MoE kernels
544544

545545
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
546-
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
547-
# to compile MoE kernels that use its output.
546+
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
547+
# if it's possible to compile MoE kernels that use its output.
548548
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
549549
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
550550
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
684684

685685
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
686686
# 9.0 for latest bf16 atomicAdd PTX
687-
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
687+
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
688688
if (MARLIN_MOE_ARCHS)
689689

690690
#

0 commit comments

Comments
 (0)