
Commit c9c47c5

Merge pull request #52 from heyselbi/sync-v0.10.0-rhoai-rocm

Sync v0.10.0 rhoai rocm

2 parents: 7c63a5c + 9651df2

File tree

937 files changed: +56302 -42553 lines


.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"
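
For orientation, the updated invocation maps directly onto lm_eval's Python API, the same simple_evaluate entry point that test_lm_eval_correctness.py below uses. A minimal sketch; the model name and the batch/limit/fewshot/TP values are assumptions standing in for what the script receives through its getopts flags:

# Sketch only: mirrors the shell invocation above via lm_eval's Python API.
import lm_eval

# Hypothetical values; the script takes these from getopts "m:b:l:f:t:".
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
TP_SIZE, BATCH_SIZE, LIMIT, FEWSHOT = 1, 32, 250, 5

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        f"pretrained={MODEL},tensor_parallel_size={TP_SIZE},"
        "add_bos_token=true,trust_remote_code=true,max_model_len=4096"
    ),
    tasks=["gsm8k"],
    num_fewshot=FEWSHOT,
    limit=LIMIT,
    batch_size=BATCH_SIZE,
)
print(results["results"]["gsm8k"])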

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 3 additions & 1 deletion
@@ -18,12 +18,14 @@
 
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
+    max_model_len = eval_config.get("max_model_len", 4096)
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code}"
+        f"trust_remote_code={trust_remote_code},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
         model="vllm",

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 0 additions & 1 deletion
@@ -108,7 +108,6 @@ fi
 if [[ $commands == *" kernels/attention"* ]]; then
   commands="${commands} \
   --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_blocksparse_attention.py \
   --ignore=kernels/attention/test_encoder_decoder_attn.py \
   --ignore=kernels/attention/test_flash_attn.py \
   --ignore=kernels/attention/test_flashinfer.py \

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 25 additions & 25 deletions
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+# used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
@@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -48,10 +49,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disable until supports V1
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,33 +69,26 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
+  # Note: disable it until supports V1
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    VLLM_USE_V1=0 pytest -s -v \
-      tests/quantization/test_ipex_quant.py"
-
-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-      tests/basic_correctness/test_chunked_prefill.py"
+  # docker exec cpu-test-"$NUMA_NODE" bash -c "
+  #   set -e
+  #   VLLM_USE_V1=0 pytest -s -v \
+  #     tests/quantization/test_ipex_quant.py"
 
   # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
     set -e
-    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
-    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    python3 benchmarks/benchmark_serving.py \
       --backend vllm \
       --dataset-name random \
-      --model facebook/opt-125m \
+      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
-      --endpoint /v1/completions \
-      --tokenizer facebook/opt-125m"
+      --endpoint /v1/completions'
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
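
The online-serving test now launches vllm serve with TP=2/PP=2 and polls /v1/models before driving load. A minimal Python sketch of the same readiness-then-request pattern against the OpenAI-compatible endpoints, assuming the server from the script is already listening on localhost:8000:

# Poll until the server answers /v1/models, then send one completion
# (the same pattern the script expresses with `until curl ...` followed
# by benchmarks/benchmark_serving.py).
import json
import time
import urllib.request

BASE = "http://localhost:8000"

deadline = time.time() + 600  # mirror the script's 600 s timeout
while time.time() < deadline:
    try:
        urllib.request.urlopen(f"{BASE}/v1/models", timeout=5)
        break
    except OSError:
        time.sleep(1)
else:
    raise SystemExit("server never became ready")

req = urllib.request.Request(
    f"{BASE}/v1/completions",
    data=json.dumps({
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "prompt": "Hello",  # illustrative prompt
        "max_tokens": 16,
    }).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.loads(urllib.request.urlopen(req).read())["choices"][0]["text"])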

.buildkite/scripts/hardware_ci/run-hpu-test.sh

Lines changed: 3 additions & 5 deletions
@@ -6,19 +6,17 @@ set -exuo pipefail
 
 # Try building the docker image
 cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
-FROM 1.22-413-pt2.7.1:latest
+FROM gaudi-base-image:latest
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
-RUN pip install -v -r requirements/hpu.txt
-RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
-
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 
-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+RUN VLLM_TARGET_DEVICE=empty pip install .
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
 
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 4 additions & 3 deletions
@@ -62,15 +62,16 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
-tpu-info
+# tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
@@ -150,7 +151,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+  "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
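
hf-transfer only takes effect once huggingface_hub is told to use it, and the HF_HUB_DISABLE_XET=1 prefix on test 11 steers downloads off the Xet backend. A small sketch of forcing the same download path from Python; the model name is hypothetical:

# Both switches are plain environment variables and must be set before
# huggingface_hub starts downloading.
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # opt in to hf-transfer
os.environ["HF_HUB_DISABLE_XET"] = "1"         # same switch as test 11

from huggingface_hub import snapshot_download

# Hypothetical model id; the struct-output test pulls its own models.
path = snapshot_download("Qwen/Qwen2.5-1.5B-Instruct")
print("cached at", path)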

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 15 additions & 4 deletions
@@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
 docker build -t ${image_name} -f docker/Dockerfile.xpu .
 
 # Setup cleanup
-remove_docker_container() { 
-  docker rm -f "${container_name}" || true; 
+remove_docker_container() {
+  docker rm -f "${container_name}" || true;
   docker image rm -f "${image_name}" || true;
   docker system prune -f || true;
 }
@@ -26,7 +26,18 @@ docker run \
   --name "${container_name}" \
   "${image_name}" \
   sh -c '
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+  cd tests
+  pytest -v -s v1/core
+  pytest -v -s v1/engine
+  pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
+  pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
+  pytest -v -s v1/structured_output
+  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
+  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
+  pytest -v -s v1/test_serial_utils.py
+  pytest -v -s v1/test_utils.py
+  pytest -v -s v1/test_metrics_reader.py
 '
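
generate.py wraps vLLM's offline LLM entry point, so the two new invocations exercise tensor parallelism under both the Ray and multiprocessing executors. A rough Python equivalent of the mp variant; the prompt and sampling values are illustrative:

# Rough equivalent of:
#   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py \
#     --model facebook/opt-125m --block-size 64 --enforce-eager \
#     -tp 2 --distributed-executor-backend mp
import os

os.environ["VLLM_USE_V1"] = "1"  # set before vllm is imported

from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",
    block_size=64,
    enforce_eager=True,
    tensor_parallel_size=2,
    distributed_executor_backend="mp",
)
outputs = llm.generate(
    ["The capital of France is"],  # illustrative prompt
    SamplingParams(temperature=0.8, max_tokens=32),
)
print(outputs[0].outputs[0].text)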

.buildkite/scripts/tpu/docker_run_bm.sh

Lines changed: 0 additions & 10 deletions
@@ -22,16 +22,6 @@ trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 
-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
-  --build-arg max_jobs=16 \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg GIT_REPO_CHECK=0 \
-  --tag vllm/vllm-tpu-bm \
-  --progress plain -f docker/Dockerfile.tpu .
-
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"
