Skip to content

Commit 9b5c9f9

Browse files
[CI/Build] AMD CI pipeline with extended set of tests. (#4267)
Co-authored-by: simon-mo <[email protected]>
1 parent 32881f3 commit 9b5c9f9

File tree

5 files changed

+67
-45
lines changed

5 files changed

+67
-45
lines changed

.buildkite/run-amd-test.sh

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
# This script build the ROCm docker image and run the API server inside the container.
2-
# It serves a sanity check for compilation and basic model usage.
1+
# This script builds the ROCm Docker image and runs tests inside it.
32
set -ex
43

54
# Print ROCm version
5+
echo "--- ROCm info"
66
rocminfo
77

8+
echo "--- Resetting GPUs"
89

910
echo "reset" > /opt/amdgpu/etc/gpu_state
1011

@@ -16,37 +17,28 @@ while true; do
1617
fi
1718
done
1819

20+
echo "--- Building container"
21+
sha=$(git rev-parse --short HEAD)
22+
container_name=rocm_${sha}
23+
docker build \
24+
-t ${container_name} \
25+
-f Dockerfile.rocm \
26+
--progress plain \
27+
.
28+
29+
remove_docker_container() {
30+
docker rm -f ${container_name} || docker image rm -f ${container_name} || true
31+
}
32+
trap remove_docker_container EXIT
1933

34+
echo "--- Running container"
2035

21-
# Try building the docker image
22-
docker build -t rocm -f Dockerfile.rocm .
23-
24-
# Setup cleanup
25-
remove_docker_container() { docker rm -f rocm || true; }
26-
trap remove_docker_container EXIT
27-
remove_docker_container
28-
29-
# Run the image
30-
export HIP_VISIBLE_DEVICES=1
31-
docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
32-
33-
# Wait for the server to start
34-
wait_for_server_to_start() {
35-
timeout=300
36-
counter=0
37-
38-
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
39-
sleep 1
40-
counter=$((counter + 1))
41-
if [ $counter -ge $timeout ]; then
42-
echo "Timeout after $timeout seconds"
43-
break
44-
fi
45-
done
46-
}
47-
wait_for_server_to_start
36+
docker run \
37+
--device /dev/kfd --device /dev/dri \
38+
--network host \
39+
--rm \
40+
-e HF_TOKEN \
41+
--name ${container_name} \
42+
${container_name} \
43+
/bin/bash -c "$(printf '%s\n' "$1" | sed -e "s/^'//" -e "s/'\$//")"  # strip the wrapping single quotes; keep quoted so the whole command string reaches `bash -c` as ONE argument
4844

49-
# Test a simple prompt
50-
curl -X POST -H "Content-Type: application/json" \
51-
localhost:8000/generate \
52-
-d '{"prompt": "San Francisco is a"}'

.buildkite/run-benchmarks.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
5353
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
5454
echo '```' >> benchmark_results.md
5555

56+
# if the agent binary is not found, skip uploading the results, exit 0
57+
if [ ! -f /workspace/buildkite-agent ]; then
58+
exit 0
59+
fi
60+
5661
# upload the results to buildkite
5762
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
5863

.buildkite/test-pipeline.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ steps:
2020
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
2121

2222
- label: Core Test
23+
mirror_hardwares: [amd]
2324
command: pytest -v -s core
2425

2526
- label: Distributed Comm Ops Test
@@ -29,7 +30,10 @@ steps:
2930

3031
- label: Distributed Tests
3132
working_dir: "/vllm-workspace/tests/distributed"
32-
num_gpus: 2
33+
34+
num_gpus: 2 # only support 1 or 2 for now.
35+
mirror_hardwares: [amd]
36+
3337
commands:
3438
- pytest -v -s test_pynccl_library.py
3539
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
@@ -44,6 +48,7 @@ steps:
4448
- pytest -v -s test_pynccl.py
4549

4650
- label: Engine Test
51+
mirror_hardwares: [amd]
4752
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
4853

4954
- label: Entrypoints Test
@@ -54,6 +59,7 @@ steps:
5459

5560
- label: Examples Test
5661
working_dir: "/vllm-workspace/examples"
62+
mirror_hardwares: [amd]
5763
commands:
5864
# install aws cli for llava_example.py
5965
- pip install awscli
@@ -67,29 +73,35 @@ steps:
6773
parallelism: 4
6874

6975
- label: Models Test
76+
mirror_hardwares: [amd]
7077
commands:
7178
- bash ../.buildkite/download-images.sh
7279
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
7380

7481
- label: Llava Test
82+
mirror_hardwares: [amd]
7583
commands:
7684
- bash ../.buildkite/download-images.sh
7785
- pytest -v -s models/test_llava.py
7886

7987
- label: Prefix Caching Test
88+
mirror_hardwares: [amd]
8089
commands:
8190
- pytest -v -s prefix_caching
8291

8392
- label: Samplers Test
8493
command: pytest -v -s samplers
8594

8695
- label: LogitsProcessor Test
96+
mirror_hardwares: [amd]
8797
command: pytest -v -s test_logits_processor.py
8898

8999
- label: Worker Test
100+
mirror_hardwares: [amd]
90101
command: pytest -v -s worker
91102

92103
- label: Speculative decoding tests
104+
mirror_hardwares: [amd]
93105
command: pytest -v -s spec_decode
94106

95107
- label: LoRA Test %N
@@ -107,6 +119,7 @@ steps:
107119

108120
- label: Benchmarks
109121
working_dir: "/vllm-workspace/.buildkite"
122+
mirror_hardwares: [amd]
110123
commands:
111124
- pip install aiohttp
112125
- bash run-benchmarks.sh

.buildkite/test-template.j2

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,29 @@ steps:
1616
limit: 5
1717
- wait
1818

19-
- label: "AMD Test"
20-
agents:
21-
queue: amd
22-
command: bash .buildkite/run-amd-test.sh
19+
- group: "AMD Tests"
20+
depends_on: ~
21+
steps:
22+
{% for step in steps %}
23+
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
24+
- label: "AMD: {{ step.label }}"
25+
agents:
26+
queue: amd
27+
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
28+
env:
29+
DOCKER_BUILDKIT: "1"
30+
{% endif %}
31+
{% endfor %}
2332

2433
- label: "Neuron Test"
34+
depends_on: ~
2535
agents:
2636
queue: neuron
2737
command: bash .buildkite/run-neuron-test.sh
2838
soft_fail: true
2939

30-
- label: "CPU Test"
40+
- label: "Intel Test"
41+
depends_on: ~
3142
command: bash .buildkite/run-cpu-test.sh
3243

3344
{% for step in steps %}

Dockerfile.rocm

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
4646

4747
### Mount Point ###
4848
# When launching the container, mount the code directory to /app
49-
ARG APP_MOUNT=/app
49+
ARG APP_MOUNT=/vllm-workspace
5050
VOLUME [ ${APP_MOUNT} ]
5151
WORKDIR ${APP_MOUNT}
5252

@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
8989
&& cd ../..; \
9090
fi
9191

92-
COPY ./ /app/vllm
92+
WORKDIR /vllm-workspace
93+
COPY . .
9394

9495
RUN python3 -m pip install --upgrade pip numba
9596

96-
RUN cd /app \
97-
&& cd vllm \
98-
&& pip install -U -r requirements-rocm.txt \
99-
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
97+
RUN --mount=type=cache,target=/root/.cache/pip \
98+
pip install -U -r requirements-rocm.txt \
99+
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
100100
&& python3 setup.py install \
101+
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
101102
&& cd ..
102103

103104
RUN python3 -m pip install --upgrade pip

0 commit comments

Comments
 (0)