Skip to content

Commit e51de38

Browse files
authored
[Platform][CI] Added OOT platform interface e2e test that running on Ascend NPU (vllm-project#25470)
Signed-off-by: leo-pony <[email protected]>
1 parent cc253b7 commit e51de38

File tree

1 file changed

+191
-0
lines changed

1 file changed

+191
-0
lines changed
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#!/bin/bash

# This script builds the Ascend NPU docker image and runs the Out-Of-Tree (OOT)
# platform interface e2e tests inside the container.
# It serves as a sanity check for compilation and basic model usage.
#
# -e: abort on the first failing command; -x: trace every command to stderr.
set -ex

# Where the test run configuration lives. The base docker image name
# (BASE_IMAGE_NAME) is not hard-coded here: it is read from this configuration
# file, which is fetched from the vllm-ascend repository.
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
# Path of the configuration file inside the vllm-ascend repository.
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
# Local name the configuration file is copied to (relative to the current directory).
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
# Temporary clone directory; assigned while the configuration file is being fetched.
VLLM_ASCEND_TMP_DIR=
#######################################
# Get the test run configuration file from the vllm-ascend repository.
# Shallow-clones the repository into a temporary directory, copies the
# configuration file into the current directory and removes the clone.
# Globals:
#   VLLM_ASCEND_REPO        (read)    repository URL to clone
#   CONFIG_FILE_REMOTE_PATH (read)    path of the file inside the repository
#   TEST_RUN_CONFIG_FILE    (read)    destination path of the local copy
#   VLLM_ASCEND_TMP_DIR     (written) temporary clone directory
# Outputs:
#   Progress message on stdout, errors on stderr.
# Exits:
#   With status 1 if the configuration file is not present in the clone.
#######################################
fetch_vllm_test_cfg() {
  VLLM_ASCEND_TMP_DIR=$(mktemp -d)
  # Ensure that the temporary directory is removed if the script exits while
  # the configuration file is being retrieved.
  cleanup() {
    rm -rf "${VLLM_ASCEND_TMP_DIR}"
  }
  trap cleanup EXIT

  GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
  if [[ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]]; then
    echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the repository" >&2
    exit 1
  fi

  # If the file already exists locally, it is simply overwritten.
  cp -- "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
  echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"

  # The EXIT trap only has to cover failures during retrieval. Delete the
  # temporary clone explicitly and clear the trap so that the caller can
  # install its own EXIT trap afterwards.
  rm -rf "${VLLM_ASCEND_TMP_DIR}"
  trap - EXIT
}

#######################################
# Loads the test run configuration into the current script environment.
# The file is NOT downloaded here; it must already exist locally.
# Globals:
#   TEST_RUN_CONFIG_FILE (read)    path of the configuration file to source
#   BASE_IMAGE_NAME      (written) expected to be defined by the configuration
# Outputs:
#   The resolved base image name on stdout, errors on stderr.
# Returns:
#   0 on success.
# Exits:
#   With status 1 if the file is missing or BASE_IMAGE_NAME is empty afterwards.
#######################################
get_config() {
  if [[ ! -f "${TEST_RUN_CONFIG_FILE}" ]]; then
    echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist locally" >&2
    exit 1
  fi

  # `source` looks a slash-less name up in PATH before the current directory;
  # anchor such names to the current directory so the file checked above is
  # the file that gets loaded.
  local cfg_path="${TEST_RUN_CONFIG_FILE}"
  if [[ "${cfg_path}" != */* ]]; then
    cfg_path="./${cfg_path}"
  fi
  # shellcheck source=/dev/null
  source "${cfg_path}"

  # Fail early with a clear message instead of letting the image build fail
  # later on an empty base image name.
  if [[ -z "${BASE_IMAGE_NAME:-}" ]]; then
    echo "Error: BASE_IMAGE_NAME is not set by '${TEST_RUN_CONFIG_FILE}'" >&2
    exit 1
  fi
  echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
  return 0
}

# Get the test run configuration.
# No explicit status check follows: `set -e` is active and both helpers
# terminate the script themselves when something goes wrong.
fetch_vllm_test_cfg
get_config

# Unique image tag and container name for this run.
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
# Suffix: 10 random alphanumeric characters.
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards,
# so the agent index is the second-to-last '-'-separated field.
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
# Per-agent buildx builder and local layer-cache directory.
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}"

# Try building the docker image.
# The Dockerfile is passed on stdin (-f -). The heredoc delimiter is unquoted,
# so ${...} and $(...) inside it are expanded by THIS shell on the build host
# before docker sees the text (e.g. BASE_IMAGE_NAME, PYPI_CACHE_PORT and
# `uname -i`); only the backslash-escaped \$ references are left for the image
# build itself. Keep `$` and backticks out of comments inside the heredoc.
DOCKER_BUILDKIT=1 docker build \
  --add-host "cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST}" \
  --builder "${builder_name}" --cache-from "type=local,src=${builder_cache_dir}" \
  --cache-to "type=local,dest=${builder_cache_dir},mode=max" \
  --progress=plain --load -t "${image_name}" -f - . <<EOF
FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
    apt-get update -y && \
    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

# Install pytest in its own layer so that the docker build cache layer stays valid.
# The requirement is quoted: unquoted, the shell would parse >=6.0 as an output
# redirection and silently drop the version constraint.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install "pytest>=6.0" modelscope

WORKDIR /workspace/vllm

# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements/common.txt

COPY . .

# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton

# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend

# Install the vllm-ascend dependencies in a separate layer before installing vllm-ascend itself.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r /workspace/vllm-ascend/requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True

WORKDIR /workspace/vllm-ascend

CMD ["/bin/bash"]

EOF

# Setup cleanup
#######################################
# Remove the test container and the image built for this run, then prune
# unused docker resources.
# Each step is best-effort (`|| true`): a failing step neither stops the
# remaining steps nor trips `set -e`.
# Globals:
#   container_name (read), image_name (read)
#######################################
remove_docker_container() {
  docker rm -f "${container_name}" || true
  docker image rm -f "${image_name}" || true
  docker system prune -f || true
}
# Run the cleanup on every exit path of the script from here on.
trap remove_docker_container EXIT

#######################################
# Generate the `--device` arguments for this agent from BUILDKITE_AGENT_NAME.
# Ascend NPU BUILDKITE_AGENT_NAME format is
# {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001
# host and it has 2 NPU cards, which yields
#   --device /dev/davinci0 --device /dev/davinci1
# Agent N owns the consecutive device range starting at (N - 1) * npu_card_num.
# Arguments:
#   $1 - agent name to parse
# Outputs:
#   The space-separated --device arguments on stdout (no trailing newline);
#   an error message on stderr when the name cannot be parsed.
# Returns:
#   0 on success, 1 if the name does not match the format or agent_idx < 1.
#######################################
parse_and_gen_devices() {
  local input="$1"
  local index cards_num
  if [[ "${input}" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
    # 10# forces base 10, so zero-padded fields such as "08" are not rejected
    # as invalid octal constants.
    index=$((10#${BASH_REMATCH[1]}))
    cards_num=$((10#${BASH_REMATCH[2]}))
  else
    echo "parse error" >&2
    return 1
  fi

  # agent_idx starts from 1; index 0 would produce negative device numbers.
  if ((index < 1)); then
    echo "parse error: agent index must be >= 1" >&2
    return 1
  fi

  local -a device_args=()
  local i=0 dev_idx
  while ((i < cards_num)); do
    dev_idx=$(((index - 1) * cards_num + i))
    device_args+=(--device "/dev/davinci${dev_idx}")
    # Plain assignment instead of ((i++)): the latter returns status 1 when
    # i is 0 and would abort the function whenever errexit is in effect.
    i=$((i + 1))
  done

  # Join with single spaces regardless of the caller's IFS.
  local IFS=' '
  printf '%s' "${device_args[*]}"
}

devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Split the space-separated argument string into an array so that it can be
# passed to docker quoted, without relying on unquoted word splitting/globbing.
read -r -a device_args <<< "${devices}"

# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
# Per-agent host directory mounted into the container as the modelscope cache.
model_cache_dir="/mnt/modelscope${agent_idx}"
mkdir -p "${model_cache_dir}"
docker run \
  "${device_args[@]}" \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
  -v /etc/ascend_install.info:/etc/ascend_install.info \
  -v "${model_cache_dir}:/root/.cache/modelscope" \
  --entrypoint="" \
  --name "${container_name}" \
  "${image_name}" \
  bash -c '
    set -e
    pytest -v -s tests/e2e/vllm_interface/
'

0 commit comments

Comments
 (0)