Skip to content

Commit ba2db86

Browse files
committed
add multi-node ci
Signed-off-by: wangli <[email protected]>
1 parent 992271b commit ba2db86

File tree

5 files changed

+413
-0
lines changed

5 files changed

+413
-0
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/bin/bash
2+
3+
set -euo pipefail
4+
export WORKSPACE="/home/workspace"
5+
6+
# Print diagnostic info about this node's NPUs and CANN toolkit install.
# Outputs: device status (npu-smi) and toolkit version info to stdout.
check_npu_info() {
  # Device status as reported by the NPU driver.
  npu-smi info
  # Installed CANN toolkit version for this machine's architecture.
  local arch
  arch=$(uname -i)
  cat "/usr/local/Ascend/ascend-toolkit/latest/${arch}-linux/ascend_toolkit_install.info"
}
10+
11+
# Route pip through CN mirrors so package installs are fast inside CI.
# Side effects: writes pip config; exports PIP_EXTRA_INDEX_URL.
check_and_config() {
  # Primary index: Tsinghua PyPI mirror.
  local primary_index="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
  pip config set global.index-url "$primary_index"
  # Extra index: Huawei cloud repo that hosts Ascend-specific wheels.
  export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
}
17+
18+
# Install OS-level build/test dependencies inside the CI container.
# Globals: WORKSPACE (read) — the mounted CI workspace.
# Side effects: apt-get installs system packages; pip installs blobfile.
install_sys_dependencies() {
  echo "====> Install system dependencies"
  cd "$WORKSPACE"
  # install sys dependencies
  apt-get update -y
  # packages.txt lists one package per line; xargs avoids the deprecated
  # backtick-and-word-split form `cat …` used previously.
  xargs -a /root/workspace/packages.txt apt-get install -y
  apt-get install -y gcc g++ cmake libnuma-dev iproute2
  # kimi-k2 dependency
  pip install blobfile
}
28+
29+
# Build and install vllm (device-less target) and vllm-ascend from source.
# Globals: WORKSPACE (read) — expects vllm sources in $WORKSPACE/vllm-empty
# and the vllm-ascend checkout at $WORKSPACE itself.
install_vllm() {
  # install vllm; VLLM_TARGET_DEVICE=empty builds without a device backend
  # (vllm-ascend supplies the Ascend backend below).
  cd "$WORKSPACE/vllm-empty"
  VLLM_TARGET_DEVICE=empty pip install -e .

  # install vllm-ascend (editable, from the mounted workspace)
  cd "$WORKSPACE"
  pip install -e .
}
38+
39+
# Announce that we are waiting for the server to come up.
# NOTE(review): this is a stub — it prints a banner but does not actually
# poll the server; confirm whether the launched scripts do their own
# readiness checks.
wait_for_server() {
  printf '%s\n' "====> Waiting for server to start"
}
42+
43+
# Entry point.
# Arguments:
#   $1 - node role: "header" or "worker"
#   $2 - (optional) master/header node IP; exported as MASTER_ADDR for the
#        worker so it can join the data-parallel cluster.
main() {
  # ${1:?…} gives a clear usage error instead of set -u's cryptic
  # "unbound variable" when the role argument is missing.
  local node_type="${1:?usage: $0 <header|worker> [master_addr]}"
  if [ -n "${2:-}" ]; then
    export MASTER_ADDR="$2"
  fi
  check_npu_info
  check_and_config
  install_sys_dependencies
  install_vllm
  echo "====> Installation completed successfully"
  echo "====> Starting multi node tests"
  # test data parallel on mp backend (sourced so it inherits this shell's env)
  . "$WORKSPACE/examples/online_serving/multi_node_dp.sh" "$node_type"

  # test pipeline parallel on ray backend
  # TODO(review): replace this fixed sleep with a real wait/health-check loop.
  sleep 1000
}

main "$@"
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/bin/bash
# Start (or restart) the Ascend A3 CI container with all 16 davinci devices,
# the driver tooling, and the GitHub workspace mounted.
# Required env: GITHUB_WORKSPACE, WORKSPACE.
# Optional env: CLUSTER_SIZE, MASTER_ADDR (falls back to MASTER_IP, which is
# what the CI workflow exports).
set -euo pipefail

readonly IMAGE_NAME="quay.nju.edu.cn/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11"
readonly CONTAINER_NAME="ascend_ci_a3"

sudo docker pull "$IMAGE_NAME"

# Free the container name if a previous run left one behind.
# grep -Fx: exact, literal whole-line match on the container name.
if docker ps -a --format '{{.Names}}' | grep -qFx "$CONTAINER_NAME"; then
  echo "Container '$CONTAINER_NAME' exists. Removing it..."

  if docker ps --format '{{.Names}}' | grep -qFx "$CONTAINER_NAME"; then
    echo "Stopping container '$CONTAINER_NAME'..."
    docker stop "$CONTAINER_NAME"
  fi

  docker rm "$CONTAINER_NAME"
  echo "Container '$CONTAINER_NAME' has been removed."
fi

echo "starting ascend NPU-A3 container"

# All 16 NPU devices on an A3 node, built as an array to avoid 16 copy-pasted
# --device lines.
device_args=()
for i in {0..15}; do
  device_args+=(--device "/dev/davinci$i")
done

# NOTE: docker -e takes NAME=value. The previous '-e NAME:$NAME' form defined
# a variable literally named "NAME:value" inside the container, so the scripts
# never saw CLUSTER_SIZE / MASTER_ADDR.
if docker run -itd \
  --name "$CONTAINER_NAME" \
  --net=host \
  "${device_args[@]}" \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -e CLUSTER_SIZE="${CLUSTER_SIZE:-}" \
  -e MASTER_ADDR="${MASTER_ADDR:-${MASTER_IP:-}}" \
  -v "$GITHUB_WORKSPACE:$WORKSPACE" \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
  -v /etc/ascend_install.info:/etc/ascend_install.info \
  -v /root/actions-runner/.cache:/root/actions-runner/.cache \
  -v /mnt/sfs_turbo/ascend-ci-share-nv-action-vllm-benchmarks:/root/.cache \
  "$IMAGE_NAME" bash; then
  echo "Container $CONTAINER_NAME start successfully"
else
  echo "Container $CONTAINER_NAME start failed, please check if the images exist or permission"
  exit 1
fi
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: 'Multi-Node-Test'
# Multi-node (2 x A3 runner) CI: starts one container per node and runs the
# data-parallel vllm deployment across both of them.

on:
  workflow_dispatch:
    # Allow manual triggering of the workflow

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

jobs:
  get_header_node_ip:
    # This job is used to get the header node IP address.
    name: 'Get Header Node IP'
    runs-on: linux-aarch64-a3-node0
    outputs:
      header_ip: ${{ steps.get_header_node_ip.outputs.MASTER_IP }}
    steps:
      - name: Get header node IP
        id: get_header_node_ip
        run: |
          echo "MASTER_IP=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT

  test_multi_node:
    # Currently, we run multi-node tests only on: vllm==main, vllm-ascend==main.
    name: 'Multi-Node-Test / DP'
    needs: get_header_node_ip
    strategy:
      matrix:
        runner: [linux-aarch64-a3-node0, linux-aarch64-a3-node1]
    runs-on: ${{ matrix.runner }}
    env:
      CONTAINER_NAME: ascend_ci_a3
      WORKSPACE: /home/workspace
      CLUSTER_SIZE: 2
      MASTER_IP: ${{ needs.get_header_node_ip.outputs.header_ip }}
      # Same address under the name the container scripts actually read.
      MASTER_ADDR: ${{ needs.get_header_node_ip.outputs.header_ip }}
    steps:
      - name: Set config
        run: |
          git config --global --add safe.directory "$GITHUB_WORKSPACE"

      - name: check user
        run: |
          whoami

      - name: Checkout vllm-ascend
        uses: actions/checkout@v4
        with:
          # TODO(review): points at a personal fork/branch; switch to the
          # upstream vllm-project/vllm-ascend repo before merging.
          repository: Potabk/vllm-ascend
          ref: multi_node_ci
          path: ./
          clean: true

      - name: Checkout vllm
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: main
          path: ./vllm-empty

      - name: Start container
        run: |
          bash .github/workflows/scripts/start_container.sh

      - name: Run multi-node test
        run: |
          SCRIPT_PATH="$WORKSPACE/.github/workflows/scripts/install_and_test.sh"
          # node0 is the header; every other runner joins as a worker and
          # needs the header IP.
          if [ "${{ matrix.runner }}" == "linux-aarch64-a3-node0" ]; then
            docker exec -i $CONTAINER_NAME bash -lc "bash $SCRIPT_PATH header"
          else
            docker exec -i $CONTAINER_NAME bash -lc "bash $SCRIPT_PATH worker $MASTER_IP"
          fi

      - name: Docker post test cleanup
        if: always()
        run: |
          docker rm -f ascend_ci_a3 2>/dev/null || true
          sudo find "$GITHUB_WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
# Launch one node of the 2-node data-parallel vllm deployment.
# Usage: multi_node_dp.sh <header|worker>
# Env: worker nodes require MASTER_ADDR (header node IP) to be set.

set -euo pipefail

run_node() {
  # Clear usage error instead of set -u's "unbound variable" if $1 is missing.
  local node_type="${1:?usage: run_node <header|worker>}"
  echo "====> Running $node_type"

  # Resolve this host's primary IP and the NIC that owns it, then pin every
  # collective backend (HCCL/GLOO/TP) to that interface.
  local local_ip iface
  local_ip=$(hostname -I | awk '{print $1}')
  iface=$(ip -o -4 addr show | awk -v ip="$local_ip" '$4 ~ ip"/" {print $2}')

  export HCCL_IF_IP="$local_ip"
  export GLOO_SOCKET_IFNAME="$iface"
  export TP_SOCKET_IFNAME="$iface"
  export HCCL_SOCKET_IFNAME="$iface"
  export OMP_PROC_BIND=false
  export OMP_NUM_THREADS=100
  export VLLM_USE_V1=1
  export HCCL_BUFFSIZE=1024

  if [ "$node_type" == "header" ]; then
    echo "====> Running header node"
    vllm serve /root/.cache/weights/Kimi-K2-Instruct-W8A8 \
      --host 0.0.0.0 \
      --port 8004 \
      --data-parallel-size 4 \
      --api-server-count 2 \
      --data-parallel-size-local 2 \
      --data-parallel-address "$local_ip" \
      --data-parallel-rpc-port 13389 \
      --seed 1024 \
      --served-model-name kimi \
      --quantization ascend \
      --tensor-parallel-size 8 \
      --enable-expert-parallel \
      --max-num-seqs 16 \
      --max-model-len 32768 \
      --max-num-batched-tokens 4096 \
      --trust-remote-code \
      --no-enable-prefix-caching \
      --gpu-memory-utilization 0.9 \
      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
  else
    echo "====> Running worker node"
    # Fail fast with a clear message if the header address was not provided.
    : "${MASTER_ADDR:?MASTER_ADDR must be set for worker nodes}"
    vllm serve /root/.cache/weights/Kimi-K2-Instruct-W8A8 \
      --host 0.0.0.0 \
      --port 8004 \
      --headless \
      --data-parallel-size 4 \
      --data-parallel-size-local 2 \
      --data-parallel-start-rank 2 \
      --data-parallel-address "$MASTER_ADDR" \
      --data-parallel-rpc-port 13389 \
      --seed 1024 \
      --tensor-parallel-size 8 \
      --served-model-name kimi \
      --max-num-seqs 16 \
      --max-model-len 32768 \
      --quantization ascend \
      --max-num-batched-tokens 4096 \
      --enable-expert-parallel \
      --trust-remote-code \
      --no-enable-prefix-caching \
      --gpu-memory-utilization 0.92 \
      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
  fi
}

run_node "$@"

0 commit comments

Comments
 (0)