-
Notifications
You must be signed in to change notification settings - Fork 392
[WIP][CI] Add multi node ci #2345
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash
# CI entry script: installs dependencies and runs multi-node tests inside
# the Ascend CI container. Invoked as: install_and_test.sh <header|worker> [master_addr]
set -euo pipefail

# Path where the workflow mounts the vllm-ascend checkout inside the container
# (must match WORKSPACE in the GitHub Actions workflow env).
export WORKSPACE="/home/workspace"
check_npu_info() {
    # Print NPU device status and the installed CANN toolkit version so the
    # CI log records the exact hardware/toolkit environment.
    npu-smi info
    cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
}
|
||
check_and_config() {
    # Point pip at fast mirrors: TUNA for PyPI, Huawei Cloud for Ascend wheels.
    local pypi_index="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
    pip config set global.index-url "$pypi_index"
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
}
|
||
install_sys_dependencies() {
    # Install OS-level build/runtime packages plus extra Python deps needed
    # by the test models.
    echo "====> Install system dependencies"
    cd "$WORKSPACE" || exit 1
    apt-get update -y
    # NOTE(review): the package list is read from /root/workspace, not
    # $WORKSPACE (/home/workspace) — confirm this path is intentional.
    # $(cat ...) is intentionally unquoted so each package name becomes a
    # separate argument.
    apt-get -y install $(cat /root/workspace/packages.txt)
    apt-get -y install gcc g++ cmake libnuma-dev iproute2
    # kimi-k2 dependency
    pip install blobfile
}
|
||
install_vllm() {
    # Install vLLM as an editable "empty device" build (Python-only, no CUDA
    # kernels), then install vllm-ascend on top of it.
    cd "$WORKSPACE/vllm-empty" || exit 1
    VLLM_TARGET_DEVICE=empty pip install -e .

    # install vllm-ascend from the checked-out workspace
    cd "$WORKSPACE" || exit 1
    pip install -e .
}
|
||
wait_for_server() {
    # Block until an HTTP endpoint responds, or fail after a timeout.
    # The original stub only printed a message and returned immediately,
    # which cannot actually gate on server readiness.
    #
    # $1: URL to poll      (default: header node's OpenAI-compatible endpoint)
    # $2: timeout seconds  (default: 600)
    # $3: poll interval    (default: 5)
    # Returns 0 once the URL answers, 1 on timeout.
    local url=${1:-"http://localhost:8004/v1/models"}
    local timeout=${2:-600}
    local interval=${3:-5}
    local elapsed=0
    echo "====> Waiting for server to start"
    while ! curl -sf --max-time 5 "$url" >/dev/null 2>&1; do
        if [ "$elapsed" -ge "$timeout" ]; then
            echo "====> Server did not respond within ${timeout}s" >&2
            return 1
        fi
        sleep "$interval"
        elapsed=$((elapsed + interval))
    done
    echo "====> Server is up"
}
|
||
main() {
    # $1: node role ("header" or "worker")
    # $2: optional master node IP, exported as MASTER_ADDR for worker nodes.
    local node_type=${1:?usage: $0 <header|worker> [master_addr]}
    if [ -n "${2:-}" ]; then
        export MASTER_ADDR="$2"
    fi
    check_npu_info
    check_and_config
    install_sys_dependencies
    install_vllm
    echo "====> Installation completed successfully"
    echo "====> Starting multi node tests"
    # test data parallel on mp backend (sourced so exported env vars apply)
    . "$WORKSPACE/examples/online_serving/multi_node_dp.sh" "$node_type"

    # test pipeline parallel on ray backend
    # NOTE(review): fixed sleep keeps this node alive while the peer runs;
    # replace with explicit cross-node synchronization when available.
    sleep 1000
}

main "$@"
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash
# Start the Ascend NPU-A3 CI container, replacing any stale instance first.
# Expects CLUSTER_SIZE, MASTER_IP (or MASTER_ADDR), GITHUB_WORKSPACE and
# WORKSPACE in the environment (set by the GitHub Actions workflow).

IMAGE_NAME="quay.nju.edu.cn/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11"
CONTAINER_NAME="ascend_ci_a3"

sudo docker pull "$IMAGE_NAME"

# Remove a leftover container from a previous run, stopping it first if running.
if docker ps -a --format '{{.Names}}' | grep -qw "^${CONTAINER_NAME}$"; then
    echo "Container '$CONTAINER_NAME' exists. Removing it..."

    if docker ps --format '{{.Names}}' | grep -qw "^${CONTAINER_NAME}$"; then
        echo "Stopping container '$CONTAINER_NAME'..."
        docker stop "$CONTAINER_NAME"
    fi

    docker rm "$CONTAINER_NAME"
    echo "Container '$CONTAINER_NAME' has been removed."
fi

echo "starting ascend NPU-A3 container"

# Run the container using the defined variables.
# FIX: '-e NAME:VALUE' is not valid docker syntax (env vars must use
# 'NAME=VALUE'); the original form left the variables unset in the container.
# NOTE(review): the workflow defines MASTER_IP, not MASTER_ADDR — falling
# back to MASTER_IP here; confirm which name is intended.
docker run -itd \
    --name "$CONTAINER_NAME" \
    --net=host \
    --device /dev/davinci0 \
    --device /dev/davinci1 \
    --device /dev/davinci2 \
    --device /dev/davinci3 \
    --device /dev/davinci4 \
    --device /dev/davinci5 \
    --device /dev/davinci6 \
    --device /dev/davinci7 \
    --device /dev/davinci8 \
    --device /dev/davinci9 \
    --device /dev/davinci10 \
    --device /dev/davinci11 \
    --device /dev/davinci12 \
    --device /dev/davinci13 \
    --device /dev/davinci14 \
    --device /dev/davinci15 \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
    -e CLUSTER_SIZE="${CLUSTER_SIZE:-}" \
    -e MASTER_ADDR="${MASTER_ADDR:-${MASTER_IP:-}}" \
    -v "${GITHUB_WORKSPACE}:${WORKSPACE}" \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
    -v /root/actions-runner/.cache:/root/actions-runner/.cache \
    -v /mnt/sfs_turbo/ascend-ci-share-nv-action-vllm-benchmarks:/root/.cache \
    "$IMAGE_NAME" bash

# Check if container started successfully
if [ $? -eq 0 ]; then
    echo "Container $CONTAINER_NAME start successfully"
else
    echo "Container $CONTAINER_NAME start failed, please check if the images exist or permission" >&2
    exit 1
fi
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: 'Multi-Node-Test'
# This workflow runs multi-node (2-node A3) tests for vllm-ascend.

on:
  workflow_dispatch:
    # Allow manual triggering of the workflow

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

jobs:
  get_header_node_ip:
    # This job is used to get the header node IP address.
    name: 'Get Header Node IP'
    runs-on: linux-aarch64-a3-node0
    outputs:
      header_ip: ${{ steps.get_header_node_ip.outputs.MASTER_IP }}
    steps:
      - name: Get header node IP
        id: get_header_node_ip
        run: |
          echo "MASTER_IP=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT

  test_multi_node:
    # Currently, we run multi-node tests only on: vllm==main, vllm-ascend==main.
    name: 'Multi-Node-Test / DP'
    needs: get_header_node_ip
    strategy:
      matrix:
        runner: [linux-aarch64-a3-node0, linux-aarch64-a3-node1]
    runs-on: ${{ matrix.runner }}
    env:
      CONTAINER_NAME: ascend_ci_a3
      WORKSPACE: /home/workspace
      CLUSTER_SIZE: 2
      MASTER_IP: ${{ needs.get_header_node_ip.outputs.header_ip }}
    steps:
      - name: Set config
        run: |
          git config --global --add safe.directory "$GITHUB_WORKSPACE"

      - name: check user
        run: |
          whoami

      - name: Checkout vllm-ascend
        uses: actions/checkout@v4
        with:
          repository: Potabk/vllm-ascend
          ref: multi_node_ci
          path: ./
          clean: true

      - name: Checkout vllm
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: main
          path: ./vllm-empty

      - name: Start container
        run: |
          bash .github/workflows/scripts/start_container.sh

      - name: Run multi-node test
        run: |
          SCRIPT_PATH="$WORKSPACE/.github/workflows/scripts/install_and_test.sh"
          if [ "${{ matrix.runner }}" == "linux-aarch64-a3-node0" ]; then
            docker exec -i $CONTAINER_NAME bash -lc "bash $SCRIPT_PATH header"
          else
            docker exec -i $CONTAINER_NAME bash -lc "bash $SCRIPT_PATH worker $MASTER_IP"
          fi

      - name: Docker post test cleanup
        if: always()
        run: |
          docker rm -f ascend_ci_a3 2>/dev/null || true
          sudo find "$GITHUB_WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,69 @@ | ||||||||||||||
#!/bin/bash
# Launch a data-parallel vLLM deployment across two nodes.
# Usage: multi_node_dp.sh <header|worker>
# Worker nodes additionally require MASTER_ADDR to be exported.

set -euo pipefail

run_node() {
    # $1: node role — "header" runs the DP coordinator; anything else runs
    #     a headless worker that connects to MASTER_ADDR.
    local node_type=${1:?usage: run_node <header|worker>}
    echo "====> Running $node_type"

    # NOTE(review): 'hostname -I' returns the first configured address; on a
    # multi-homed host this may not be the NIC reachable by the peer node —
    # confirm it matches the cluster network.
    local local_ip
    local_ip=$(hostname -I | awk '{print $1}')

    # Resolve the interface that owns local_ip so HCCL/GLOO bind correctly.
    local iface
    iface=$(ip -o -4 addr show | awk -v ip="$local_ip" '$4 ~ ip"/" {print $2}')

    export HCCL_IF_IP="$local_ip"
    export GLOO_SOCKET_IFNAME="$iface"
    export TP_SOCKET_IFNAME="$iface"
    export HCCL_SOCKET_IFNAME="$iface"
    export OMP_PROC_BIND=false
    export OMP_NUM_THREADS=100
    export VLLM_USE_V1=1
    export HCCL_BUFFSIZE=1024

    if [ "$node_type" == "header" ]; then
        echo "====> Running header node"
        vllm serve /root/.cache/weights/Kimi-K2-Instruct-W8A8 \
            --host 0.0.0.0 \
            --port 8004 \
            --data-parallel-size 4 \
            --api-server-count 2 \
            --data-parallel-size-local 2 \
            --data-parallel-address "$local_ip" \
            --data-parallel-rpc-port 13389 \
            --seed 1024 \
            --served-model-name kimi \
            --quantization ascend \
            --tensor-parallel-size 8 \
            --enable-expert-parallel \
            --max-num-seqs 16 \
            --max-model-len 32768 \
            --max-num-batched-tokens 4096 \
            --trust-remote-code \
            --no-enable-prefix-caching \
            --gpu-memory-utilization 0.9 \
            --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
    else
        echo "====> Running worker node"
        # Fail fast with a clear message instead of an opaque 'unbound
        # variable' error under set -u.
        : "${MASTER_ADDR:?MASTER_ADDR must be set for worker nodes}"
        vllm serve /root/.cache/weights/Kimi-K2-Instruct-W8A8 \
            --host 0.0.0.0 \
            --port 8004 \
            --headless \
            --data-parallel-size 4 \
            --data-parallel-size-local 2 \
            --data-parallel-start-rank 2 \
            --data-parallel-address "$MASTER_ADDR" \
            --data-parallel-rpc-port 13389 \
            --seed 1024 \
            --tensor-parallel-size 8 \
            --served-model-name kimi \
            --max-num-seqs 16 \
            --max-model-len 32768 \
            --quantization ascend \
            --max-num-batched-tokens 4096 \
            --enable-expert-parallel \
            --trust-remote-code \
            --no-enable-prefix-caching \
            --gpu-memory-utilization 0.92 \
            --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
    fi
}

run_node "$@"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The model path is hardcoded on lines 23 and 45, which limits the script's reusability. It would be better to accept the model path as a command-line argument (with validation that it is provided) and then reference the resulting variable, e.g. `$MODEL_PATH`, in both `vllm serve` commands on lines 23 and 45.