From ba2db86b8875f61286ba6dbecd69ef663ffe5df1 Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 13 Aug 2025 11:27:38 +0800
Subject: [PATCH 1/5] add multi-node ci

Signed-off-by: wangli
---
Review notes (text between "---" and the diffstat is not part of the commit):
 * This copy of the series had lost its line breaks and indentation; both are
   reconstructed here. Indentation inside the added files is a best-effort
   restoration.
 * start_container.sh: "docker run -e" takes NAME=VALUE. The NAME:VALUE form
   did not set CLUSTER_SIZE / MASTER_ADDR in the container. MASTER_ADDR now
   falls back to MASTER_IP, the name the workflow exports. The context lines
   of patch 3/5 are updated to match.
 * install_and_test.sh: packages.txt is read from $WORKSPACE (the mounted
   checkout) and a missing file now aborts the script; paths are quoted.
 * multi_node_dp.sh: the interface lookup compares the address literally and
   yields one name; workers fail early with a message when MASTER_ADDR is
   empty; the file now ends with a newline.
 * multi_node_ray.sh: quoting, $(...) instead of backticks, usage text.
 * Caveats: the author / sign-off e-mail addresses are absent from this copy,
   and the "index" blob ids are those of the original series, so they no
   longer match the edited files. Restore both before relying on "git am".

 .github/workflows/scripts/install_and_test.sh |  83 ++++++++++++
 .github/workflows/scripts/start_container.sh  |  63 ++++++++++
 .../vllm_ascend_multi_node_test.yaml          | 100 +++++++++++++++
 examples/online_serving/multi_node_dp.sh      |  82 ++++++++++++
 examples/online_serving/multi_node_ray.sh     | 119 ++++++++++++++++++
 5 files changed, 447 insertions(+)
 create mode 100644 .github/workflows/scripts/install_and_test.sh
 create mode 100644 .github/workflows/scripts/start_container.sh
 create mode 100644 .github/workflows/vllm_ascend_multi_node_test.yaml
 create mode 100644 examples/online_serving/multi_node_dp.sh
 create mode 100644 examples/online_serving/multi_node_ray.sh

diff --git a/.github/workflows/scripts/install_and_test.sh b/.github/workflows/scripts/install_and_test.sh
new file mode 100644
index 0000000000..d9690e76ef
--- /dev/null
+++ b/.github/workflows/scripts/install_and_test.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#
+# Runs inside the CI container. Installs system and Python dependencies,
+# installs vllm and vllm-ascend in editable mode, then sources the
+# multi-node data-parallel serving example for this node.
+#
+# Usage: install_and_test.sh <node_type> [master_addr]
+#   node_type    passed through to multi_node_dp.sh
+#   master_addr  optional; exported as MASTER_ADDR when non-empty
+
+set -euo pipefail
+export WORKSPACE="/home/workspace"
+
+# Print NPU status and the installed Ascend toolkit version file.
+check_npu_info() {
+    npu-smi info
+    cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+}
+
+# Point pip at the package mirrors used by this CI job.
+check_and_config() {
+    # config mirror
+    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
+}
+
+# Install the apt packages listed in packages.txt plus the build tools, and
+# the extra pip package.
+install_sys_dependencies() {
+    local sys_packages
+    echo "====> Install system dependencies"
+    cd "$WORKSPACE"
+    # install sys dependencies
+    apt-get update -y
+    # Read the list from the mounted checkout rather than /root/workspace,
+    # which start_container.sh does not mount. Assigning first lets `set -e`
+    # catch a missing file. NOTE(review): assumes packages.txt sits at the
+    # repository root -- confirm.
+    sys_packages=$(cat "$WORKSPACE/packages.txt")
+    # shellcheck disable=SC2086  # word splitting of the list is intended
+    apt-get -y install $sys_packages
+    apt-get -y install gcc g++ cmake libnuma-dev iproute2
+    # kimi-k2 dependency
+    pip install blobfile
+}
+
+# Install vllm (empty target device) and vllm-ascend as editable packages.
+install_vllm() {
+    # install vllm
+    cd "$WORKSPACE/vllm-empty"
+    VLLM_TARGET_DEVICE=empty pip install -e .
+
+    # install vllm-ascend
+    cd "$WORKSPACE"
+    pip install -e .
+}
+
+# Placeholder: only prints a message; no readiness check is implemented yet.
+wait_for_server() {
+    echo "====> Waiting for server to start"
+}
+
+# Entry point. $1: node type (required). $2: master address (optional).
+main() {
+    local node_type=${1:?"usage: install_and_test.sh <node_type> [master_addr]"}
+    if [ -n "${2:-}" ]; then
+        export MASTER_ADDR="$2"
+    fi
+    check_npu_info
+    check_and_config
+    install_sys_dependencies
+    install_vllm
+    echo "====> Installation completed successfully"
+    echo "====> Starting multi node tests"
+    # test data parallel on mp backend
+    . "$WORKSPACE/examples/online_serving/multi_node_dp.sh" "$node_type"
+
+    # TODO: pipeline parallel on the ray backend is not tested yet; this
+    # sleep is only a placeholder.
+    sleep 1000
+}
+
+main "$@"
diff --git a/.github/workflows/scripts/start_container.sh b/.github/workflows/scripts/start_container.sh
new file mode 100644
index 0000000000..25ec1616af
--- /dev/null
+++ b/.github/workflows/scripts/start_container.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+IMAGE_NAME="quay.nju.edu.cn/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11"
+sudo docker pull "$IMAGE_NAME"
+
+CONTAINER_NAME="ascend_ci_a3"
+
+if docker ps -a --format '{{.Names}}' | grep -qw "^${CONTAINER_NAME}$"; then
+    echo "Container '$CONTAINER_NAME' exists. Removing it..."
+
+    if docker ps --format '{{.Names}}' | grep -qw "^${CONTAINER_NAME}$"; then
+        echo "Stopping container '$CONTAINER_NAME'..."
+        docker stop "$CONTAINER_NAME"
+    fi
+
+    docker rm "$CONTAINER_NAME"
+    echo "Container '$CONTAINER_NAME' has been removed."
+fi
+
+echo "starting ascend NPU-A3 container"
+
+# Run the container; -e needs NAME=VALUE for the value to reach the container
+docker run -itd \
+    --name "$CONTAINER_NAME" \
+    --net=host \
+    --device /dev/davinci0 \
+    --device /dev/davinci1 \
+    --device /dev/davinci2 \
+    --device /dev/davinci3 \
+    --device /dev/davinci4 \
+    --device /dev/davinci5 \
+    --device /dev/davinci6 \
+    --device /dev/davinci7 \
+    --device /dev/davinci8 \
+    --device /dev/davinci9 \
+    --device /dev/davinci10 \
+    --device /dev/davinci11 \
+    --device /dev/davinci12 \
+    --device /dev/davinci13 \
+    --device /dev/davinci14 \
+    --device /dev/davinci15 \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -e CLUSTER_SIZE="$CLUSTER_SIZE" \
+    -e MASTER_ADDR="${MASTER_ADDR:-${MASTER_IP:-}}" \
+    -v $GITHUB_WORKSPACE:$WORKSPACE \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -v /root/actions-runner/.cache:/root/actions-runner/.cache \
+    -v /mnt/sfs_turbo/ascend-ci-share-nv-action-vllm-benchmarks:/root/.cache \
+    $IMAGE_NAME bash
+
+# Check whether the container started successfully
+if [ $? -eq 0 ]; then
+    echo "Container $CONTAINER_NAME start successfully"
+else
+    echo "Container $CONTAINER_NAME start failed, please check if the images exist or permission"
+    exit 1
+fi
diff --git a/.github/workflows/vllm_ascend_multi_node_test.yaml b/.github/workflows/vllm_ascend_multi_node_test.yaml
new file mode 100644
index 0000000000..f97bb31b30
--- /dev/null
+++ b/.github/workflows/vllm_ascend_multi_node_test.yaml
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: 'Multi-Node-Test'
+# This workflow runs the multi-node serving tests for vllm-ascend.
+
+on:
+  workflow_dispatch:
+  # Allow manual triggering of the workflow
+
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+jobs:
+  get_header_node_ip:
+    # This job is used to get the header node IP address.
+    name: 'Get Header Node IP'
+    runs-on: linux-aarch64-a3-node0
+    outputs:
+      header_ip: ${{ steps.get_header_node_ip.outputs.MASTER_IP }}
+    steps:
+      - name: Get header node IP
+        id: get_header_node_ip
+        run: |
+          echo "MASTER_IP=$(hostname -I | awk '{print $1}')" >> "$GITHUB_OUTPUT"
+  test_multi_node:
+    # Currently, we run multi-node tests only on: vllm==main, vllm-ascend==main.
+    name: 'Multi-Node-Test / DP'
+    needs: get_header_node_ip
+    strategy:
+      matrix:
+        runner: [linux-aarch64-a3-node0, linux-aarch64-a3-node1]
+    runs-on: ${{matrix.runner}}
+    env:
+      CONTAINER_NAME: ascend_ci_a3
+      WORKSPACE: /home/workspace
+      CLUSTER_SIZE: 2
+      MASTER_IP: ${{ needs.get_header_node_ip.outputs.header_ip }}
+    steps:
+
+      - name: Set config
+        run: |
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: check user
+        run: |
+          whoami
+
+      - name: Checkout vllm-ascend
+        uses: actions/checkout@v4
+        with:
+          repository: Potabk/vllm-ascend
+          ref: multi_node_ci
+          path: ./
+          clean: true
+
+      - name: Checkout vllm
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          ref: main
+          path: ./vllm-empty
+
+      - name: Start container
+        run: |
+          bash .github/workflows/scripts/start_container.sh
+
+      - name: Run multi-node test
+        run: |
+          SCRIPT_PATH="$WORKSPACE/.github/workflows/scripts/install_and_test.sh"
+          if [ "${{ matrix.runner }}" == "linux-aarch64-a3-node0" ]; then
+            docker exec -i "$CONTAINER_NAME" bash -lc "bash $SCRIPT_PATH header"
+          else
+            docker exec -i "$CONTAINER_NAME" bash -lc "bash $SCRIPT_PATH worker $MASTER_IP"
+          fi
+
+      - name: Docker post test cleanup
+        if: always()
+        run: |
+          docker rm -f ascend_ci_a3 2>/dev/null || true
+          sudo find "$GITHUB_WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +
diff --git a/examples/online_serving/multi_node_dp.sh b/examples/online_serving/multi_node_dp.sh
new file mode 100644
index 0000000000..4471bd7028
--- /dev/null
+++ b/examples/online_serving/multi_node_dp.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+#
+# Launches one node of a two-node data-parallel `vllm serve` deployment.
+#
+# Usage: multi_node_dp.sh <node_type>
+#   "header" starts the API-serving node; any other value starts a headless
+#   worker, which requires MASTER_ADDR (address of the header node) to be set.
+
+set -euo pipefail
+
+# Export the network/runtime settings and start `vllm serve` for this node.
+# $1: node type, see usage above.
+run_node() {
+    local node_type=${1:?"usage: multi_node_dp.sh <node_type>"}
+    local local_ip iface
+    echo "====> Running $node_type"
+
+    local_ip=$(hostname -I | awk '{print $1}')
+    # Pick the interface that owns local_ip. index() compares literally, so the
+    # dots in the address are not regex wildcards and a longer address that
+    # merely contains it (e.g. 110.0.0.1 vs 10.0.0.1) cannot match. Only the
+    # first match is printed so the result is a single interface name.
+    iface=$(ip -o -4 addr show | awk -v ip="$local_ip" '!found && index($4, ip "/") == 1 {print $2; found = 1}')
+
+    export HCCL_IF_IP=$local_ip
+    export GLOO_SOCKET_IFNAME=$iface
+    export TP_SOCKET_IFNAME=$iface
+    export HCCL_SOCKET_IFNAME=$iface
+    export OMP_PROC_BIND=false
+    export OMP_NUM_THREADS=100
+    export VLLM_USE_V1=1
+    export HCCL_BUFFSIZE=1024
+
+    if [ "$node_type" == "header" ]; then
+        echo "====> Running header node"
+        vllm serve /root/.cache/weights/Kimi-K2-Instruct-W8A8 \
+            --host 0.0.0.0 \
+            --port 8004 \
+            --data-parallel-size 4 \
+            --api-server-count 2 \
+            --data-parallel-size-local 2 \
+            --data-parallel-address "$local_ip" \
+            --data-parallel-rpc-port 13389 \
+            --seed 1024 \
+            --served-model-name kimi \
+            --quantization ascend \
+            --tensor-parallel-size 8 \
+            --enable-expert-parallel \
+            --max-num-seqs 16 \
+            --max-model-len 32768 \
+            --max-num-batched-tokens 4096 \
+            --trust-remote-code \
+            --no-enable-prefix-caching \
+            --gpu-memory-utilization 0.9 \
+            --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
+    else
+        echo "====> Running worker node"
+        vllm serve /root/.cache/weights/Kimi-K2-Instruct-W8A8 \
+            --host 0.0.0.0 \
+            --port 8004 \
+            --headless \
+            --data-parallel-size 4 \
+            --data-parallel-size-local 2 \
+            --data-parallel-start-rank 2 \
+            --data-parallel-address "${MASTER_ADDR:?MASTER_ADDR must be set on worker nodes}" \
+            --data-parallel-rpc-port 13389 \
+            --seed 1024 \
+            --tensor-parallel-size 8 \
+            --served-model-name kimi \
+            --max-num-seqs 16 \
+            --max-model-len 32768 \
+            --quantization ascend \
+            --max-num-batched-tokens 4096 \
+            --enable-expert-parallel \
+            --trust-remote-code \
+            --no-enable-prefix-caching \
+            --gpu-memory-utilization 0.92 \
+            --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
+    fi
+}
+
+run_node "$@"
diff --git a/examples/online_serving/multi_node_ray.sh b/examples/online_serving/multi_node_ray.sh
new file mode 100644
index 0000000000..e8ad8d3de5
--- /dev/null
+++ b/examples/online_serving/multi_node_ray.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+#
+# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
+# This script is first executed on the head node, and then on each worker node with the IP address
+# of the head node.
+#
+# Subcommands:
+#   leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
+#   worker: Starts a worker node that connects to an existing Ray head node.
+#
+# Example usage:
+# On the head node machine, start the Ray head node process and run a vLLM server.
+#   ./multi_node_ray.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray start args>] && \
+#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#
+# On each worker node, start the Ray worker node process.
+#   ./multi_node_ray.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray start args>]
+#
+# About Ray:
+# Ray is an open-source distributed execution framework that simplifies
+# distributed computing. Learn more:
+# https://ray.io/
+
+
+subcommand=$1  # Either "leader" or "worker".
+shift          # Remove the subcommand from the argument list.
+
+ray_port=6379              # Port used by the Ray head node.
+ray_init_timeout=300       # Seconds to wait before timing out.
+declare -a start_params=() # Parameters forwarded to the underlying 'ray start' command.
+
+# Handle the worker subcommand.
+case "$subcommand" in
+  worker)
+    ray_address=""
+    while [ $# -gt 0 ]; do
+      case "$1" in
+        --ray_address=*)
+          ray_address="${1#*=}"
+          ;;
+        --ray_port=*)
+          ray_port="${1#*=}"
+          ;;
+        --ray_init_timeout=*)
+          ray_init_timeout="${1#*=}"
+          ;;
+        *)
+          start_params+=("$1")
+      esac
+      shift
+    done
+
+    if [ -z "$ray_address" ]; then
+      echo "Error: Missing argument --ray_address"
+      exit 1
+    fi
+
+    # Retry until the worker node connects to the head node or the timeout expires.
+    for (( i=0; i < ray_init_timeout; i+=5 )); do
+      if ray start --address="$ray_address:$ray_port" --block \
+          "${start_params[@]}"; then
+        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
+        exit 0
+      fi
+      echo "Waiting until the ray worker is active..."
+      sleep 5s
+    done
+    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
+    exit 1
+    ;;
+
+  # Handle the leader subcommand.
+  leader)
+    ray_cluster_size=""
+    while [ $# -gt 0 ]; do
+      case "$1" in
+        --ray_port=*)
+          ray_port="${1#*=}"
+          ;;
+        --ray_cluster_size=*)
+          ray_cluster_size="${1#*=}"
+          ;;
+        --ray_init_timeout=*)
+          ray_init_timeout="${1#*=}"
+          ;;
+        *)
+          start_params+=("$1")
+      esac
+      shift
+    done
+
+    if [ -z "$ray_cluster_size" ]; then
+      echo "Error: Missing argument --ray_cluster_size"
+      exit 1
+    fi
+
+    # Start the Ray head node.
+    ray start --head --port="$ray_port" "${start_params[@]}"
+
+    # Poll Ray until every worker node is active; an empty count (python failed) is treated as 0.
+    for (( i=0; i < ray_init_timeout; i+=5 )); do
+      active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
+      if [ "${active_nodes:-0}" -eq "$ray_cluster_size" ]; then
+        echo "All ray workers are active and the ray cluster is initialized successfully."
+        exit 0
+      fi
+      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
+      sleep 5s
+    done
+
+    echo "Waiting for all ray workers to be active timed out."
+    exit 1
+    ;;
+
+  *)
+    echo "unknown subcommand: $subcommand"
+    exit 1
+    ;;
+esac

From 86154cc26118fafe689486d40e9d598053a3fc36 Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 13 Aug 2025 14:12:57 +0800
Subject: [PATCH 2/5] remove clean

Signed-off-by: wangli
---
 .github/workflows/vllm_ascend_multi_node_test.yaml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/.github/workflows/vllm_ascend_multi_node_test.yaml b/.github/workflows/vllm_ascend_multi_node_test.yaml
index f97bb31b30..6347aee927 100644
--- a/.github/workflows/vllm_ascend_multi_node_test.yaml
+++ b/.github/workflows/vllm_ascend_multi_node_test.yaml
@@ -61,17 +61,12 @@ jobs:
         run: |
           git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
-      - name: check user
-        run: |
-          whoami
-
       - name: Checkout vllm-ascend
         uses: actions/checkout@v4
         with:
           repository: Potabk/vllm-ascend
           ref: multi_node_ci
           path: ./
-          clean: true
 
       - name: Checkout vllm
         uses: actions/checkout@v4
@@ -97,4 +92,4 @@ jobs:
         if: always()
         run: |
           docker rm -f ascend_ci_a3 2>/dev/null || true
-          sudo find "$GITHUB_WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +
+          # sudo find "$GITHUB_WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +

From 4e7f8390ad071b184c2029e23dc252ab4cae5dc1 Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 13 Aug 2025 14:23:51 +0800
Subject: [PATCH 3/5] remove root files

Signed-off-by: wangli
---
Review notes:
 * The in-container cleanup is now best-effort ("|| true") so that a missing
   container cannot abort the step before "docker rm -f" runs, and
   "${WORKSPACE:?}" stops find from running on an empty path.

 .github/workflows/scripts/start_container.sh       | 3 ++-
 .github/workflows/vllm_ascend_multi_node_test.yaml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/scripts/start_container.sh b/.github/workflows/scripts/start_container.sh
index 25ec1616af..684d5fe023 100644
--- a/.github/workflows/scripts/start_container.sh
+++ b/.github/workflows/scripts/start_container.sh
@@ -43,7 +43,8 @@ docker run -itd \
     --device /dev/hisi_hdc \
     -e CLUSTER_SIZE="$CLUSTER_SIZE" \
     -e MASTER_ADDR="${MASTER_ADDR:-${MASTER_IP:-}}" \
-    -v $GITHUB_WORKSPACE:$WORKSPACE \
+    -e WORKSPACE="/home/workspace" \
+    -v $GITHUB_WORKSPACE:/home/workspace \
     -v /usr/local/dcmi:/usr/local/dcmi \
     -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
diff --git a/.github/workflows/vllm_ascend_multi_node_test.yaml b/.github/workflows/vllm_ascend_multi_node_test.yaml
index 6347aee927..3095cdeab9 100644
--- a/.github/workflows/vllm_ascend_multi_node_test.yaml
+++ b/.github/workflows/vllm_ascend_multi_node_test.yaml
@@ -91,5 +91,5 @@ jobs:
       - name: Docker post test cleanup
         if: always()
         run: |
+          docker exec -i "$CONTAINER_NAME" bash -lc 'find "${WORKSPACE:?}" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +' || true
           docker rm -f ascend_ci_a3 2>/dev/null || true
-          # sudo find "$GITHUB_WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +

From 6cbe2d68980aa457783e4a4e2dd6dfc88697ec1b Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 13 Aug 2025 14:26:41 +0800
Subject: [PATCH 4/5] fix turn

Signed-off-by: wangli
---
 .github/workflows/vllm_ascend_multi_node_test.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/vllm_ascend_multi_node_test.yaml b/.github/workflows/vllm_ascend_multi_node_test.yaml
index 3095cdeab9..4ea87a7e5a 100644
--- a/.github/workflows/vllm_ascend_multi_node_test.yaml
+++ b/.github/workflows/vllm_ascend_multi_node_test.yaml
@@ -61,6 +61,10 @@ jobs:
         run: |
           git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
+      - name: Start container
+        run: |
+          bash .github/workflows/scripts/start_container.sh
+
       - name: Checkout vllm-ascend
         uses: actions/checkout@v4
         with:
@@ -75,10 +79,6 @@ jobs:
           ref: main
           path: ./vllm-empty
 
-      - name: Start container
-        run: |
-          bash .github/workflows/scripts/start_container.sh
-
       - name: Run multi-node test
         run: |
           SCRIPT_PATH="$WORKSPACE/.github/workflows/scripts/install_and_test.sh"

From 55766b1ac831a1522d47a05e0991933d2aac25ff Mon Sep 17 00:00:00 2001
From: wangli
Date: Wed, 13 Aug 2025 14:29:26 +0800
Subject: [PATCH 5/5] Revert "fix turn"

This reverts commit 6cbe2d68980aa457783e4a4e2dd6dfc88697ec1b.

Signed-off-by: wangli
---
 .github/workflows/vllm_ascend_multi_node_test.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/vllm_ascend_multi_node_test.yaml b/.github/workflows/vllm_ascend_multi_node_test.yaml
index 4ea87a7e5a..3095cdeab9 100644
--- a/.github/workflows/vllm_ascend_multi_node_test.yaml
+++ b/.github/workflows/vllm_ascend_multi_node_test.yaml
@@ -61,10 +61,6 @@ jobs:
         run: |
           git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
-      - name: Start container
-        run: |
-          bash .github/workflows/scripts/start_container.sh
-
       - name: Checkout vllm-ascend
         uses: actions/checkout@v4
         with:
@@ -79,6 +75,10 @@ jobs:
           ref: main
           path: ./vllm-empty
 
+      - name: Start container
+        run: |
+          bash .github/workflows/scripts/start_container.sh
+
       - name: Run multi-node test
         run: |
           SCRIPT_PATH="$WORKSPACE/.github/workflows/scripts/install_and_test.sh"