Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions .github/workflows/scripts/install_and_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash

set -euo pipefail
export WORKSPACE="/home/workspace"

check_npu_info() {
    # Print NPU device status and the installed CANN toolkit build info.
    npu-smi info
    local arch
    arch=$(uname -i)
    cat "/usr/local/Ascend/ascend-toolkit/latest/${arch}-linux/ascend_toolkit_install.info"
}

check_and_config() {
    # Point pip at the Tsinghua PyPI mirror and register the Huawei Cloud
    # Ascend repo as an extra index for NPU-specific wheels.
    pip config set global.index-url "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
    export PIP_EXTRA_INDEX_URL="https://mirrors.huaweicloud.com/ascend/repos/pypi"
}

install_sys_dependencies() {
    # Install OS-level build/runtime dependencies needed to compile and run
    # vllm / vllm-ascend inside the CI container.
    echo "====> Install system dependencies"
    cd "$WORKSPACE"
    apt-get update -y
    # Package list baked into the CI image.
    # NOTE(review): path is /root/workspace, not $WORKSPACE (/home/workspace) —
    # confirm both mounts exist in the container, or unify on $WORKSPACE.
    # shellcheck disable=SC2046 -- word splitting intended: one package per word
    apt-get install -y $(cat /root/workspace/packages.txt)
    apt-get install -y gcc g++ cmake libnuma-dev iproute2
    # kimi-k2 dependency: weights are fetched with blobfile
    pip install blobfile
}

install_vllm() {
    # Editable-install vLLM (device-agnostic "empty" target) followed by
    # vllm-ascend, so the checked-out sources are used directly.
    cd "$WORKSPACE/vllm-empty"
    VLLM_TARGET_DEVICE=empty pip install -e .

    # install vllm-ascend from the workspace checkout
    cd "$WORKSPACE"
    pip install -e .
}

wait_for_server() {
    # Block until the vLLM HTTP server answers its health endpoint, or fail
    # after a timeout. The original stub only printed a message and returned
    # immediately without waiting at all.
    # $1 - base URL (default http://localhost:8004)
    # $2 - timeout in seconds (default 600)
    # Returns 0 when the server is ready, 1 on timeout.
    local url="${1:-http://localhost:8004}"
    local timeout="${2:-600}"
    local start now
    echo "====> Waiting for server to start"
    start=$(date +%s)
    until curl -sf "${url}/health" >/dev/null 2>&1; do
        now=$(date +%s)
        if [ $((now - start)) -ge "$timeout" ]; then
            echo "Error: server at ${url} not ready within ${timeout}s" >&2
            return 1
        fi
        sleep 5
    done
    echo "====> Server is ready"
}

main() {
    # Entry point.
    # $1 - node role: "header" or "worker" (required)
    # $2 - master node IP address (required for workers, exported as MASTER_ADDR)
    local node_type="${1:?Usage: $0 <header|worker> [master_addr]}"
    if [ -n "${2:-}" ]; then
        export MASTER_ADDR="$2"
    fi
    check_npu_info
    check_and_config
    install_sys_dependencies
    install_vllm
    echo "====> Installation completed successfully"
    echo "====> Starting multi node tests"
    # test data parallel on mp backend (sourced so it shares this environment)
    . "$WORKSPACE/examples/online_serving/multi_node_dp.sh" "$node_type"

    # test pipeline parallel on ray backend
    # NOTE(review): fixed sleep keeps this node's container alive while the
    # other node drives the test — replace with explicit cross-node sync.
    sleep 1000
}

main "$@"

64 changes: 64 additions & 0 deletions .github/workflows/scripts/start_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash
# Pull the CANN base image and (re)start the Ascend A3 CI container with all
# 16 davinci NPU devices and the host driver/tooling paths mounted through.
set -euo pipefail

IMAGE_NAME="quay.nju.edu.cn/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11"
CONTAINER_NAME="ascend_ci_a3"

sudo docker pull "$IMAGE_NAME"

# Remove any leftover container from a previous run (stop it first if running).
if docker ps -a --format '{{.Names}}' | grep -qw "^${CONTAINER_NAME}$"; then
    echo "Container '$CONTAINER_NAME' exists. Removing it..."

    if docker ps --format '{{.Names}}' | grep -qw "^${CONTAINER_NAME}$"; then
        echo "Stopping container '$CONTAINER_NAME'..."
        docker stop "$CONTAINER_NAME"
    fi

    docker rm "$CONTAINER_NAME"
    echo "Container '$CONTAINER_NAME' has been removed."
fi

echo "starting ascend NPU-A3 container"

# All 16 NPU devices of the A3 node.
device_args=()
for i in {0..15}; do
    device_args+=(--device "/dev/davinci$i")
done

# BUG FIX: docker's env flag is `-e NAME=value`; the original used
# `-e NAME:$VALUE`, which never set the variables inside the container.
# NOTE(review): the workflow exports MASTER_IP (not MASTER_ADDR) — confirm
# which name the in-container scripts actually read.
if docker run -itd \
    --name "$CONTAINER_NAME" \
    --net=host \
    "${device_args[@]}" \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
    -e CLUSTER_SIZE="${CLUSTER_SIZE:-}" \
    -e MASTER_ADDR="${MASTER_ADDR:-}" \
    -e WORKSPACE="/home/workspace" \
    -v "$GITHUB_WORKSPACE":/home/workspace \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
    -v /root/actions-runner/.cache:/root/actions-runner/.cache \
    -v /mnt/sfs_turbo/ascend-ci-share-nv-action-vllm-benchmarks:/root/.cache \
    "$IMAGE_NAME" bash; then
    echo "Container $CONTAINER_NAME start successfully"
else
    echo "Container $CONTAINER_NAME start failed, please check if the images exist or permission"
    exit 1
fi
95 changes: 95 additions & 0 deletions .github/workflows/vllm_ascend_multi_node_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: 'Multi-Node-Test'
# Manually-triggered multi-node (2 x A3 runners) data-parallel test for
# vllm-ascend against vllm main.

on:
  workflow_dispatch:
    # Allow manual triggering of the workflow


# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

jobs:
  get_header_node_ip:
    # This job is used to get the header node IP address, which workers use
    # to reach the data-parallel master.
    name: 'Get Header Node IP'
    # NOTE(review): linux-aarch64-a3-node* are custom self-hosted labels; add
    # them to actionlint.yaml so the pre-commit actionlint check passes.
    runs-on: linux-aarch64-a3-node0
    outputs:
      header_ip: ${{ steps.get_header_node_ip.outputs.MASTER_IP }}
    steps:
      - name: Get header node IP
        id: get_header_node_ip
        run: |
          echo "MASTER_IP=$(hostname -I | awk '{print $1}')" >> $GITHUB_OUTPUT

  test_multi_node:
    # Currently, we run multi-node tests only on: vllm==main, vllm-ascend==main.
    name: 'Multi-Node-Test / DP'
    needs: get_header_node_ip
    strategy:
      matrix:
        runner: [linux-aarch64-a3-node0, linux-aarch64-a3-node1]
    runs-on: ${{ matrix.runner }}
    env:
      CONTAINER_NAME: ascend_ci_a3
      WORKSPACE: /home/workspace
      CLUSTER_SIZE: 2
      MASTER_IP: ${{ needs.get_header_node_ip.outputs.header_ip }}
    steps:
      - name: Set config
        run: |
          git config --global --add safe.directory "$GITHUB_WORKSPACE"

      - name: Checkout vllm-ascend
        uses: actions/checkout@v4
        with:
          repository: Potabk/vllm-ascend
          ref: multi_node_ci
          path: ./

      - name: Checkout vllm
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: main
          path: ./vllm-empty

      - name: Start container
        run: |
          bash .github/workflows/scripts/start_container.sh

      - name: Run multi-node test
        run: |
          SCRIPT_PATH="$WORKSPACE/.github/workflows/scripts/install_and_test.sh"
          if [ "${{ matrix.runner }}" == "linux-aarch64-a3-node0" ]; then
            docker exec -i $CONTAINER_NAME bash -lc "bash $SCRIPT_PATH header"
          else
            docker exec -i $CONTAINER_NAME bash -lc "bash $SCRIPT_PATH worker $MASTER_IP"
          fi

      - name: Docker post test cleanup
        if: always()
        run: |
          docker exec -i $CONTAINER_NAME bash -lc 'find "$WORKSPACE" -mindepth 1 -maxdepth 1 -xdev -exec rm -rf {} +'
          docker rm -f ascend_ci_a3 2>/dev/null || true
69 changes: 69 additions & 0 deletions examples/online_serving/multi_node_dp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# Launch one node of a 2-node vLLM data-parallel deployment on Ascend NPUs.
# Sourced by install_and_test.sh with the node role as $1.

set -euo pipefail

run_node() {
    # $1 - node role: "header" (rank 0, runs the API servers) or "worker" (headless)
    # $2 - (optional) model path; defaults to the CI weight cache location,
    #      so existing single-argument callers keep working
    # Env: MASTER_ADDR - header node IP; required when role is "worker".
    local node_type="${1:?node role (header|worker) is required}"
    local model_path="${2:-/root/.cache/weights/Kimi-K2-Instruct-W8A8}"
    echo "====> Running $node_type"

    # Determine this node's primary IP and the interface carrying it; both are
    # needed so HCCL/GLOO sockets bind to the right NIC.
    local local_ip
    local_ip=$(hostname -I | awk '{print $1}')
    if [ -z "$local_ip" ]; then
        echo "Error: Could not determine local IP address." >&2
        exit 1
    fi
    local iface
    iface=$(ip -o -4 addr show | awk -v ip="$local_ip" '$4 ~ ip"/" {print $2}')

    export HCCL_IF_IP=$local_ip
    export GLOO_SOCKET_IFNAME=$iface
    export TP_SOCKET_IFNAME=$iface
    export HCCL_SOCKET_IFNAME=$iface
    export OMP_PROC_BIND=false
    export OMP_NUM_THREADS=100
    export VLLM_USE_V1=1
    export HCCL_BUFFSIZE=1024

    if [ "$node_type" == "header" ]; then
        echo "====> Running header node"
        vllm serve "$model_path" \
            --host 0.0.0.0 \
            --port 8004 \
            --data-parallel-size 4 \
            --api-server-count 2 \
            --data-parallel-size-local 2 \
            --data-parallel-address "$local_ip" \
            --data-parallel-rpc-port 13389 \
            --seed 1024 \
            --served-model-name kimi \
            --quantization ascend \
            --tensor-parallel-size 8 \
            --enable-expert-parallel \
            --max-num-seqs 16 \
            --max-model-len 32768 \
            --max-num-batched-tokens 4096 \
            --trust-remote-code \
            --no-enable-prefix-caching \
            --gpu-memory-utilization 0.9 \
            --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
    else
        # BUG FIX: the worker branch dereferenced $MASTER_ADDR without
        # checking it; under `set -u` that aborts with an opaque error.
        if [ -z "${MASTER_ADDR:-}" ]; then
            echo "Error: MASTER_ADDR environment variable must be set for worker node." >&2
            exit 1
        fi
        echo "====> Running worker node"
        vllm serve "$model_path" \
            --host 0.0.0.0 \
            --port 8004 \
            --headless \
            --data-parallel-size 4 \
            --data-parallel-size-local 2 \
            --data-parallel-start-rank 2 \
            --data-parallel-address "$MASTER_ADDR" \
            --data-parallel-rpc-port 13389 \
            --seed 1024 \
            --tensor-parallel-size 8 \
            --served-model-name kimi \
            --max-num-seqs 16 \
            --max-model-len 32768 \
            --quantization ascend \
            --max-num-batched-tokens 4096 \
            --enable-expert-parallel \
            --trust-remote-code \
            --no-enable-prefix-caching \
            --gpu-memory-utilization 0.92 \
            --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
    fi
}

run_node "$@"
Loading
Loading