# Hybrid-Node (DeepSeek)

## Verify Multi-Node Communication Environment

### Physical Layer Requirements:

- The physical machines must be located on the same LAN, with full network connectivity between them.
- All NPUs must be connected via optical modules, and the connection status must be normal. A quick sanity check is sketched after this list.

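
Before running the detailed checks below, it can help to confirm that all 16 NPUs on each node are visible and healthy. `npu-smi info` ships with the Ascend driver; the optical-module query is shown only as an assumption, since the `hccn_tool` options available depend on your driver version.

```bash
# Confirm that all 16 NPUs are visible and report a healthy status
npu-smi info

# Optional: query per-NIC optical module information
# (assumes your hccn_tool build supports the -optical option; skip it if not)
for i in {0..15}; do hccn_tool -i $i -optical -g; done
```
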
### Verification Process:

Execute the following commands on each node in sequence. Every result must be `success` and every link status must be `UP`:

```bash
# Check the remote switch ports
for i in {0..15}; do hccn_tool -i $i -lldp -g | grep Ifname; done
# Get the link status of the Ethernet ports (UP or DOWN)
for i in {0..15}; do hccn_tool -i $i -link -g; done
# Check the network health status
for i in {0..15}; do hccn_tool -i $i -net_health -g; done
# View the network detected IP configuration
for i in {0..15}; do hccn_tool -i $i -netdetect -g; done
# View the gateway configuration
for i in {0..15}; do hccn_tool -i $i -gateway -g; done
# View the NPU network configuration
cat /etc/hccn.conf
```
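
For reference, `/etc/hccn.conf` holds one group of entries per NPU NIC. The excerpt below is only an illustrative sketch with placeholder addresses; the exact fields present (netmask, gateway, detect address, and so on) depend on how the NICs were configured.

```
# Illustrative /etc/hccn.conf excerpt (placeholder addresses; two of the sixteen entries shown)
address_0=10.20.0.10
netdetect_0=10.20.0.1
address_1=10.20.0.11
netdetect_1=10.20.0.1
```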

### NPU Interconnect Verification:

#### 1. Get NPU IP Addresses

```bash
for i in {0..15}; do hccn_tool -i $i -ip -g | grep ipaddr; done
```

#### 2. Cross-Node PING Test

```bash
# Run on one node, pinging an NPU IP of the peer node (replace 10.20.0.20 with an address obtained in step 1)
hccn_tool -i 0 -ping -g address 10.20.0.20
```
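
To exercise every link at once, the same command can be looped over each local NPU against every peer address. The addresses in `PEER_IPS` are placeholders; replace them with the values gathered on the other node.

```bash
# Placeholder list: NPU IPs of the peer node, gathered in step 1
PEER_IPS="10.20.0.20 10.20.0.21"

# Ping every peer address from every local NPU NIC
for i in {0..15}; do
  for ip in $PEER_IPS; do
    hccn_tool -i $i -ping -g address $ip
  done
done
```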

## Run with Docker

Assume you have two Atlas 800 A3 (64G*16) nodes and want to deploy the `deepseek-v3.1-w8a8` quantized model across them.

```{code-block} bash
   :substitutions:
# Update the vllm-ascend image
export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version|
export NAME=vllm-ascend

# Run the container using the defined variables
# Note: if you run the container on a Docker bridge network, expose the ports needed for multi-node communication in advance
docker run --rm \
--name $NAME \
--net=host \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci8 \
--device /dev/davinci9 \
--device /dev/davinci10 \
--device /dev/davinci11 \
--device /dev/davinci12 \
--device /dev/davinci13 \
--device /dev/davinci14 \
--device /dev/davinci15 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /mnt/sfs_turbo/.cache:/root/.cache \
-it $IMAGE bash
```
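
Once inside the container, it is worth confirming that the devices and the mounted driver tooling are accessible before launching the server. Both paths used below are mounted by the `docker run` command above.

```bash
# Inside the container: verify that the driver tooling sees all 16 NPUs
npu-smi info
# Verify the mounted driver version information is readable
cat /usr/local/Ascend/driver/version.info
```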

:::::{tab-set}
::::{tab-item} DeepSeek-V3.1-BF16

Run the following scripts on the two nodes respectively.

:::{note}
Before launching the inference server, ensure the following environment variables are set for multi-node communication.
:::

**node0**

```shell
#!/bin/sh

# Obtain these values via ifconfig:
# nic_name is the host network interface name, and local_ip is the IP address assigned to that interface
nic_name="xxxx"
local_ip="xxxx"

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024

vllm serve unsloth/DeepSeek-V3.1-BF16 \
--host 0.0.0.0 \
--port 8004 \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-address $local_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 16 \
--seed 1024 \
--served-model-name deepseek_v3.1 \
--enable-expert-parallel \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 32768 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
```

**node1**

```shell
#!/bin/sh

nic_name="xxx"
local_ip="xxx"

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024

vllm serve unsloth/DeepSeek-V3.1-BF16 \
--host 0.0.0.0 \
--port 8004 \
--headless \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-start-rank 1 \
--data-parallel-address <node0_ip> \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 16 \
--seed 1024 \
--served-model-name deepseek_v3.1 \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 32768 \
--enable-expert-parallel \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
```
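
:::{note}
With `--data-parallel-size 2`, `--data-parallel-size-local 1`, and `--tensor-parallel-size 16`, each node hosts one data-parallel replica that spans its 16 NPUs, so the two scripts together use 2 x 16 = 32 NPUs. node1 runs `--headless` and attaches to node0 through `--data-parallel-address` and `--data-parallel-rpc-port`, so all API requests should be sent to node0.
:::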

::::

::::{tab-item} DeepSeek-V3.1-W8A8

```shell
#!/bin/sh

vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
--host 0.0.0.0 \
--port 8004 \
--tensor-parallel-size 16 \
--seed 1024 \
--quantization ascend \
--served-model-name deepseek_v3.1 \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 32768 \
--enable-expert-parallel \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
```

::::
:::::

Once your server is started, you can query the model with input prompts:

```shell
curl http://<node0_ip>:<port>/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek_v3.1",
    "prompt": "The future of AI is",
    "max_tokens": 50,
    "temperature": 0
  }'
```
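
Requests go to node0's API server (`--port 8004` in the scripts above, so `<port>` is 8004 unless you changed it). Listing the served models is a simple readiness check, and the chat completions endpoint is an equivalent way to query the same server:

```shell
# Confirm the server is up and the model is registered
curl http://<node0_ip>:<port>/v1/models

# Query the model through the OpenAI-compatible chat completions API
curl http://<node0_ip>:<port>/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek_v3.1",
    "messages": [{"role": "user", "content": "The future of AI is"}],
    "max_tokens": 50,
    "temperature": 0
  }'
```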