|
| 1 | +# Multi-Node-DP (Qwen3-VL-235B-A22B) |
| 2 | + |
| 3 | +## Verify Multi-Node Communication Environment |
| 4 | + |
| 5 | +Refer to the verification process described in [multi_node.md](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html#verification-process).
| 6 | + |
| 7 | +## Run with docker |
| 8 | +Assume you have two Atlas 800 A3 (64G*16) nodes (or 2 * A2), and want to deploy the `Qwen3-VL-235B-A22B-Instruct` model across multiple nodes.
| 9 | + |
| 10 | +```{code-block} bash |
| 11 | + :substitutions: |
| 12 | +# Update the vllm-ascend image |
| 13 | +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| |
| 14 | +docker run --rm \ |
| 15 | +--name vllm-ascend \ |
| 16 | +--net=host \ |
| 17 | +--device /dev/davinci0 \ |
| 18 | +--device /dev/davinci1 \ |
| 19 | +--device /dev/davinci2 \ |
| 20 | +--device /dev/davinci3 \ |
| 21 | +--device /dev/davinci4 \ |
| 22 | +--device /dev/davinci5 \ |
| 23 | +--device /dev/davinci6 \ |
| 24 | +--device /dev/davinci7 \ |
| 25 | +--device /dev/davinci8 \ |
| 26 | +--device /dev/davinci9 \ |
| 27 | +--device /dev/davinci10 \ |
| 28 | +--device /dev/davinci11 \ |
| 29 | +--device /dev/davinci12 \ |
| 30 | +--device /dev/davinci13 \ |
| 31 | +--device /dev/davinci14 \ |
| 32 | +--device /dev/davinci15 \ |
| 33 | +--device /dev/davinci_manager \ |
| 34 | +--device /dev/devmm_svm \ |
| 35 | +--device /dev/hisi_hdc \ |
| 36 | +-v /usr/local/dcmi:/usr/local/dcmi \ |
| 37 | +-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \ |
| 38 | +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ |
| 39 | +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ |
| 40 | +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ |
| 41 | +-v /etc/ascend_install.info:/etc/ascend_install.info \ |
| 42 | +-v /root/.cache:/root/.cache \ |
| 43 | +-p 8000:8000 \ |
| 44 | +-it $IMAGE bash |
| 45 | +``` |
| 46 | + |
| 47 | +Run the following scripts on the two nodes respectively.
| 48 | + |
| 49 | +:::{note} |
| 50 | +Before launching the inference server, ensure the following environment variables are set for multi-node communication.
| 51 | +::: |
| 52 | + |
| 53 | +node0 |
| 54 | + |
| 55 | +```shell |
| 56 | +#!/bin/sh |
| 57 | +# local_ip is obtained via ifconfig
| 58 | +# nic_name is the network interface name corresponding to local_ip
| 59 | +nic_name="xxxx" |
| 60 | +local_ip="xxxx" |
| 61 | + |
| 62 | +export HCCL_IF_IP=$local_ip |
| 63 | +export GLOO_SOCKET_IFNAME=$nic_name |
| 64 | +export TP_SOCKET_IFNAME=$nic_name |
| 65 | +export HCCL_SOCKET_IFNAME=$nic_name |
| 66 | +export OMP_PROC_BIND=false |
| 67 | +export OMP_NUM_THREADS=100 |
| 68 | +export VLLM_USE_V1=1 |
| 69 | +export HCCL_BUFFSIZE=1024 |
| 70 | + |
| 71 | +vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \ |
| 72 | +--host 0.0.0.0 \ |
| 73 | +--port 8000 \ |
| 74 | +--data-parallel-size 2 \ |
| 75 | +--api-server-count 2 \ |
| 76 | +--data-parallel-size-local 1 \ |
| 77 | +--data-parallel-address $local_ip \ |
| 78 | +--data-parallel-rpc-port 13389 \ |
| 79 | +--seed 1024 \ |
| 80 | +--served-model-name qwen3vl \ |
| 81 | +--tensor-parallel-size 8 \ |
| 82 | +--enable-expert-parallel \ |
| 83 | +--max-num-seqs 16 \ |
| 84 | +--max-model-len 32768 \ |
| 85 | +--max-num-batched-tokens 4096 \ |
| 86 | +--trust-remote-code \ |
| 87 | +--no-enable-prefix-caching \ |
| 88 | +--gpu-memory-utilization 0.8
| 89 | +``` |
| 90 | + |
| 91 | +node1 |
| 92 | + |
| 93 | +```shell |
| 94 | +#!/bin/sh |
| 95 | + |
| 96 | +nic_name="xxxx" |
| 97 | +local_ip="xxxx" |
| 98 | +node0_ip="xxxx" |
| 99 | + |
| 100 | +export HCCL_IF_IP=$local_ip |
| 101 | +export GLOO_SOCKET_IFNAME=$nic_name |
| 102 | +export TP_SOCKET_IFNAME=$nic_name |
| 103 | +export HCCL_SOCKET_IFNAME=$nic_name |
| 104 | +export OMP_PROC_BIND=false |
| 105 | +export OMP_NUM_THREADS=100 |
| 106 | +export VLLM_USE_V1=1 |
| 107 | +export HCCL_BUFFSIZE=1024 |
| 108 | + |
| 109 | +vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \ |
| 110 | +--host 0.0.0.0 \ |
| 111 | +--port 8000 \ |
| 112 | +--headless \ |
| 113 | +--data-parallel-size 2 \ |
| 114 | +--data-parallel-size-local 1 \ |
| 115 | +--data-parallel-start-rank 1 \ |
| 116 | +--data-parallel-address $node0_ip \ |
| 117 | +--data-parallel-rpc-port 13389 \ |
| 118 | +--seed 1024 \ |
| 119 | +--tensor-parallel-size 8 \ |
| 120 | +--served-model-name qwen3vl \ |
| 121 | +--max-num-seqs 16 \ |
| 122 | +--max-model-len 32768 \ |
| 123 | +--max-num-batched-tokens 4096 \ |
| 124 | +--enable-expert-parallel \ |
| 125 | +--trust-remote-code \ |
| 126 | +--no-enable-prefix-caching \ |
| 127 | +--gpu-memory-utilization 0.8
| 128 | +``` |
| 129 | + |
| 130 | +If the service starts successfully, the following information will be displayed on node0: |
| 131 | + |
| 132 | +```shell |
| 133 | +INFO: Started server process [44610] |
| 134 | +INFO: Waiting for application startup. |
| 135 | +INFO: Application startup complete. |
| 136 | +INFO: Started server process [44611] |
| 137 | +INFO: Waiting for application startup. |
| 138 | +INFO: Application startup complete. |
| 139 | +``` |
| 140 | + |
| 141 | +Once your server is started, you can query the model with input prompts: |
| 142 | + |
| 143 | +```shell |
| 144 | +curl http://localhost:8000/v1/chat/completions \ |
| 145 | + -H "Content-Type: application/json" \ |
| 146 | + -d '{ |
| 147 | + "model": "qwen3vl", |
| 148 | + "messages": [ |
| 149 | + {"role": "system", "content": "You are a helpful assistant."}, |
| 150 | + {"role": "user", "content": [ |
| 151 | + {"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}}, |
| 152 | +            {"type": "text", "text": "What is the text in the illustration?"}
| 153 | + ]} |
| 154 | + ] |
| 155 | + }' |
| 156 | +``` |
0 commit comments