# Multi-Node-DP (Kimi-K2)

## Verify Multi-Node Communication Environment

Refer to the [verification process in multi_node.md](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html#verification-process) to confirm the nodes can reach each other before deploying.

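As a condensed sketch of that verification process (the `hccn_tool` commands below follow the linked guide; adjust the device index range to your node, e.g. 0-15 on an A3 machine, and substitute a real peer NPU IP):

```shell
# Link status and network health of every NPU NIC (expect UP / Success)
for i in $(seq 0 15); do hccn_tool -i $i -link -g; done
for i in $(seq 0 15); do hccn_tool -i $i -net_health -g; done

# List the NPU NIC IPs, then ping a peer node's NPU IP to confirm reachability
for i in $(seq 0 15); do hccn_tool -i $i -ip -g | grep ipaddr; done
hccn_tool -i 0 -ping -g address <peer_npu_ip>
```
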
## Run with docker

Assume you have two Atlas 800 A3 (64G*16) nodes (or four A2 nodes with 8 NPUs each), and want to deploy the `Kimi-K2-Instruct-W8A8` quantized model across them.

```{code-block} bash
 :substitutions:
# Update the vllm-ascend image
export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version|
export NAME=vllm-ascend

# Run the container using the defined variables
# Note: if you run the container on a bridge network, expose the ports needed for multi-node communication in advance
docker run --rm \
--name $NAME \
--net=host \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci8 \
--device /dev/davinci9 \
--device /dev/davinci10 \
--device /dev/davinci11 \
--device /dev/davinci12 \
--device /dev/davinci13 \
--device /dev/davinci14 \
--device /dev/davinci15 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /mnt/sfs_turbo/.cache:/home/cache \
-it $IMAGE bash
```
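
After the container is up on each node, it can help to confirm that all 16 NPUs are visible inside it before going further (using the `npu-smi` binary mounted above):

```shell
npu-smi info
```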

Run the following scripts on the two nodes respectively.

:::{note}
Before launching the inference server, ensure the following environment variables are set for multi-node communication.
:::

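The scripts below expect `nic_name` and `local_ip` to describe the host NIC used for inter-node traffic. A minimal way to look them up (the interface name and address in the comment are placeholders, not real values):

```shell
# Show host interfaces with their IPv4 addresses and pick the one reachable
# from the other node, e.g. nic_name="enp189s0f0", local_ip="192.168.1.10"
ip -4 addr show
```
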
**node0**

```shell
#!/bin/sh

# nic_name is the host network interface name corresponding to local_ip;
# both can be obtained via ifconfig (or `ip addr`)
nic_name="xxxx"
local_ip="xxxx"

export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024

# The w8a8 weights can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8
# If you want to run the quantization manually, please refer to https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/quantization.html
vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
--host 0.0.0.0 \
--port 8004 \
--data-parallel-size 4 \
--api-server-count 2 \
--data-parallel-size-local 2 \
--data-parallel-address $local_ip \
--data-parallel-rpc-port 13389 \
--seed 1024 \
--served-model-name kimi \
--quantization ascend \
--tensor-parallel-size 8 \
--enable-expert-parallel \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 4096 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.9 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
```
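
A quick sanity check on the sizes above: `--data-parallel-size 4` × `--tensor-parallel-size 8` = 32 NPUs in total, of which this node serves `--data-parallel-size-local 2` × 8 = 16, matching the 16 devices mapped into the container. The remaining DP ranks 2 and 3 run on node1, which is why its script below uses `--data-parallel-start-rank 2` and points `--data-parallel-address` at node0.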

**node1**

```shell
#!/bin/sh

# nic_name / local_ip are node1's own interface name and address;
# node0_ip must be set to the IP address used by node0 above
nic_name="xxxx"
local_ip="xxxx"
node0_ip="xxxx"

export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024

vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
--host 0.0.0.0 \
--port 8004 \
--headless \
--data-parallel-size 4 \
--data-parallel-size-local 2 \
--data-parallel-start-rank 2 \
--data-parallel-address $node0_ip \
--data-parallel-rpc-port 13389 \
--seed 1024 \
--tensor-parallel-size 8 \
--served-model-name kimi \
--max-num-seqs 16 \
--max-model-len 32768 \
--quantization ascend \
--max-num-batched-tokens 4096 \
--enable-expert-parallel \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.92 \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
```

The deployment view looks like this:


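
If you want to first check that the API server on node0 is up and the model is registered, you can query the standard OpenAI-compatible model listing endpoint (replace `<node0_ip>` with node0's actual address):

```shell
curl http://<node0_ip>:8004/v1/models
```
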
Once your server is started, you can query the model with input prompts:

```shell
curl http://<node0_ip>:8004/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "kimi",
    "prompt": "The future of AI is",
    "max_tokens": 50,
    "temperature": 0
  }'
```
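
Because the model is served under the name `kimi`, the OpenAI-compatible chat endpoint works as well (a minimal example using the same placeholder address):

```shell
curl http://<node0_ip>:8004/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "kimi",
    "messages": [{"role": "user", "content": "Briefly introduce yourself."}],
    "max_tokens": 64,
    "temperature": 0
  }'
```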