#!/bin/bash
#
# Launches one node of a multi-node `vllm serve` deployment.
#
# Usage: NODE=<n> ./run_vllm.sh
#   NODE=0 selects the master node; 1,2,3... select workers.
# Settings are read from config.properties located next to this script.

# NODE is mandatory: it selects the local IP and the data-parallel start rank.
if [[ -z "$NODE" ]]; then
  echo "ERROR: Please set NODE=N before running. N should be 0 for master node; 1,2,3... for workers. Note the IPs and environment variables in the script should be modified accordingly." >&2
  echo "Usage: NODE=0 ./run_vllm.sh" >&2
  exit 1
fi

# NODE is later used inside an arithmetic expansion, so only accept plain
# non-negative integers.
if [[ ! "$NODE" =~ ^[0-9]+$ ]]; then
  echo "ERROR: NODE must be a non-negative integer (got '$NODE')." >&2
  exit 1
fi

#######################################
# Load settings from a properties-style file into shell variables.
#
# Blank lines and lines starting with '#' are skipped. Lines of the form
# `export NAME=value` are evaluated as shell `export` statements. Lines of the
# form `key = value` assign the whitespace-trimmed value to a variable named
# `key`. Any other line produces a warning on stderr and is ignored.
#
# Arguments:
#   $1 - optional path to the config file; defaults to config.properties in
#        the directory containing this script.
# Outputs:
#   Warnings and errors on stderr.
# Returns:
#   Exits the script with status 1 when the config file does not exist.
#######################################
load_config() {
  local cfg_file="${1:-}"
  local cfg_line cfg_key cfg_value

  if [[ -z "$cfg_file" ]]; then
    cfg_file="$(dirname "${BASH_SOURCE[0]}")/config.properties"
  fi

  if [[ ! -f "$cfg_file" ]]; then
    echo "ERROR: Config file '$cfg_file' not found!" >&2
    exit 1
  fi

  # `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r cfg_line || [[ -n "$cfg_line" ]]; do
    # Trim leading and trailing whitespace (this also drops a trailing CR).
    cfg_line="${cfg_line#"${cfg_line%%[![:space:]]*}"}"
    cfg_line="${cfg_line%"${cfg_line##*[![:space:]]}"}"
    [[ -z "$cfg_line" || "$cfg_line" == \#* ]] && continue

    if [[ "$cfg_line" == export\ * ]]; then
      # SECURITY NOTE: export lines are passed through eval so that quoting
      # and $VAR references inside them are honoured. The config file must
      # therefore be trusted; never point this at untrusted input.
      eval "export ${cfg_line#export }"
    elif [[ "$cfg_line" == *=* ]]; then
      # Split on the first '=' so values may themselves contain '='.
      cfg_key="${cfg_line%%=*}"
      cfg_value="${cfg_line#*=}"
      cfg_key="${cfg_key%"${cfg_key##*[![:space:]]}"}"
      cfg_value="${cfg_value#"${cfg_value%%[![:space:]]*}"}"

      # Only accept valid shell identifiers; anything else could previously
      # be executed as code when the assignment was built with eval.
      if [[ "$cfg_key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
        printf -v "$cfg_key" '%s' "$cfg_value"
      else
        echo "WARNING: Invalid config key '$cfg_key' (line skipped): $cfg_line" >&2
      fi
    else
      echo "WARNING: Invalid config line (no '=' found): $cfg_line" >&2
    fi
  done < "$cfg_file"
}

#######################################
# Make sure the `ifconfig` command is available, installing the net-tools
# package through apt-get, yum or dnf when it is missing.
#
# Outputs:
#   Progress messages on stdout, errors on stderr.
# Returns:
#   0 when ifconfig is available. Exits the script with status 1 when no
#   supported package manager exists or ifconfig is still missing afterwards.
#######################################
ensure_ifconfig_installed() {
  if command -v ifconfig > /dev/null 2>&1; then
    return 0
  fi

  echo "ifconfig not found. Attempting to install net-tools..."

  # Prefix with sudo only when it is both needed and present, so the function
  # also works when already running as root in an image that ships no sudo.
  local -a as_root=()
  if (( EUID != 0 )) && command -v sudo > /dev/null 2>&1; then
    as_root=(sudo)
  fi

  if command -v apt-get > /dev/null 2>&1; then
    echo "Detected apt-get (Debian/Ubuntu). Installing net-tools..."
    "${as_root[@]}" apt-get update && "${as_root[@]}" apt-get install -y net-tools
  elif command -v yum > /dev/null 2>&1; then
    echo "Detected yum (RHEL/CentOS). Installing net-tools..."
    "${as_root[@]}" yum install -y net-tools
  elif command -v dnf > /dev/null 2>&1; then
    echo "Detected dnf (Fedora). Installing net-tools..."
    "${as_root[@]}" dnf install -y net-tools
  else
    echo "ERROR: No supported package manager (apt/yum/dnf) found." >&2
    echo "Please install 'net-tools' manually or use a system with 'ip' command." >&2
    exit 1
  fi

  # The installer's own exit status is not trusted; verify the result instead.
  if ! command -v ifconfig > /dev/null 2>&1; then
    echo "ERROR: Failed to install ifconfig. Please check permissions or network." >&2
    exit 1
  fi

  echo "✅ ifconfig is now available."
}

#######################################
# Print the name of the network interface that carries a given IPv4 address.
#
# Uses `ifconfig` when available (both the "inet 1.2.3.4" and the older
# "inet addr:1.2.3.4" layouts are understood) and otherwise falls back to
# `ip -o -4 addr show`.
#
# Arguments:
#   $1 - IPv4 address to look up.
# Outputs:
#   The interface name on stdout; nothing when no interface matches.
#######################################
get_interface_by_ip() {
  local target_ip="$1"

  # An empty address can never match; skip the lookup entirely.
  [[ -n "$target_ip" ]] || return 0

  if command -v ifconfig > /dev/null 2>&1; then
    ifconfig | awk -v target="$target_ip" '
      # Unindented lines start a new interface section ("eth0:" or "eth0").
      /^[A-Za-z0-9]/ {
        iface = $1
        sub(/:$/, "", iface)
      }
      {
        # Compare only the field right after "inet", so that netmask and
        # broadcast values on the same line cannot produce a false match.
        for (i = 1; i < NF; i++) {
          if ($i == "inet") {
            addr = $(i + 1)
            sub(/^addr:/, "", addr)
            if (addr == target) {
              print iface
              exit
            }
          }
        }
      }
    '
  elif command -v ip > /dev/null 2>&1; then
    # One line per address: "<idx>: <iface> inet <addr>/<prefix> ..."
    ip -o -4 addr show | awk -v target="$target_ip" '
      $3 == "inet" {
        addr = $4
        sub(/\/.*$/, "", addr)
        if (addr == target) {
          print $2
          exit
        }
      }
    '
  fi
}

87+ start_server () {
88+ # Ascend environment variables
89+ if [[ " $NODE " == " 0" ]]; then
90+ export TARGET_IP=" $master_ip "
91+ else
92+ export TARGET_IP=" $worker_ip "
93+ fi
94+
95+ IFACE=$( get_interface_by_ip " $TARGET_IP " )
96+
97+ if [[ -z " $IFACE " ]]; then
98+ echo " WARNING: Could not find interface with IP $TARGET_IP via ifconfig. Falling back to 'eth0'."
99+ IFACE=" eth0"
100+ else
101+ echo " ✅ Detected interface: $IFACE (bound to IP $TARGET_IP )"
102+ fi
103+
104+ export HCCL_IF_IP=" $TARGET_IP "
105+ export HCCL_SOCKET_IFNAME=" $IFACE "
106+ export GLOO_SOCKET_IFNAME=" $IFACE "
107+ export TP_SOCKET_IFNAME=" $IFACE "
108+
109+ # vLLM parameters
110+ [[ -z " $model " ]] && { echo " ERROR: model not set in config.properties" >&2 ; exit 1; }
111+
112+ if [[ " $ucm_enable " == " true" ]]; then
113+ [[ -z " $ucm_config_yaml_path " ]] && {
114+ echo " ERROR: ucm_config_yaml_path not set but ucm_enable=true" >&2
115+ exit 1
116+ }
117+ LOG_FILE=" vllm_ucm.log"
118+ else
119+ LOG_FILE=" vllm.log"
120+ fi
121+
122+ echo " "
123+ echo " ===== vllm server configuration ====="
124+ echo " node = $NODE "
125+ echo " master_ip = $master_ip "
126+ echo " local_ip = $TARGET_IP "
127+ echo " network_interface = $IFACE "
128+ echo " model = $model "
129+ echo " served_model_name = ${served_model_name:- <default>} "
130+ echo " tp_size = $tp_size "
131+ echo " dp_size = $dp_size "
132+ echo " pp_size = $pp_size "
133+ echo " dp_size_local = $dp_size_local "
134+ echo " dp_start_rank = $(( dp_size_local * NODE)) "
135+ echo " dp_address = $master_ip "
136+ echo " enable_expert_parallel = $enable_expert_parallel "
137+ echo " max_model_len = $max_model_len "
138+ echo " max_num_batched_tokens = $max_num_batch_tokens "
139+ echo " max_num_seqs = $max_num_seqs "
140+ echo " block_size = $block_size "
141+ echo " gpu_memory_utilization = $gpu_memory_utilization "
142+ echo " quantization = $quantization "
143+ echo " server_host = $server_host "
144+ echo " server_port = $server_port "
145+ echo " distributed_backend = $distributed_executor_backend "
146+ echo " enable_prefix_caching = $enable_prefix_caching "
147+ echo " async_scheduling = $async_scheduling "
148+ echo " graph_mode = $graph_mode "
149+ if [[ " $ucm_enable " == " true" ]]; then
150+ echo " ucm_config_file = $ucm_config_yaml_path "
151+ fi
152+ echo " log_file = $LOG_FILE "
153+ echo " ====================================="
154+ echo " "
155+
156+ CMD=(
157+ vllm serve " $model "
158+ --max-model-len " $max_model_len "
159+ --tensor-parallel-size " $tp_size "
160+ --data-parallel-size " $dp_size "
161+ --data-parallel-size-local " $dp_size_local "
162+ --data-parallel-start-rank " $(( dp_size_local * NODE)) "
163+ --data-parallel-address " $master_ip "
164+ --data-parallel-rpc-port " $dp_rpc_port "
165+ --seed " $seed "
166+ --pipeline-parallel-size " $pp_size "
167+ --gpu-memory-utilization " $gpu_memory_utilization "
168+ --trust-remote-code
169+ --max-num-batched-tokens " $max_num_batch_tokens "
170+ --max-num-seqs " $max_num_seqs "
171+ --block-size " $block_size "
172+ --host " $server_host "
173+ --port " $server_port "
174+ )
175+ if [[ " $NODE " != " 0" ]]; then CMD+=(" --headless" ); fi
176+
177+ if [[ " $enable_expert_parallel " == " true" ]]; then CMD+=(" --enable-expert-parallel" ); fi
178+
179+ if [[ " $enable_prefix_caching " == " false" ]]; then CMD+=(" --no-enable-prefix-caching" ); fi
180+
181+ if [[ " $async_scheduling " == " true" ]]; then CMD+=(" --async-scheduling" ); fi
182+
183+ [[ -n " $served_model_name " ]] && CMD+=(" --served-model-name" " $served_model_name " )
184+
185+ [[ " $quantization " != " NONE" ]] && CMD+=(" --quantization" " $quantization " )
186+
187+ if [[ -n " $graph_mode " ]]; then
188+ COMPILATION_CONFIG=' {"cudagraph_mode": "' " $graph_mode " ' "}'
189+ CMD+=(" --compilation-config" " $COMPILATION_CONFIG " )
190+ fi
191+
192+ if [[ -n " $method " ]]; then
193+ SPECULATIVE_CONFIG=' {"num_speculative_tokens": 1, "method":"' " $method " ' "}'
194+ CMD+=(" --compilation-config" " $SPECULATIVE_CONFIG " )
195+ fi
196+
197+ ADDITIONAL_CONFIG=' {"ascend_scheduler_config":{"enabled":' " $enable_ascend_scheduler " ' },"torchair_graph_config":{"enabled":' " $enable_torchair_graph " ' }}'
198+ CMD+=(" --additional-config" " $ADDITIONAL_CONFIG " )
199+
200+ if [[ " $ucm_enable " == " true" ]]; then
201+ KV_CONFIG_JSON=" {
202+ \" kv_connector\" :\" UCMConnector\" ,
203+ \" kv_connector_module_path\" :\" ucm.integration.vllm.ucm_connector\" ,
204+ \" kv_role\" :\" kv_both\" ,
205+ \" kv_connector_extra_config\" :{\" UCM_CONFIG_FILE\" :\" $ucm_config_yaml_path \" }
206+ }"
207+ CMD+=(" --kv-transfer-config" " $KV_CONFIG_JSON " )
208+ fi
209+
210+ echo " Executing command: ${CMD[*]} "
211+ echo " "
212+
213+ " ${CMD[@]} " 2>&1 | tee " $LOG_FILE "
214+ }
215+
# Entry point: read config.properties, then launch the server. The script's
# exit status is whatever start_server returns.
load_config
start_server