Commit b4af757

Merge pull request #44 from MooreThreads/add_vllm_musa_demo
add run.sh for vllm_musa
2 parents 906b606 + 55196e8 commit b4af757

File tree

1 file changed: +101 -0 lines changed

vllm_musa/run.sh

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
#!/bin/bash

if [[ "${1:-}" == "--help" ]]; then
    echo "Usage: run.sh [TP_SIZE] [PP_SIZE] [MODEL_PATH] [HOSTFILE] [VLLM_PP_LAYER_PARTITION]"
    echo ""
    echo "Parameters:"
    echo "  TP_SIZE                  Tensor parallelism degree"
    echo "  PP_SIZE                  Pipeline parallelism degree"
    echo "  MODEL_PATH               Path to the model"
    echo "  HOSTFILE                 Host file listing the nodes used for distributed serving"
    echo "  VLLM_PP_LAYER_PARTITION  Optional partition scheme (comma-separated values); omit to skip"
    echo ""
    echo "Example:"
    echo "  ./run.sh 2 4 /path/to/model /path/to/hostfile 13,12,12,12,12"
    exit 0
fi

set -u   # abort early if a required positional argument is missing
TP_SIZE=$1
PP_SIZE=$2
MODEL_PATH=$3
HOSTFILE=$4
VLLM_PP_LAYER_PARTITION="${5:-}"   # optional fifth argument
set +u

MODEL_NAME=deepseek
MAX_MODEL_LEN=8192
GPU_MEMORY_UTILIZATION=0.90
WORLD_SIZE=$(($PP_SIZE * $TP_SIZE))   # total GPUs, e.g. TP=2 x PP=4 -> 8
SSH_PORT=62262
RAY_PORT=63794

# For S5000 RoCE
# MUSA_BLOCK_SCHEDULE_MODE=1
# MCCL_IB_GID_INDEX=3
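
# MUSA/MCCL runtime settings shared by every node; the paths below assume the
# Moore Threads toolkit installed under /usr/local/musa.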
env_array=(
    MCCL_PROTOS=2
    MUSA_PRINT_ENV=1
    MUSA_HOME="/usr/local/musa"
    MTHREADS_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    TRITON_CACHE_DIR="/tmp/triton"
    LIBRARY_PATH="/opt/intel/oneapi/mkl/lib/intel64:${LIBRARY_PATH:-}"
    LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/musa/lib"
    VLLM_NCCL_SO_PATH="/usr/local/musa/lib/libmccl.so.2"
)

if [[ -n "$VLLM_PP_LAYER_PARTITION" ]]; then
    env_array+=(VLLM_PP_LAYER_PARTITION="$VLLM_PP_LAYER_PARTITION")
fi

# Export each entry locally; the same array is prepended to the remote
# `ray start` commands below so every node sees identical settings.
for item in "${env_array[@]}"; do
    echo "export $item"
    eval "export $item"
done

# Clean up any previous run: stray vLLM workers, the old ray cluster, and the
# Triton kernel cache (":?" guards against an unset cache dir).
pkill -f /opt/conda/envs/py310/bin/python3
ray stop
rm -rf "${TRITON_CACHE_DIR:?}"/*

CURRENT_TIME=$(date "+%Y-%m-%d_%H:%M:%S")
echo $CURRENT_TIME
mkdir -p ./output/$CURRENT_TIME

set -u
WORK_HOME="$PWD"
EXPNAME="${MODEL_NAME}_pp${PP_SIZE}_tp${TP_SIZE}_gpus${WORLD_SIZE}"
LOG_FILE=$WORK_HOME/output/$CURRENT_TIME/$EXPNAME.log
set +u
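
# The hostfile itself is not part of this commit; it is assumed to list one
# node per line, first column being the SSH-reachable address (anything after
# it is ignored), e.g.:
#   192.168.1.10
#   192.168.1.11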
# Keep the first column of every non-comment, non-empty line.
hostlist=$(grep -v '^#\|^$' $HOSTFILE | awk '{print $1}' | xargs)

first_host=true
first_host_ip=127.0.0.1
for host in $hostlist; do
    echo "ray start $host"
    if $first_host; then
        first_host=false
        first_host_ip=$host
        # The first host becomes the ray head node.
        ssh -p $SSH_PORT $host "${env_array[@]} ray start --head --port=${RAY_PORT} --dashboard-host='0.0.0.0' --num-gpus 8"
        sleep 3s
    else
        # Every other host joins the head as a worker.
        ssh -p $SSH_PORT $host "${env_array[@]} ray start --address ${first_host_ip}:${RAY_PORT} --num-gpus 8"
    fi
done

# Confirm that every node has joined the cluster before starting the server.
ray status

# Launch the OpenAI-compatible server across the ray cluster.
vllm serve $MODEL_PATH \
    --trust-remote-code \
    --max-num-seqs 64 \
    --max-model-len $MAX_MODEL_LEN \
    --num-gpu-blocks-override $MAX_MODEL_LEN \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --served-model-name $MODEL_NAME \
    --distributed-executor-backend ray \
    --port 8000 \
    -tp $TP_SIZE \
    -pp $PP_SIZE 2>&1 | tee -a $LOG_FILE
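
Once the server is up, it can be smoke-tested from any machine that can reach the head node. A minimal sketch, not part of the committed script, assuming vLLM's default OpenAI-compatible API plus the --served-model-name and --port values set above; <head-node-ip> is a placeholder for the first host in the hostfile:

    # hypothetical smoke test; <head-node-ip> is the first host in the hostfile
    curl http://<head-node-ip>:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek", "prompt": "Hello", "max_tokens": 16}'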

0 commit comments