#!/bin/bash

if [[ $1 == "--help" ]]; then
    echo "Usage: run.sh TP_SIZE PP_SIZE MODEL_PATH HOSTFILE [VLLM_PP_LAYER_PARTITION]"
    echo ""
    echo "Parameters:"
    echo "  TP_SIZE                  Tensor parallel size"
    echo "  PP_SIZE                  Pipeline parallel size"
    echo "  MODEL_PATH               Path to the model"
    echo "  HOSTFILE                 Hostfile listing the nodes of the Ray cluster (used for distributed serving)"
    echo "  VLLM_PP_LAYER_PARTITION  Optional per-stage layer partition (comma-separated layer counts); omit to use the default"
    echo ""
    echo "Example:"
    echo "  ./run.sh 2 4 /path/to/model /path/to/hostfile 13,12,12,12,12"
    exit 0
fi

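# Required arguments; 'set -u' makes the script abort with an error if any is missing.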
set -u
TP_SIZE=$1
PP_SIZE=$2
MODEL_PATH=$3
HOSTFILE=$4
VLLM_PP_LAYER_PARTITION="${5:-}"
set +u

MODEL_NAME=deepseek
MAX_MODEL_LEN=8192
GPU_MEMORY_UTILIZATION=0.90
WORLD_SIZE=$((PP_SIZE * TP_SIZE))
SSH_PORT=62262
RAY_PORT=63794

# For S5000 RoCE, additionally set:
# MUSA_BLOCK_SCHEDULE_MODE=1
# MCCL_IB_GID_INDEX=3

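# KEY=VALUE pairs, exported locally below and reused verbatim as an env
# prefix for the remote 'ray start' commands.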
env_array=(
    MCCL_PROTOS=2
    MUSA_PRINT_ENV=1
    MUSA_HOME="/usr/local/musa"
    MTHREADS_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    TRITON_CACHE_DIR="/tmp/triton"
    LIBRARY_PATH="/opt/intel/oneapi/mkl/lib/intel64:${LIBRARY_PATH:-}"
    LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/musa/lib"
    VLLM_NCCL_SO_PATH="/usr/local/musa/lib/libmccl.so.2"
)

if [[ -n "$VLLM_PP_LAYER_PARTITION" ]]; then
    env_array+=(VLLM_PP_LAYER_PARTITION="$VLLM_PP_LAYER_PARTITION")
fi

for item in "${env_array[@]}"; do
    echo "export $item"
    eval "export $item"
done

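# Stop leftover vLLM/Ray processes from a previous run and clear the Triton kernel cache.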
pkill -f /opt/conda/envs/py310/bin/python3
ray stop
rm -rf "${TRITON_CACHE_DIR:?}"/*
| 61 | + |
| 62 | +CURRENT_TIME=$(date "+%Y-%m-%d_%H:%M:%S") |
| 63 | +echo $CURRENT_TIME |
| 64 | +mkdir -p ./output/$CURRENT_TIME |
| 65 | + |
set -u
WORK_HOME="$PWD"
EXPNAME="${MODEL_NAME}_pp${PP_SIZE}_tp${TP_SIZE}_gpus${WORLD_SIZE}"
LOG_FILE="$WORK_HOME/output/$CURRENT_TIME/$EXPNAME.log"
set +u

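# Take the first whitespace-separated field of each non-comment, non-empty
# hostfile line as a host address.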
hostlist=$(grep -v '^#\|^$' "$HOSTFILE" | awk '{print $1}' | xargs)

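# Start a Ray head node on the first host; all remaining hosts join it as workers.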
first_host=true
first_host_ip=127.0.0.1
for host in $hostlist; do
    echo "ray start $host"
    if $first_host; then
        first_host=false
        first_host_ip=$host
        ssh -p $SSH_PORT $host "${env_array[*]} ray start --head --port=${RAY_PORT} --dashboard-host='0.0.0.0' --num-gpus 8"
        sleep 3s
    else
        ssh -p $SSH_PORT $host "${env_array[*]} ray start --address ${first_host_ip}:${RAY_PORT} --num-gpus 8"
    fi
done

ray status

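# Launch the vLLM server across the Ray cluster; all output is also appended to the log file.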
vllm serve "$MODEL_PATH" \
    --trust-remote-code \
    --max-num-seqs 64 \
    --max-model-len $MAX_MODEL_LEN \
    --num-gpu-blocks-override $MAX_MODEL_LEN \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --served-model-name $MODEL_NAME \
    --distributed-executor-backend ray \
    --port 8000 \
    -tp $TP_SIZE \
    -pp $PP_SIZE 2>&1 | tee -a "$LOG_FILE"
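
# A quick smoke test once the server is up, assuming the head host is
# reachable on port 8000 (replace <head-host> with the first hostfile entry);
# the endpoint is vLLM's OpenAI-compatible API:
#
#   curl http://<head-host>:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "deepseek", "prompt": "Hello", "max_tokens": 16}'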