echo"ERROR: Please set NODE=N before running. N should be 0 for master node; 1,2,3... for workers. Note the IPs and environment variables in the script should be modified accordingly. "
# Avoid the error "RuntimeError: CUDASymmetricMemoryAllocator" during multi-node, multi-GPU inference. See it in the issue: https://github.com/vllm-project/vllm/issues/24694
-VLLM_ALLREDUCE_USE_SYMM_MEM=0
-VLLM_LOGGING_LEVEL=INFO
-MODEL=/home/models/QwQ-32B
+export VLLM_ALLREDUCE_USE_SYMM_MEM=0
+# Run deepseek v3.1+ on CUDA
+export VLLM_USE_DEEP_GEMM=0
+export VLLM_LOGGING_LEVEL=INFO
+model=/home/models/QwQ-32B
 # If not specified, the model name will be the same as the --model argument.
-# SERVED_MODEL_NAME=qwen
-TP_SIZE=8
-DP_SIZE=1
-PP_SIZE=1
-# 0 | 1 ; Set 1 to enable expert parallel
-ENABLE_EXPERT_PARALLEL=0
-MAX_MODEL_LEN=20000
-MAX_NUM_BATCH_TOKENS=20000
-MAX_NUM_SEQS=64
-BLOCK_SIZE=128
-GPU_MEMORY_UTILIZATION=0.87
-SERVER_HOST=0.0.0.0
-SERVER_PORT=7850
-ENABLE_PREFIX_CACHING=0
-ASYNC_SCHEDULING=0
+# served_model_name=qwen
+server_host=0.0.0.0
+server_port=7850
+tp_size=4
+dp_size=1
+pp_size=1
+enable_expert_parallel=false
+enable_prefix_caching=false
+max_model_len=20000
+max_num_batch_tokens=20000
+max_num_seqs=64
+block_size=128
+gpu_memory_utilization=0.87
+async_scheduling=false
 # NONE | PIECEWISE | FULL | FULL_DECODE_ONLY | FULL_AND_PIECEWISE
-GRAPH_MODE=FULL_DECODE_ONLY
-QUANTIZATION=None
+graph_mode=FULL_DECODE_ONLY
+quantization=NONE
 # mp | ray ; Set mp to start single-node inference
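The ERROR message at the top of the script implies that NODE must be set before launch. A minimal guard in the same style, assuming the script reads NODE from the caller's environment (the guard itself is hypothetical and not part of this diff):

# Hypothetical guard; NODE itself comes from the caller's environment.
if [ -z "${NODE:-}" ]; then
    echo "ERROR: Please set NODE=N before running. N should be 0 for the master node; 1, 2, 3... for workers."
    exit 1
fi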
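For context, a sketch of how the renamed lowercase variables might be assembled into a vllm serve invocation. The flag names follow vLLM's CLI (--max-num-batched-tokens is the CLI spelling of max_num_batch_tokens), but the true/false-to-flag mapping, the quantization=NONE handling, and the graph_mode wiring are assumptions, not part of this diff:

# A minimal sketch, assuming these variables feed a standard vllm serve call.
args=(
    --host "$server_host"
    --port "$server_port"
    --tensor-parallel-size "$tp_size"
    --data-parallel-size "$dp_size"
    --pipeline-parallel-size "$pp_size"
    --max-model-len "$max_model_len"
    --max-num-batched-tokens "$max_num_batch_tokens"
    --max-num-seqs "$max_num_seqs"
    --block-size "$block_size"
    --gpu-memory-utilization "$gpu_memory_utilization"
)
# true/false values become presence/absence of boolean flags (assumed mapping).
[ "$enable_expert_parallel" = "true" ] && args+=(--enable-expert-parallel)
[ "$enable_prefix_caching" = "true" ] && args+=(--enable-prefix-caching)
[ "$async_scheduling" = "true" ] && args+=(--async-scheduling)
# quantization=NONE is treated here as "do not pass --quantization" (assumption).
[ "$quantization" != "NONE" ] && args+=(--quantization "$quantization")
# graph_mode likely maps to the cudagraph_mode field of vLLM's compilation
# config (assumption; the listed values match vLLM's CUDAGraphMode enum).
args+=(--compilation-config "{\"cudagraph_mode\": \"$graph_mode\"}")
vllm serve "$model" "${args[@]}"

Lowercasing the configuration variables also keeps them visually distinct from the exported VLLM_* environment variables, which stay uppercase.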