forked from vllm-project/vllm-ascend
-
Notifications
You must be signed in to change notification settings - Fork 9
Open
Description
🚀 The feature, motivation and pitch
基线
A3 单机 decode-only DP16 EP16
case1——max-num-seqs 16
run service
base_decode_only.sh
#!/bin/bash
# base_decode_only.sh - prepare the host and launch a decode-only vLLM service.
#
# Usage: base_decode_only.sh <visible_devices> <port> <dp_size> <tp_size>
#   $1  exported as ASCEND_RT_VISIBLE_DEVICES
#   $2  passed to `vllm serve --port`
#   $3  passed to `vllm serve --data-parallel-size`
#   $4  passed to `vllm serve --tensor-parallel-size`
#
# Multi-node reference commands (documentation only, never executed):
# # Node 0 (with ip address 141.61.73.132)
# vllm serve $MODEL --data-parallel-size 16 --data-parallel-size-local 8 \
# --data-parallel-address 141.61.73.132 --data-parallel-rpc-port 13345
# # Node 1
# vllm serve $MODEL --headless --data-parallel-size 16 --data-parallel-size-local 8 \
# --data-parallel-start-rank 8 \
# --data-parallel-address 141.61.73.132 --data-parallel-rpc-port 13345

# Validate the invocation BEFORE the destructive cleanup below, so that a
# mistyped command line does not kill running processes for nothing.
if [[ $# -lt 4 ]]; then
  printf 'Usage: %s <visible_devices> <port> <dp_size> <tp_size>\n' "${0##*/}" >&2
  exit 2
fi

# Proxies are removed from the environment for everything started below.
unset http_proxy
unset https_proxy
clear
ulimit -u unlimited

# Forcefully stop leftovers from a previous run.
# WARNING: the last pattern matches every process whose name contains
# "python" that this user may signal, not only vLLM workers.
pkill -9 vllm
pkill -9 VLLM
pkill -9 python
# (Required setting) model weight path.
# MODEL_PATH="/home/c00945949/weight/DeepSeek-V3.1_w8a8mix_mtp/"
MODEL_PATH="/home/lxf/DSV2LiteWeight"

# Network interface name and local IP fed into the socket settings below.
IF_NAME="enp8s0f4u1"
LOCAL_IP="141.61.73.132"
export HCCL_IF_IP="${LOCAL_IP}"
export HCCL_SOCKET_IFNAME="${IF_NAME}"
export GLOO_SOCKET_IFNAME="${IF_NAME}"
export TP_SOCKET_IFNAME="${IF_NAME}"

# Runtime tuning knobs; values are kept exactly as the author set them.
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=600
export TASK_QUEUE_ENABLE=1
export HCCL_OP_EXPANSION_MODE="AIV"
# Exported once; the original script repeated this export a few lines later.
export VLLM_USE_V1=1
# First positional argument of the script: the device list to expose.
export ASCEND_RT_VISIBLE_DEVICES="$1"
export HCCL_EXEC_TIMEOUT=10000
export ASCEND_LAUNCH_BLOCKING=0
export TORCHDYNAMO_VERBOSE=1
export VLLM_VERSION="v0.11.0"
export VLLM_ENGINE_ITERATION_TIMEOUT_S=600

# Load the CAM vendor environment. A missing file is reported explicitly
# instead of relying on the shell's generic error; the script continues
# either way, as the original did.
CAM_SET_ENV=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/CAM/bin/set_env.bash
if [[ -r "${CAM_SET_ENV}" ]]; then
  # shellcheck source=/dev/null
  source "${CAM_SET_ENV}"
else
  printf 'warning: cannot read %s; CAM environment not loaded\n' "${CAM_SET_ENV}" >&2
fi
# ---- Logging ----
# (Required setting) base log directory; each launch gets its own
# time-stamped sub-directory.
timestamp="$(date +'%Y-%m-%d-%H-%M-%S')"
ALL_LOGS="/home/y00889327/workspace-afd/vllm-logs/${timestamp}"
# mkdir -p "${ALL_LOGS}"

# CANN logs go to a per-host directory under the run's log root.
export ASCEND_PROCESS_LOG_PATH="${ALL_LOGS}/CANN/${HCCL_IF_IP}"
mkdir -p "${ALL_LOGS}/CANN/${HCCL_IF_IP}"

# Whether logs are printed to the screen. When enabled, logs are not saved
# to log files but printed directly instead.
export ASCEND_SLOG_PRINT_TO_STDOUT=0

# Log level. The author's note lists 1 = INFO and 2 = WARNING.
# NOTE(review): the value set here is 3, which that note does not cover
# (presumably ERROR per the CANN documentation) - confirm it is intended.
export ASCEND_GLOBAL_LOG_LEVEL=3

# Whether Event logging is enabled for application logs.
export ASCEND_GLOBAL_EVENT_ENABLE=1

# Delay before device-side application logs are sent back to the host.
# export ASCEND_LOG_DEVICE_FLUSH_TIMEOUT=2000

# How log congestion is handled.
#   0: default; under congestion or slow IO the system may drop logs so
#      that service performance does not degrade.
#   1: never drop logs under congestion or slow IO; the author's note
#      recommends 1 when troubleshooting.
export ASCEND_LOG_SYNC_SAVE=0

# Application (vLLM) logging.
export VLLM_LOGGING_LEVEL=WARNING
APP_LOG_PATH="${ALL_LOGS}/132.log"
# Launch the vLLM server; its stdout and stderr are merged and written both
# to the terminal and to ${APP_LOG_PATH}.
# Positional arguments used here:
#   $2 -> --port, $3 -> --data-parallel-size, $4 -> --tensor-parallel-size
#
# Options the author keeps around but has disabled:
# --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
# --additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"finegrained_tp_config": {"lmhead_tensor_parallel_size":16}}' \
# MooncakeLayerwiseConnector
#
# NOTE(review): the "decode" dp_size inside kv_connector_extra_config is a
# fixed 32 and does not follow $3 - confirm it matches the deployment.
#
# Without pipefail the pipeline's exit status would be tee's, so a failed
# `vllm serve` would look like success to the caller.
set -o pipefail
# Every continuation line keeps a space before the trailing backslash;
# otherwise the closing quote of the JSON argument is glued onto the next
# option when the following line starts at column 0.
# The JSON text itself is left byte-for-byte as the author wrote it.
vllm serve "$MODEL_PATH" \
  --host 0.0.0.0 \
  --port "$2" \
  --data-parallel-size "$3" \
  --tensor-parallel-size "$4" \
  --enable-expert-parallel \
  --seed 1024 \
  --max-model-len 16384 \
  --max-num-batched-tokens 4096 \
  --max-num-seqs 160 \
  --trust-remote-code \
  --gpu-memory-utilization 0.90 \
  --no-enable-prefix-caching \
  --kv-transfer-config \
  '{"kv_connector": "SharedStorageConnector",
"kv_role": "kv_consumer",
"kv_port": "30200",
"engine_id": "2",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 32,
"tp_size": 1
}
}
}' \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY","cudagraph_capture_sizes":[160]}' \
  2>&1 | tee "$APP_LOG_PATH"
拉起vllm服务
# Arguments: <ASCEND_RT_VISIBLE_DEVICES list> <port> <data-parallel-size> <tensor-parallel-size>
bash /home/y00889327/workspace-afd/test-vllm-ascend/base_decode_only.sh 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 8006 16 1
拉测评命令
# Run ais_bench in perf mode with the vllm_api_stream_chat model config and the
# gsm8k_gen_0_shot_cot_str_perf dataset, using the stable_stage summarizer.
ais_bench --models vllm_api_stream_chat --datasets gsm8k_gen_0_shot_cot_str_perf --mode perf --summarizer stable_stage --debug
测评脚本
# ais_bench model configuration for the `vllm-api-stream-chat` entry.
# VLLMCustomAPIChatStream was referenced without being imported, which makes
# the config fail with NameError when loaded on its own.
from ais_bench.benchmark.models import VLLMCustomAPIChatStream
from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content

models = [
    dict(
        attr="service",
        type=VLLMCustomAPIChatStream,
        abbr='vllm-api-stream-chat',
        # Same directory the launch script serves as MODEL_PATH.
        path="/home/lxf/DSV2LiteWeight",
        model="/home/lxf/DSV2LiteWeight",
        request_rate=0,
        retry=2,
        # Address and port of the service started by base_decode_only.sh.
        host_ip="141.61.73.132",
        host_port=8006,
        max_out_len=2048,
        batch_size=256,
        trust_remote_code=True,
        generation_kwargs=dict(
            temperature=0,
            ignore_eos=True,
        ),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    )
]
Alternatives
No response
Additional context
No response
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels
Projects
Status
Todo