Skip to content

[Feature]: Aisbench #111

@chopper0126

Description

@chopper0126

🚀 The feature, motivation and pitch

基线

A3 单机 decode-only DP16 EP16

case1——max-num-seqs 16

run service

base_decode_only.sh

#!/bin/bash
# base_decode_only.sh — launch a decode-only vLLM service on Ascend NPUs.
# Usage: bash base_decode_only.sh <npu-ids> <port> <dp-size> <tp-size>
#
# Multi-node reference commands (data parallel across two hosts):
# # Node 0  (with ip address 141.61.73.132)
# vllm serve $MODEL --data-parallel-size 16 --data-parallel-size-local 8 \
#                   --data-parallel-address 141.61.73.132 --data-parallel-rpc-port 13345
# # Node 1
# vllm serve $MODEL --headless --data-parallel-size 16 --data-parallel-size-local 8 \
#                   --data-parallel-start-rank 8 \
#                   --data-parallel-address 141.61.73.132 --data-parallel-rpc-port 13345

# Fail fast on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Make sure no HTTP proxy intercepts local RPC traffic.
unset http_proxy https_proxy

clear || true            # best-effort; fails without a TTY
ulimit -u unlimited

# Stop any previously running serving processes.
# '|| true' because pkill returns non-zero when nothing matches, which
# would otherwise abort the script under 'set -e'.
# NOTE(review): 'pkill -9 python' kills EVERY python process on the host —
# confirm this is intended before running on a shared machine.
pkill -9 vllm   || true
pkill -9 VLLM   || true
pkill -9 python || true

# (required) model weight path
# MODEL_PATH="/home/c00945949/weight/DeepSeek-V3.1_w8a8mix_mtp/"
MODEL_PATH="/home/lxf/DSV2LiteWeight"

# Network interface and host IP used for HCCL / GLOO / TP communication.
IF_NAME="enp8s0f4u1"
LOCAL_IP="141.61.73.132"

export HCCL_IF_IP="${LOCAL_IP}"
export HCCL_SOCKET_IFNAME="${IF_NAME}"
export GLOO_SOCKET_IFNAME="${IF_NAME}"
export TP_SOCKET_IFNAME="${IF_NAME}"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_BUFFSIZE=600
export TASK_QUEUE_ENABLE=1
export HCCL_OP_EXPANSION_MODE="AIV"
# NPU ids visible to this launch, e.g. "0,1,...,15" (first positional arg).
export ASCEND_RT_VISIBLE_DEVICES="${1:-}"
export HCCL_EXEC_TIMEOUT=10000
export ASCEND_LAUNCH_BLOCKING=0
export TORCHDYNAMO_VERBOSE=1

# vLLM engine settings (VLLM_USE_V1 was exported twice before; once is enough).
export VLLM_USE_V1=1
export VLLM_VERSION="v0.11.0"
export VLLM_ENGINE_ITERATION_TIMEOUT_S=600

# Load CANN custom-op environment if present; warn instead of silently failing.
ASCEND_ENV="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/CAM/bin/set_env.bash"
if [ -f "${ASCEND_ENV}" ]; then
    # shellcheck disable=SC1090
    source "${ASCEND_ENV}"
else
    echo "WARN: ${ASCEND_ENV} not found; CANN custom-op env not loaded" >&2
fi

# ---------- Logging ----------
# (required) base log directory; one subdirectory per launch timestamp.
timestamp=$(date +"%Y-%m-%d-%H-%M-%S")
ALL_LOGS="/home/y00889327/workspace-afd/vllm-logs/${timestamp}"

# CANN (driver/runtime) logs, grouped by this node's HCCL IP.
mkdir -p "${ALL_LOGS}/CANN/${HCCL_IF_IP}"
export ASCEND_PROCESS_LOG_PATH="${ALL_LOGS}/CANN/${HCCL_IF_IP}"
# 0: keep CANN logs in files; 1: print them to stdout instead of saving.
export ASCEND_SLOG_PRINT_TO_STDOUT=0
# Log level: 0=DEBUG 1=INFO 2=WARNING 3=ERROR (3 keeps only errors).
export ASCEND_GLOBAL_LOG_LEVEL=3
# Enable Event logs for application processes.
export ASCEND_GLOBAL_EVENT_ENABLE=1
# Delay (ms) for flushing device-side logs back to the host.
# export ASCEND_LOG_DEVICE_FLUSH_TIMEOUT=2000
# Congestion policy: 0 = may drop logs under I/O pressure to protect
# throughput; 1 = never drop (recommended when debugging).
export ASCEND_LOG_SYNC_SAVE=0

# Application (vLLM) log.
export VLLM_LOGGING_LEVEL=WARNING
APP_LOG_PATH="${ALL_LOGS}/132.log"
# Optional flags kept for reference:
#    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
#    --additional-config '{"recompute_scheduler_enable":true,"multistream_overlap_shared_expert": true,"finegrained_tp_config": {"lmhead_tensor_parallel_size":16}}' \
# MooncakeLayerwiseConnector

# Validate required positional arguments before launching.
if [ "$#" -lt 4 ]; then
    echo "usage: $0 <npu-ids> <port> <data-parallel-size> <tensor-parallel-size>" >&2
    exit 1
fi

# Launch the decode-only service; all output is mirrored to the app log.
vllm serve "$MODEL_PATH" \
    --host 0.0.0.0 \
    --port "$2" \
    --data-parallel-size "$3" \
    --tensor-parallel-size "$4" \
    --enable-expert-parallel \
    --seed 1024 \
    --max-model-len 16384 \
    --max-num-batched-tokens 4096 \
    --max-num-seqs 160 \
    --trust-remote-code \
    --gpu-memory-utilization 0.90  \
    --no-enable-prefix-caching \
    --kv-transfer-config \
    '{"kv_connector": "SharedStorageConnector",
        "kv_role": "kv_consumer",
        "kv_port": "30200",
        "engine_id": "2",
        "kv_connector_extra_config": {
                    "prefill": {
                            "dp_size": 2,
                            "tp_size": 8
                    },
                    "decode": {
                            "dp_size": 32,
                            "tp_size": 1
                    }
            }
    }' \
    --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY","cudagraph_capture_sizes":[160]}'  \
    2>&1 | tee "$APP_LOG_PATH"

拉起vllm服务

bash /home/y00889327/workspace-afd/test-vllm-ascend/base_decode_only.sh 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 8006 16 1

拉测评命令

ais_bench --models vllm_api_stream_chat  --datasets gsm8k_gen_0_shot_cot_str_perf  --mode perf --summarizer stable_stage --debug

测评脚本

from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content

# ais_bench model configuration: benchmarks a running vLLM service through its
# streaming chat API (the service launched by base_decode_only.sh above).
# NOTE(review): VLLMCustomAPIChatStream is referenced but never imported in
# this snippet — presumably injected by ais_bench's config loader; confirm.
models = [
    dict(
        attr="service",
        type=VLLMCustomAPIChatStream,  # streaming chat API client class
        abbr='vllm-api-stream-chat',
        path="/home/lxf/DSV2LiteWeight",   # same weight path the service was launched with
        model="/home/lxf/DSV2LiteWeight",  # model name sent in API requests
        request_rate = 0,   # presumably 0 = unthrottled request submission — confirm
        retry = 2,          # retries per failed request
        host_ip = "141.61.73.132",  # serving node address (matches LOCAL_IP in launch script)
        host_port = 8006,           # matches the port passed to base_decode_only.sh
        max_out_len = 2048,
        batch_size=256,     # concurrent in-flight requests
        trust_remote_code=True,
        generation_kwargs = dict(
            temperature = 0,    # greedy decoding for reproducible perf numbers
            ignore_eos = True,  # presumably forces generation to max_out_len — confirm
            ),
        # Strip reasoning content from predictions before scoring.
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    )
]

Alternatives

No response

Additional context

No response

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    Status

    Todo

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions