-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnsys_server.sh
More file actions
35 lines (32 loc) · 2.22 KB
/
nsys_server.sh
File metadata and controls
35 lines (32 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server under Nsight Systems (nsys)
# profiling, for DeepSeek-R1 FP4 with the FlashInfer MLA attention backend.
# Requires: nsys, python3 with vLLM installed, model at /tmp/nvidia-DeepSeek-R1-FP4-v2.
set -euo pipefail

# NCCL / vLLM collective-communication tuning.
export VLLM_USE_NCCL_SYMM_MEM=1
export NCCL_NVLS_ENABLE=1
export NCCL_CUMEM_ENABLE=1
# Superseded by the --attention-config.* CLI flags passed below.
# export VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL=1
# export VLLM_ATTENTION_BACKEND=FLASHINFER_MLA
# FlashInfer MoE backend: latency-optimized kernels, FP4 weights.
export VLLM_FLASHINFER_MOE_BACKEND=latency
export VLLM_USE_FLASHINFER_MOE_FP4=1
export VLLM_TORCH_PROFILER_DIR=logs/vllm_profile

# nsys errors out if the -o report directory does not exist; create both
# output directories up front so a fresh container/workspace works.
mkdir -p "$VLLM_TORCH_PROFILER_DIR" /workspace/logs/nsys_reports

# --capture-range=cudaProfilerApi gates tracing on cudaProfilerStart/Stop
# (emitted by vLLM when VLLM_TORCH_CUDA_PROFILE=1); --capture-range-end repeat
# re-arms capture after each stop so multiple windows land in one report.
# -f true overwrites any existing report of the same name.
VLLM_TORCH_CUDA_PROFILE=1 \
nsys profile -o /workspace/logs/nsys_reports/new-fp8-prefill-compute -f true \
  --trace-fork-before-exec=true --cuda-graph-trace=node --capture-range=cudaProfilerApi \
  --capture-range-end repeat \
  python3 -m vllm.entrypoints.openai.api_server --model /tmp/nvidia-DeepSeek-R1-FP4-v2 \
  --kv-cache-dtype fp8 --tensor-parallel-size 1 --pipeline-parallel-size 1 --data-parallel-size 4 \
  --enable-expert-parallel --swap-space 16 --max-num-seqs 1024 --trust-remote-code --max-model-len 2176 \
  --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --no-enable-prefix-caching \
  --async-scheduling --compilation_config.pass_config.fuse_attn_quant true \
  --compilation_config.pass_config.fuse_allreduce_rms true \
  --compilation_config.max_cudagraph_capture_size 2048 --attention-config.backend=FLASHINFER_MLA --compilation_config.custom_ops+=+rotary_embedding \
  --attention-config.use_prefill_query_quantization=true --attention-config.use_trtllm_ragged_prefill=true
#------------ Works ------------------------------------------
# export VLLM_USE_NCCL_SYMM_MEM=1
# export NCCL_NVLS_ENABLE=1
# export NCCL_CUMEM_ENABLE=1
# export VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL=1
# export VLLM_ATTENTION_BACKEND=FLASHINFER_MLA
# export VLLM_FLASHINFER_MOE_BACKEND=latency
# export VLLM_USE_FLASHINFER_MOE_FP4=1
# vllm serve /tmp/nvidia-DeepSeek-R1-0528-FP4/ --kv-cache-dtype fp8 --tensor-parallel-size 1 --pipeline-parallel-size 1 --data-parallel-size 4 --enable-expert-parallel \
# --swap-space 16 --max-num-seqs 1024 --trust-remote-code --max-model-len 2176 --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --no-enable-prefix-caching \
# --async-scheduling --compilation_config.pass_config.enable_fi_allreduce_fusion true --compilation_config.pass_config.enable_attn_fusion true \
# --compilation_config.max_cudagraph_capture_size 2048