-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnsys_server.sh
More file actions
35 lines (32 loc) · 2.22 KB
/
nsys_server.sh
File metadata and controls
35 lines (32 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server under Nsight Systems (nsys)
# profiling, for DeepSeek-R1 FP4 with the FlashInfer MLA attention backend.
# Requires: nsys, python3 with vLLM installed, model at /tmp/nvidia-DeepSeek-R1-FP4-v2.
set -euo pipefail

# NCCL / vLLM collective-communication tuning.
export VLLM_USE_NCCL_SYMM_MEM=1
export NCCL_NVLS_ENABLE=1
export NCCL_CUMEM_ENABLE=1
# Superseded by the --attention-config.* CLI flags passed below.
# export VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL=1
# export VLLM_ATTENTION_BACKEND=FLASHINFER_MLA
# FlashInfer MoE backend: latency-optimized kernels, FP4 weights.
export VLLM_FLASHINFER_MOE_BACKEND=latency
export VLLM_USE_FLASHINFER_MOE_FP4=1
export VLLM_TORCH_PROFILER_DIR=logs/vllm_profile

# nsys errors out if the -o report directory does not exist; create both
# output directories up front so a fresh container/workspace works.
mkdir -p "$VLLM_TORCH_PROFILER_DIR" /workspace/logs/nsys_reports

# --capture-range=cudaProfilerApi gates tracing on cudaProfilerStart/Stop
# (emitted by vLLM when VLLM_TORCH_CUDA_PROFILE=1); --capture-range-end repeat
# re-arms capture after each stop so multiple windows land in one report.
# -f true overwrites any existing report of the same name.
VLLM_TORCH_CUDA_PROFILE=1 \
nsys profile -o /workspace/logs/nsys_reports/new-fp8-prefill-compute -f true \
  --trace-fork-before-exec=true --cuda-graph-trace=node --capture-range=cudaProfilerApi \
  --capture-range-end repeat \
  python3 -m vllm.entrypoints.openai.api_server --model /tmp/nvidia-DeepSeek-R1-FP4-v2 \
  --kv-cache-dtype fp8 --tensor-parallel-size 1 --pipeline-parallel-size 1 --data-parallel-size 4 \
  --enable-expert-parallel --swap-space 16 --max-num-seqs 1024 --trust-remote-code --max-model-len 2176 \
  --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --no-enable-prefix-caching \
  --async-scheduling --compilation_config.pass_config.fuse_attn_quant true \
  --compilation_config.pass_config.fuse_allreduce_rms true \
  --compilation_config.max_cudagraph_capture_size 2048 --attention-config.backend=FLASHINFER_MLA --compilation_config.custom_ops+=+rotary_embedding \
  --attention-config.use_prefill_query_quantization=true --attention-config.use_trtllm_ragged_prefill=true
#------------ Works ------------------------------------------
# export VLLM_USE_NCCL_SYMM_MEM=1
# export NCCL_NVLS_ENABLE=1
# export NCCL_CUMEM_ENABLE=1
# export VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL=1
# export VLLM_ATTENTION_BACKEND=FLASHINFER_MLA
# export VLLM_FLASHINFER_MOE_BACKEND=latency
# export VLLM_USE_FLASHINFER_MOE_FP4=1
# vllm serve /tmp/nvidia-DeepSeek-R1-0528-FP4/ --kv-cache-dtype fp8 --tensor-parallel-size 1 --pipeline-parallel-size 1 --data-parallel-size 4 --enable-expert-parallel \
# --swap-space 16 --max-num-seqs 1024 --trust-remote-code --max-model-len 2176 --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --no-enable-prefix-caching \
# --async-scheduling --compilation_config.pass_config.enable_fi_allreduce_fusion true --compilation_config.pass_config.enable_attn_fusion true \
# --compilation_config.max_cudagraph_capture_size 2048