# --- vLLM multi-node serving configuration -----------------------------------
# Interior fragment of a launch script: environment knobs consumed by the
# vLLM server invocation further down in the file.

# Avoid the error "RuntimeError: CUDASymmetricMemoryAllocator" during
# multi-node, multi-GPU inference.
# See: https://github.com/vllm-project/vllm/issues/24694
VLLM_ALLREDUCE_USE_SYMM_MEM=0
VLLM_LOGGING_LEVEL=INFO

# Path to the model weights to serve.
MODEL=/home/models/QwQ-32B
# If not specified, the model name will be the same as the --model argument.
# SERVED_MODEL_NAME=qwen

# Parallelism layout: tensor- / data- / pipeline-parallel degrees.
TP_SIZE=8
DP_SIZE=1
PP_SIZE=1
# 0 | 1 ; Set 1 to enable expert parallel
ENABLE_EXPERT_PARALLEL=0

# Context-length and batching limits.
MAX_MODEL_LEN=20000
MAX_NUM_BATCH_TOKENS=20000
MAX_NUM_SEQS=64
BLOCK_SIZE=128
GPU_MEMORY_UTILIZATION=0.87

# HTTP endpoint the server binds to.
SERVER_HOST=0.0.0.0
SERVER_PORT=7850

# Feature toggles (0 = off, 1 = on).
ENABLE_PREFIX_CACHING=0
ASYNC_SCHEDULING=0

# CUDA-graph capture mode:
# NONE | PIECEWISE | FULL | FULL_DECODE_ONLY | FULL_AND_PIECEWISE
GRAPH_MODE=FULL_DECODE_ONLY
QUANTIZATION=None

# mp | ray ; Set mp to start single-node inference
# NOTE(review): NODE must be supplied by the caller; the guard below appears
# truncated in this chunk — presumably it exits when NODE is unset. Confirm
# against the full file.
# Fix: original had no space after `echo` (would be parsed as an unknown
# command); diagnostics belong on stderr.
echo "ERROR: Please set NODE=N before running. N should be 0 for head node; 1,2,3... for workers. Note the IPs and environment variables in the script should be modified accordingly. " >&2