echo"ERROR: Please set NODE=N before running. N should be 0 for master node; 1,2,3... for workers. Note the IPs and environment variables in the script should be modified accordingly. "
# Avoid the error "RuntimeError: CUDASymmetricMemoryAllocator" during multi-node, multi-GPU inference. See it in the issue: https://github.com/vllm-project/vllm/issues/24694
-VLLM_ALLREDUCE_USE_SYMM_MEM=0
-VLLM_LOGGING_LEVEL=INFO
-MODEL=/home/models/QwQ-32B
+export VLLM_ALLREDUCE_USE_SYMM_MEM=0
+# Run deepseek v3.1+ on CUDA
+export VLLM_USE_DEEP_GEMM=0
+export VLLM_LOGGING_LEVEL=INFO
+model=/home/models/QwQ-32B
 # If not specified, the model name will be the same as the --model argument.
-# SERVED_MODEL_NAME=qwen
-TP_SIZE=8
-DP_SIZE=1
-PP_SIZE=1
-# 0 | 1 ; Set 1 to enable expert parallel
-ENABLE_EXPERT_PARALLEL=0
-MAX_MODEL_LEN=20000
-MAX_NUM_BATCH_TOKENS=20000
-MAX_NUM_SEQS=64
-BLOCK_SIZE=128
-GPU_MEMORY_UTILIZATION=0.87
-SERVER_HOST=0.0.0.0
-SERVER_PORT=7850
-ENABLE_PREFIX_CACHING=0
-ASYNC_SCHEDULING=0
+# served_model_name=qwen
+server_host=0.0.0.0
+server_port=7850
+tp_size=4
+dp_size=1
+pp_size=1
+enable_expert_parallel=false
+enable_prefix_caching=false
+max_model_len=20000
+max_num_batch_tokens=20000
+max_num_seqs=64
+block_size=128
+gpu_memory_utilization=0.87
+async_scheduling=false
 # NONE | PIECEWISE | FULL | FULL_DECODE_ONLY | FULL_AND_PIECEWISE
-GRAPH_MODE=FULL_DECODE_ONLY
-QUANTIZATION=None
+graph_mode=FULL_DECODE_ONLY
+quantization=NONE
 # mp | ray ; Set mp to start single-node inference
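The ERROR message at the top of the script implies that NODE must be set before launch. A minimal guard in the same style, assuming the script reads NODE from the caller's environment (the guard itself is hypothetical and not part of this diff):

# Hypothetical guard; NODE itself comes from the caller's environment.
if [ -z "${NODE:-}" ]; then
    echo "ERROR: Please set NODE=N before running. N should be 0 for the master node; 1, 2, 3... for workers."
    exit 1
fi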
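For context, a sketch of how the renamed lowercase variables might be assembled into a vllm serve invocation. The flag names follow vLLM's CLI (--max-num-batched-tokens is the CLI spelling of max_num_batch_tokens), but the true/false-to-flag mapping, the quantization=NONE handling, and the graph_mode wiring are assumptions, not part of this diff:

# A minimal sketch, assuming these variables feed a standard vllm serve call.
args=(
    --host "$server_host"
    --port "$server_port"
    --tensor-parallel-size "$tp_size"
    --data-parallel-size "$dp_size"
    --pipeline-parallel-size "$pp_size"
    --max-model-len "$max_model_len"
    --max-num-batched-tokens "$max_num_batch_tokens"
    --max-num-seqs "$max_num_seqs"
    --block-size "$block_size"
    --gpu-memory-utilization "$gpu_memory_utilization"
)
# true/false values become presence/absence of boolean flags (assumed mapping).
[ "$enable_expert_parallel" = "true" ] && args+=(--enable-expert-parallel)
[ "$enable_prefix_caching" = "true" ] && args+=(--enable-prefix-caching)
[ "$async_scheduling" = "true" ] && args+=(--async-scheduling)
# quantization=NONE is treated here as "do not pass --quantization" (assumption).
[ "$quantization" != "NONE" ] && args+=(--quantization "$quantization")
# graph_mode likely maps to the cudagraph_mode field of vLLM's compilation
# config (assumption; the listed values match vLLM's CUDAGraphMode enum).
args+=(--compilation-config "{\"cudagraph_mode\": \"$graph_mode\"}")
vllm serve "$model" "${args[@]}"

Lowercasing the configuration variables also keeps them visually distinct from the exported VLLM_* environment variables, which stay uppercase.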