        --proc-per-node=2
MOE models:
    python examples/offline_external_launcher.py \
-        --model="Qwen/Qwen3-0.6B" \
+        --model="Qwen/Qwen3-30B-A3B" \
        --tp-size=2 \
        --proc-per-node=2 \
        --enable-expert-parallel
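The --enable-expert-parallel flag corresponds to the engine's enable_expert_parallel option, which shards MoE expert layers across the parallel ranks instead of replicating them. A minimal sketch of the equivalent direct engine construction (model and sizes are illustrative, and this assumes the already-initialized process group that the script's external-launcher setup provides):

    from vllm import LLM

    # Sketch only: the engine options the CLI flags above select. With the
    # external_launcher backend this must run inside a torch.distributed
    # process group, as the script arranges for each spawned worker.
    llm = LLM(
        model="Qwen/Qwen3-30B-A3B",
        tensor_parallel_size=2,
        enable_expert_parallel=True,
        distributed_executor_backend="external_launcher",
    )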

Multi-node:
    Node 0 (assume the node has ip of 10.99.48.128):
        python examples/offline_external_launcher.py \
-            --model="Qwen/Qwen3-0.6B" \
+            --model="Qwen/Qwen3-30B-A3B" \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=0 \
            --master-port=13345
    Node 1:
        python examples/offline_external_launcher.py \
-            --model="Qwen/Qwen3-0.6B" \
+            --model="Qwen/Qwen3-30B-A3B" \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=1 \
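Across nodes, each worker derives its global rank from --node-rank and the per-node process count. A small sketch of that arithmetic (the helper name is hypothetical, not part of the script):

    def global_rank(node_rank: int, proc_per_node: int, local_rank: int) -> int:
        # node 0 owns ranks [0, proc_per_node); node 1 takes the next block
        return node_rank * proc_per_node + local_rank

    # with --node-size=2 and two processes per node, node 1 hosts ranks 2 and 3
    assert [global_rank(1, 2, lr) for lr in range(2)] == [2, 3]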
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (  # noqa E402
    destroy_distributed_environment, destroy_model_parallel, get_tp_group)
-from vllm.utils import get_open_port
+from vllm.utils import get_open_port, GiB_bytes

os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -114,20 +114,44 @@ def parse_args():
    parser.add_argument("--enable-expert-parallel",
                        action="store_true",
                        help="Enable expert parallel, used in MOE models.")
-    return parser.parse_args()
+    parser.add_argument("--enable-sleep-mode",
+                        action="store_true",
+                        help="Enable sleep mode for the engine.")
+    parser.add_argument("--temperature",
+                        type=float,
+                        default=0.8,
+                        help="Float that controls the randomness of the sampling.")
+    parser.add_argument("--model-weight-gib",
+                        type=float,
+                        default=None,
+                        help="Model weight memory usage in GiB (e.g., 1.0 for 0.5B model).")
+
+    args = parser.parse_args()
+    if args.enable_sleep_mode:
+        if args.model_weight_gib is None or args.temperature != 0:
+            parser.error("model-weight-gib must be provided, and temperature must be zero when enable-sleep-mode is set.")
+        if args.model_weight_gib <= 0:
+            parser.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.")
+        if args.model == parser.get_default("model") and args.model_weight_gib is None:
+            parser.error("model-weight-gib must be provided for default model when enable-sleep-mode is set.")
+
+    return args


def main(
    local_rank: int,
    rank: int,
    master_addr: str,
    master_port: int,
+    model_weight_gib: float,
    model: str = "Qwen/Qwen3-0.6B",
    world_size: int = 4,
    tensor_parallel_size: int = 2,
    enable_expert_parallel: bool = False,
    enforce_eager: bool = False,
    trust_remote_code: bool = True,
+    enable_sleep_mode: bool = False,
+    temperature: float = 0.8,
):
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
@@ -147,7 +171,7 @@ def main(
        "The future of AI is",
    ] * 10
    sampling_params = SamplingParams(
-        temperature=0.8,
+        temperature=temperature,
        top_p=0.95,
        max_tokens=10,
    )
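Threading --temperature through matters because the sleep-mode path compares generations before and after wake-up: temperature=0 selects greedy decoding, which is deterministic, so identical prompts must yield identical text. A sketch of the greedy parameters the comparison relies on:

    from vllm import SamplingParams

    # Greedy decoding: two generate() calls over the same prompts return the
    # same text, which makes the post-wake-up equality assertion meaningful.
    greedy = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)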
@@ -159,10 +183,31 @@ def main(
        trust_remote_code=trust_remote_code,
        distributed_executor_backend="external_launcher",
        seed=0,
+        enable_sleep_mode=enable_sleep_mode,
    )
    tp_ranks = get_tp_group().ranks
    print(f'TP RANKS: {tp_ranks}')
+
    outputs = llm.generate(prompts, sampling_params)
+
+    if enable_sleep_mode:
+        if rank == 0:
+            free_bytes_before_sleep, total = torch.npu.mem_get_info()
+        llm.sleep(level=1)
+        if rank == 0:
+            free_bytes_after_sleep, total = torch.npu.mem_get_info()
+            freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
+            print(f"Freed memory: {freed_bytes / 1024 ** 3:.2f} GiB")
+            # the freed memory should cover this rank's share of the model weights
+            assert freed_bytes >= model_weight_gib / tensor_parallel_size * GiB_bytes
+
+        llm.wake_up()
+        outputs_after_wakeup = llm.generate(prompts, sampling_params)
+        if rank == 0:
+            # compare outputs from before sleep and after wake-up
+            assert outputs[0].outputs[0].text == outputs_after_wakeup[0].outputs[0].text
+            print("Sleep and wake up successfully!!")
+
    for i, output in enumerate(outputs):
        if i >= 5:
            # print only 5 outputs
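The assertion assumes weights are sharded roughly evenly across the TP ranks, so sleeping at level 1 (which offloads weights and discards the KV cache) should free at least each rank's share of the weights. The arithmetic in isolation (values are illustrative):

    GiB_bytes = 1 << 30

    model_weight_gib = 1.0      # as passed via --model-weight-gib
    tensor_parallel_size = 2
    per_rank_floor = model_weight_gib / tensor_parallel_size * GiB_bytes
    print(f"each rank should free >= {per_rank_floor / GiB_bytes:.2f} GiB")  # 0.50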
@@ -214,12 +259,15 @@ def cleanup_env_and_memory():
            rank,
            master_addr,
            master_port,
+            args.model_weight_gib,
            args.model,
            world_size,
            tp_size,
            args.enable_expert_parallel,
            args.enforce_eager,
            args.trust_remote_code,
+            args.enable_sleep_mode,
+            args.temperature,
        ))

    proc.start()
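Note the ordering: args.model_weight_gib slots in before args.model because the worker arguments are positional and must mirror main()'s new signature exactly. A keyword-based sketch that would make the pairing explicit (assuming the surrounding code builds a multiprocessing Process, as the unchanged context suggests):

    import multiprocessing as mp

    # Hypothetical alternative, not part of this change: passing by keyword
    # means a reordered signature cannot silently mispair arguments.
    proc = mp.get_context("spawn").Process(
        target=main,
        kwargs=dict(local_rank=local_rank, rank=rank,
                    master_addr=master_addr, master_port=master_port,
                    model_weight_gib=args.model_weight_gib, model=args.model,
                    world_size=world_size, tensor_parallel_size=tp_size,
                    enable_expert_parallel=args.enable_expert_parallel,
                    enforce_eager=args.enforce_eager,
                    trust_remote_code=args.trust_remote_code,
                    enable_sleep_mode=args.enable_sleep_mode,
                    temperature=args.temperature),
    )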