@@ -140,6 +140,8 @@ def get_args():
         action="store_true",
         default=False,
     )
+    parser.add_argument("--tensor_parallelism", type=int, default=1)
+    parser.add_argument("--pipeline_parallelism", type=int, default=1)
     return parser.parse_args()


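For reference, a minimal standalone sketch of how the two new flags parse; the rest of the parser is omitted and the override values below are purely illustrative:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tensor_parallelism", type=int, default=1)
    parser.add_argument("--pipeline_parallelism", type=int, default=1)

    # Both default to 1 (no model parallelism). An explicit override such as
    # TP=4, PP=2 splits each model replica across 4-way tensor-parallel and
    # 2-way pipeline-parallel groups.
    args = parser.parse_args(["--tensor_parallelism", "4", "--pipeline_parallelism", "2"])
    assert args.tensor_parallelism == 4 and args.pipeline_parallelism == 2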
@@ -243,6 +245,8 @@ def main(args):
     train.trainer.devices = args.train_gpus
     train.trainer.num_nodes = args.train_nodes
     train.trainer.limit_val_batches = 32
+    train.trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism
+    train.trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism

     # 5. Export
     export = run.Partial(
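These strategy sizes follow the usual Megatron-style constraint: tensor parallelism times pipeline parallelism must evenly divide the training world size. A small sanity-check sketch, with illustrative values in place of the CLI arguments the real script reads:

    # Illustrative values only; the script takes these from get_args().
    train_gpus, train_nodes = 8, 1
    tensor_parallelism, pipeline_parallelism = 4, 2

    world_size = train_gpus * train_nodes
    model_parallel_size = tensor_parallelism * pipeline_parallelism
    assert world_size % model_parallel_size == 0, (
        f"TP ({tensor_parallelism}) x PP ({pipeline_parallelism}) must divide "
        f"the training world size ({world_size})"
    )
    data_parallel_size = world_size // model_parallel_size  # 1 in this example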
@@ -257,29 +261,33 @@ def main(args):
     mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py"
     eval_ptq = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", ptq_model_out],
+        args=["--nemo_ckpt", ptq_model_out, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_bf16 = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", bf16_ckpt_path],
+        args=["--nemo_ckpt", bf16_ckpt_path, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_sft = run.Script(
         mmlu_script_path,
-        args=["--finetuned_ckpt_dir", exp_dir],
+        args=["--finetuned_ckpt_dir", exp_dir, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )

     if args.use_slurm:
         cpu_executor = create_slurm_executor(SLURM_CONFIG)
-        gpu_executor = create_slurm_executor(
+        ptq_gpu_executor = create_slurm_executor(
             SLURM_CONFIG, num_gpus=args.ptq_gpus, ntasks_per_node=args.ptq_gpus
         )
+        train_gpu_executor = create_slurm_executor(
+            SLURM_CONFIG, num_gpus=args.train_gpus, ntasks_per_node=args.train_gpus
+        )
         single_gpu_executor = create_slurm_executor(SLURM_CONFIG, num_gpus=1, ntasks_per_node=1)
     else:
         cpu_executor = single_gpu_executor = run.LocalExecutor()
-        gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus)

     with run.Experiment(exp_dir, log_level="INFO") as exp:
         if not args.data_path:
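The non-Slurm branch now builds two dedicated torchrun executors instead of mutating one shared executor between stages, which is what lets the next hunk drop the executor re-configuration block before training. Roughly, assuming run is the nemo_run package as elsewhere in this script, with example GPU counts in place of the CLI arguments:

    import nemo_run as run

    ptq_gpus, train_gpus = 4, 8  # example values; the script uses args.ptq_gpus / args.train_gpus
    ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=ptq_gpus)
    train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=train_gpus)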
@@ -294,45 +302,46 @@ def main(args):
             eval_bf16,
             tail_logs=True,
             name="02_mmlu_bf16",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s1],
         )

         # 2. PTQ model and evaluate PTQ model
-        s2 = exp.add(ptq, tail_logs=True, name="03_ptq", executor=gpu_executor, dependencies=[s1])
+        s2 = exp.add(
+            ptq, tail_logs=True, name="03_ptq", executor=ptq_gpu_executor, dependencies=[s1]
+        )
         s3 = exp.add(
             eval_ptq,
             tail_logs=True,
             name="04_mmlu_ptq",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s2],
         )
         # 3. Train PTQ model (QAT or QAD)
-        if args.use_slurm:  # Set training arguments
-            gpu_executor.nodes = args.train_nodes
-            gpu_executor.gpus_per_node = gpu_executor.ntasks_per_node = args.train_gpus
-        else:
-            gpu_executor.ntasks_per_node = args.train_gpus
         train_dep = [s3]
         if not args.data_path:
             train_dep.append(s0)
         s4 = exp.add(
-            train, tail_logs=True, name="05_train", executor=gpu_executor, dependencies=train_dep
+            train,
+            tail_logs=True,
+            name="05_train",
+            executor=train_gpu_executor,
+            dependencies=train_dep,
         )
-
         s5 = exp.add(
             eval_sft,
             tail_logs=True,
             name="06_mmlu_sft",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s4],
         )
-        gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
+        # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo
+        train_gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
         exp.add(
             export,
             tail_logs=True,
             name="07_export_hf",
-            executor=gpu_executor,
+            executor=train_gpu_executor,
             dependencies=[s5],
         )
         exp.run(detach=True)
@@ -356,10 +365,7 @@ def main(args):
         use_local_tunnel=False,
         host="",
         user="",
-        container_mounts=[
-            "/path/to/logs:/path/to/logs",
-            "/path/to/NeMo:/opt/NeMo",
-        ],
+        container_mounts=[],
         job_dir="/path/to/logs",
         identity=None,
     )
@@ -369,7 +375,7 @@ def main(args):
     SEQUENCE_LENGTH = 4096
     MBS = 1
     GBS = 512
-    TRAIN_STEPS = 200
+    TRAIN_STEPS = 400
     VAL_INTERVAL = 50
     # # # # # # # # # # # # # # # # # # # # # #
