@@ -140,6 +140,8 @@ def get_args():
         action="store_true",
         default=False,
     )
+    parser.add_argument("--tensor_parallelism", type=int, default=1)
+    parser.add_argument("--pipeline_parallelism", type=int, default=1)
     return parser.parse_args()


@@ -243,6 +245,8 @@ def main(args):
     train.trainer.devices = args.train_gpus
     train.trainer.num_nodes = args.train_nodes
     train.trainer.limit_val_batches = 32
+    train.trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism
+    train.trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism

     # 5. Export
     export = run.Partial(
@@ -257,29 +261,33 @@ def main(args):
     mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py"
     eval_ptq = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", ptq_model_out],
+        args=["--nemo_ckpt", ptq_model_out, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_bf16 = run.Script(
         mmlu_script_path,
-        args=["--nemo_ckpt", bf16_ckpt_path],
+        args=["--nemo_ckpt", bf16_ckpt_path, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )
     eval_sft = run.Script(
         mmlu_script_path,
-        args=["--finetuned_ckpt_dir", exp_dir],
+        args=["--finetuned_ckpt_dir", exp_dir, "--tensor_parallelism", f"{args.ptq_gpus}"],
         entrypoint="python",
     )

     if args.use_slurm:
         cpu_executor = create_slurm_executor(SLURM_CONFIG)
-        gpu_executor = create_slurm_executor(
+        ptq_gpu_executor = create_slurm_executor(
             SLURM_CONFIG, num_gpus=args.ptq_gpus, ntasks_per_node=args.ptq_gpus
         )
+        train_gpu_executor = create_slurm_executor(
+            SLURM_CONFIG, num_gpus=args.train_gpus, ntasks_per_node=args.train_gpus
+        )
         single_gpu_executor = create_slurm_executor(SLURM_CONFIG, num_gpus=1, ntasks_per_node=1)
     else:
         cpu_executor = single_gpu_executor = run.LocalExecutor()
-        gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus)
+        train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus)

     with run.Experiment(exp_dir, log_level="INFO") as exp:
         if not args.data_path:
@@ -294,45 +302,46 @@ def main(args):
             eval_bf16,
             tail_logs=True,
             name="02_mmlu_bf16",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s1],
         )

         # 2. PTQ model and evaluate PTQ model
-        s2 = exp.add(ptq, tail_logs=True, name="03_ptq", executor=gpu_executor, dependencies=[s1])
+        s2 = exp.add(
+            ptq, tail_logs=True, name="03_ptq", executor=ptq_gpu_executor, dependencies=[s1]
+        )
         s3 = exp.add(
             eval_ptq,
             tail_logs=True,
             name="04_mmlu_ptq",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s2],
         )
         # 3. Train PTQ model (QAT or QAD)
-        if args.use_slurm:  # Set training arguments
-            gpu_executor.nodes = args.train_nodes
-            gpu_executor.gpus_per_node = gpu_executor.ntasks_per_node = args.train_gpus
-        else:
-            gpu_executor.ntasks_per_node = args.train_gpus
         train_dep = [s3]
         if not args.data_path:
             train_dep.append(s0)
         s4 = exp.add(
-            train, tail_logs=True, name="05_train", executor=gpu_executor, dependencies=train_dep
+            train,
+            tail_logs=True,
+            name="05_train",
+            executor=train_gpu_executor,
+            dependencies=train_dep,
         )
-
         s5 = exp.add(
             eval_sft,
             tail_logs=True,
             name="06_mmlu_sft",
-            executor=single_gpu_executor,
+            executor=ptq_gpu_executor,
             dependencies=[s4],
         )
-        gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
+        # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo
+        train_gpu_executor.ntasks_per_node = 1  # will throw error if more than 1 task during export
         exp.add(
             export,
             tail_logs=True,
             name="07_export_hf",
-            executor=gpu_executor,
+            executor=train_gpu_executor,
             dependencies=[s5],
         )
         exp.run(detach=True)
@@ -356,10 +365,7 @@ def main(args):
         use_local_tunnel=False,
         host="",
         user="",
-        container_mounts=[
-            "/path/to/logs:/path/to/logs",
-            "/path/to/NeMo:/opt/NeMo",
-        ],
+        container_mounts=[],
         job_dir="/path/to/logs",
         identity=None,
     )
@@ -369,7 +375,7 @@ def main(args):
     SEQUENCE_LENGTH = 4096
     MBS = 1
     GBS = 512
-    TRAIN_STEPS = 200
+    TRAIN_STEPS = 400
     VAL_INTERVAL = 50
     # # # # # # # # # # # # # # # # # # # # # #

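For orientation, below is a minimal local-mode sketch (not part of the diff) of the executor split introduced above. It reuses only calls already shown in the script (run.LocalExecutor with launcher="torchrun"); the import alias, GPU counts, and parallelism sizes are placeholder assumptions standing in for the parsed CLI arguments.

import nemo_run as run  # assumed import alias; the script above refers to the package as `run`

# Placeholder values standing in for args.ptq_gpus / args.train_gpus and the new flags.
ptq_gpus, train_gpus = 8, 4
tensor_parallelism, pipeline_parallelism = 2, 1

# For a single-node local run, TP x PP cannot exceed the GPUs given to the training stage.
assert tensor_parallelism * pipeline_parallelism <= train_gpus

# PTQ and the MMLU evaluations share an executor sized for the PTQ GPU count,
# while training (and the final export) uses an executor sized for the training GPU count.
ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=ptq_gpus)
train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=train_gpus)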