Skip to content

Commit 14f15ab

Browse files
committed
respond to comments
Signed-off-by: Jennifer Chen <[email protected]>
1 parent 856159e commit 14f15ab

File tree

3 files changed

+20
-9
lines changed

3 files changed

+20
-9
lines changed

examples/nemo_run/common/in_memory_mmlu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def parse_args():
5151
ckpt_path,
5252
tensor_model_parallel_size=args.tensor_parallelism,
5353
pipeline_model_parallel_size=args.pipeline_parallelism,
54+
devices=args.tensor_parallelism * args.pipeline_parallelism,
5455
)
5556
tokenizer = model.tokenizer.tokenizer
5657
megatron_mmlu(model.module, tokenizer)

examples/nemo_run/qat/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,16 @@ You can run the example either locally or on a [Slurm cluster](ADVANCED.md).
4747
To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and the `NeMo` repository (checking out a specific commit for NeMo), then mount them onto your docker container.
4848

4949
- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
50-
- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout ddcb75f`
50+
- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
5151

5252
Example docker command:
5353

54-
```
54+
```bash
5555
docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
5656
```
5757

58+
You will also need to set your Huggingface token with `export HF_TOKEN=<your-token>`. You may also need to give the docker container write access to the `examples/nemo_run` folder by running `chmod 777 nemo_run` so that logs can be written.
59+
5860
### Running the Flow Locally
5961

6062
After launching the NeMo container with the specified mounts, follow these examples to run the flow locally.

examples/nemo_run/qat/nemo_qat_flow.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,17 @@ def main(args):
153153
exp_dir = f"{args.log_dir.rstrip('/')}/{args.experiment}"
154154

155155
# 1. Process data
156+
# TODO figure out path
157+
# LOCALLY common/process.py works
158+
# On slurm examples/nemo_run/common/process.py works
159+
160+
openscience_path = os.path.abspath(
161+
os.path.join(os.path.dirname(__file__), "../common/process_openscience.py")
162+
)
156163
openscience_data = run.Script(
157-
os.path.abspath(
158-
os.path.join(os.path.dirname(__file__), "../common/process_openscience.py")
159-
),
164+
openscience_path
165+
if not args.use_slurm
166+
else "examples/nemo_run/common/process_openscience.py",
160167
entrypoint="python",
161168
args=["--output-dir", exp_dir],
162169
)
@@ -226,7 +233,6 @@ def main(args):
226233
train = distillation_recipe(ptq_model_out, bf16_ckpt_path)
227234
else:
228235
train = get_finetune_recipe(args.finetune_recipe)
229-
# TODO support resume from previous experiment?
230236
train.resume.restore_config.path = ptq_model_out
231237
train.optim.config.lr = args.learning_rate
232238
train.tokenizer = "data"
@@ -236,7 +242,7 @@ def main(args):
236242
train.trainer.max_steps = TRAIN_STEPS
237243
train.trainer.devices = args.train_gpus
238244
train.trainer.num_nodes = args.train_nodes
239-
train.trainer.limit_val_batches = 2 # TODO remove
245+
train.trainer.limit_val_batches = 32
240246

241247
# 5. Export
242248
export = run.Partial(
@@ -247,6 +253,8 @@ def main(args):
247253
mmlu_script_path = os.path.abspath(
248254
os.path.join(os.path.dirname(__file__), "../common/in_memory_mmlu.py")
249255
)
256+
if args.use_slurm:
257+
mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py"
250258
eval_ptq = run.Script(
251259
mmlu_script_path,
252260
args=["--nemo_ckpt", ptq_model_out],
@@ -343,7 +351,7 @@ def main(args):
343351
time="240",
344352
container_image="nvcr.io/nvidia/nemo:25.07",
345353
env_vars={
346-
"HF_TOKEN": "<your-token>",
354+
"HF_TOKEN": "",
347355
},
348356
use_local_tunnel=False,
349357
host="",
@@ -360,7 +368,7 @@ def main(args):
360368
# # # # # CONFIGURABLE PARAMETERS # # # # #
361369
SEQUENCE_LENGTH = 4096
362370
MBS = 1
363-
GBS = 256
371+
GBS = 512
364372
TRAIN_STEPS = 200
365373
VAL_INTERVAL = 50
366374
# # # # # # # # # # # # # # # # # # # # # #

0 commit comments

Comments
 (0)