Skip to content

Commit 14f15ab

Browse files
committed
respond to comments
Signed-off-by: Jennifer Chen <[email protected]>
1 parent 856159e commit 14f15ab

File tree

3 files changed

+20
-9
lines changed

3 files changed

+20
-9
lines changed

examples/nemo_run/common/in_memory_mmlu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def parse_args():
5151
ckpt_path,
5252
tensor_model_parallel_size=args.tensor_parallelism,
5353
pipeline_model_parallel_size=args.pipeline_parallelism,
54+
devices=args.tensor_parallelism * args.pipeline_parallelism,
5455
)
5556
tokenizer = model.tokenizer.tokenizer
5657
megatron_mmlu(model.module, tokenizer)

examples/nemo_run/qat/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,16 @@ You can run the example either locally or on a [Slurm cluster](ADVANCED.md).
4747
To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and the `NeMo` repository (checking out a specific commit for NeMo), then mount them onto your docker container.
4848

4949
- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
50-
- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout ddcb75f`
50+
- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
5151

5252
Example docker command:
5353

54-
```
54+
```bash
5555
docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
5656
```
5757

58+
You will also need to set your Huggingface token with `export HF_TOKEN=<your-token>`. You may also need to give the docker container write access to the `examples/nemo_run` folder by running `chmod 777 nemo_run` so that logs can be written.
59+
5860
### Running the Flow Locally
5961

6062
After launching the NeMo container with the specified mounts, follow these examples to run the flow locally.

examples/nemo_run/qat/nemo_qat_flow.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,17 @@ def main(args):
153153
exp_dir = f"{args.log_dir.rstrip('/')}/{args.experiment}"
154154

155155
# 1. Process data
156+
# TODO figure out path
157+
# LOCALLY common/process.py works
158+
# On slurm examples/nemo_run/common/process.py works
159+
160+
openscience_path = os.path.abspath(
161+
os.path.join(os.path.dirname(__file__), "../common/process_openscience.py")
162+
)
156163
openscience_data = run.Script(
157-
os.path.abspath(
158-
os.path.join(os.path.dirname(__file__), "../common/process_openscience.py")
159-
),
164+
openscience_path
165+
if not args.use_slurm
166+
else "examples/nemo_run/common/process_openscience.py",
160167
entrypoint="python",
161168
args=["--output-dir", exp_dir],
162169
)
@@ -226,7 +233,6 @@ def main(args):
226233
train = distillation_recipe(ptq_model_out, bf16_ckpt_path)
227234
else:
228235
train = get_finetune_recipe(args.finetune_recipe)
229-
# TODO support resume from previous experiment?
230236
train.resume.restore_config.path = ptq_model_out
231237
train.optim.config.lr = args.learning_rate
232238
train.tokenizer = "data"
@@ -236,7 +242,7 @@ def main(args):
236242
train.trainer.max_steps = TRAIN_STEPS
237243
train.trainer.devices = args.train_gpus
238244
train.trainer.num_nodes = args.train_nodes
239-
train.trainer.limit_val_batches = 2 # TODO remove
245+
train.trainer.limit_val_batches = 32
240246

241247
# 5. Export
242248
export = run.Partial(
@@ -247,6 +253,8 @@ def main(args):
247253
mmlu_script_path = os.path.abspath(
248254
os.path.join(os.path.dirname(__file__), "../common/in_memory_mmlu.py")
249255
)
256+
if args.use_slurm:
257+
mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py"
250258
eval_ptq = run.Script(
251259
mmlu_script_path,
252260
args=["--nemo_ckpt", ptq_model_out],
@@ -343,7 +351,7 @@ def main(args):
343351
time="240",
344352
container_image="nvcr.io/nvidia/nemo:25.07",
345353
env_vars={
346-
"HF_TOKEN": "<your-token>",
354+
"HF_TOKEN": "",
347355
},
348356
use_local_tunnel=False,
349357
host="",
@@ -360,7 +368,7 @@ def main(args):
360368
# # # # # CONFIGURABLE PARAMETERS # # # # #
361369
SEQUENCE_LENGTH = 4096
362370
MBS = 1
363-
GBS = 256
371+
GBS = 512
364372
TRAIN_STEPS = 200
365373
VAL_INTERVAL = 50
366374
# # # # # # # # # # # # # # # # # # # # # #

0 commit comments

Comments
 (0)