Commit 3090a98

Review suggestions

Signed-off-by: Asha Anoosheh <[email protected]>

1 parent: d632578

3 files changed: +22 -37 lines changed

examples/nemo_run/common/process_climbmix.py

Lines changed: 3 additions & 11 deletions
@@ -41,27 +41,19 @@ def get_args():
         default="Qwen/Qwen3-8B",
         help="Tokenizer to use for preprocessing",
     )
-    parser.add_argument(
-        "--subset-indices",
-        help="Comma-separated subset indices to download",
-    )
     return parser.parse_args()


 if __name__ == "__main__":
     args = get_args()
-    Path(args.output_dir).mkdir(exist_ok=True)
+    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

     # create raw and processed directories
     raw_dir = Path(args.output_dir) / "climbmix_raw"
     proc_dir = Path(args.output_dir) / "climbmix_proc"

     # only download the subset of the data
-    if args.subset_indices:
-        subset_idx = [int(i) for i in args.subset_indices.split(",")]
-    else:
-        subset_idx = SUBSET_IDX
-    subset_filenames = [f"part_{i}.jsonl" for i in subset_idx]
+    subset_filenames = [f"part_{i}.jsonl" for i in SUBSET_IDX]

     # download raw data
     snapshot_download(
@@ -72,7 +64,7 @@ def get_args():
     )

     # preprocess (tokenize)
-    print("Processing ClimbMix dataset...")
+    print("Tokenizing ClimbMix dataset...")
     input_paths = [raw_dir / name for name in subset_filenames]
     megatron_preprocess_data(
         input_paths,
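Note: the mkdir change above matters when `--output-dir` points at a nested path whose parent directories do not exist yet. A minimal sketch of the difference (the path below is purely illustrative):

```python
from pathlib import Path

out = Path("/tmp/climbmix_demo/data/output")  # illustrative nested output dir

# Before this commit: mkdir(exist_ok=True) raises FileNotFoundError
# if "/tmp/climbmix_demo/data" does not already exist.
# out.mkdir(exist_ok=True)

# After this commit: missing parent directories are created as well.
out.mkdir(parents=True, exist_ok=True)
```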

examples/nemo_run/prune_distill/README.md

Lines changed: 9 additions & 12 deletions
@@ -2,10 +2,6 @@

 # NeMo Pruning + Knowledge Distillation Simplified Flow Example

-[Slurm Examples](ADVANCED.md) |
-[Advanced Topics](ADVANCED.md) |
-[NeMo Integration](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/modelopt)
-
 </div>

 ## Overview
@@ -36,14 +32,15 @@ graph TD;

 ## Results

-Pruning + Knowledge Distillation of Qwen3-8B achieves significant model compression while recovering most of the accuracy through distillation. We depth-prune the model from 32 to 24 layers (reducing from 8B to 6B parameters) and distill for ~14,000 steps with a learning rate of 1e-4 and global batch size of 768 using a 25% subset of the [ClimbMix dataset](https://huggingface.co/datasets/OptimalScale/ClimbMix). (This is about 90 billion tokens and takes a total of ~6k H100 GPU hours)
+Pruning + Knowledge Distillation of Qwen3-8B achieves significant model compression while recovering most of the accuracy through distillation. We depth-prune the model from 32 to 24 layers (reducing from 8B to 6B parameters) and distill for ~28,000 steps (determined by the sequence length, default 4096) with a learning rate of 1e-4 and a global batch size of 768, using a 25% subset of the [ClimbMix dataset](https://huggingface.co/datasets/OptimalScale/ClimbMix). (This is about 90 billion tokens and takes a total of ~6k H100 GPU hours.)

-|                           | Tokens per Second | MMLU |
-|---------------------------|-------------------|------|
-| Qwen3-8B Original         | 4420              | 74.9 |
-| Qwen3-6B Pruned+Distilled | 6950              | 72.5 |
+|                                   | Tokens per Second | MMLU |
+|-----------------------------------|-------------------|------|
+| Qwen3-8B Original                 | 4420              | 74.9 |
+| Qwen3-6B Pruned+Distilled from 8B | 6950              | 72.5 |
+| Qwen3-4B Original (comparison)    | 5210              | 70.0 |

-The resulting compressed model maintains competitive performance while being significantly faster with a smaller memory footprint.
+The resulting compressed student maintains competitive accuracy while running significantly faster and using less memory than the teacher. It also achieves both higher accuracy and higher throughput than the existing Qwen3-4B model.

 ## Usage

@@ -58,7 +55,7 @@ To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia
 Example docker command:

 ```bash
-docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash
+docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer:/opt/TensorRT-Model-Optimizer --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash
 ```

 You will also need to set your Huggingface token with `export HF_TOKEN=<your-token>`. You may also need to enable write access to the docker container to the `examples/nemo_run` folder by doing `chmod 777 nemo_run` so that logs can be written.
@@ -84,7 +81,7 @@ From the `nemo_run` folder, launch the example with the `nemo_prune_kd_flow.py`
 To perform Pruning + Knowledge Distillation, run:

 ```bash
-python prune_distill/nemo_prune_kd_flow.py --log-dir /my/log/dir --data-dir /path/to/climbix_proc --use-slurm
+python prune_distill/nemo_prune_kd_flow.py --log-dir /my/log/dir --data-dir /path/to/climbmix_proc --use-slurm
 ```

 ## Supported models
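Note: the updated "~28,000 steps" figure in the Results hunk above follows from the token budget and batch settings changed later in this commit (~90B tokens, global batch size 768, sequence length 4096). A quick back-of-the-envelope check in Python, using only numbers that appear in this diff:

```python
# Step count implied by the token budget (values from this commit).
NUM_TOKENS = int(90e9)   # ~90 billion training tokens
DISTILL_GBS = 768        # global batch size

for seq_len in (8192, 4096):  # old vs. new default SEQUENCE_LENGTH
    steps = NUM_TOKENS // (DISTILL_GBS * seq_len)
    print(f"seq_len={seq_len}: ~{steps:,} steps")
# seq_len=8192: ~14,305 steps  (the README's previous "~14,000")
# seq_len=4096: ~28,610 steps  (the README's new "~28,000")
```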

examples/nemo_run/prune_distill/nemo_prune_kd_flow.py

Lines changed: 10 additions & 14 deletions
@@ -41,9 +41,9 @@ def get_args():
         default="prune_distill_flow",
     )
     parser.add_argument(
-        "--model-name",
+        "--model-id-or-path",
         type=str,
-        help="Name of the HF model",
+        help="ID or path of the HF model",
         default="Qwen/Qwen3-8B",
     )
     parser.add_argument(
@@ -55,12 +55,6 @@
             "<model_name>_<model_size>(_<long_sequence_length> or other special settings)"
         ),
     )
-    parser.add_argument(
-        "--hf-tokenizer",
-        type=str,
-        help="Name of HF model to use for tokenizer.",
-        default="Qwen/Qwen3-8B",
-    )
     parser.add_argument(
         "--prune-target-num-layers",
         type=int,
@@ -119,10 +113,12 @@ def main(args):
             seq_length=SEQUENCE_LENGTH,
         )
     else:
+        if not args.data_dir:
+            raise ValueError("--data-dir must be provided unless --mock-run is enabled.")
         tokenizer = run.Config(
             get_nmt_tokenizer,
             library="huggingface",
-            model_name=args.hf_tokenizer,
+            model_name=args.model_id_or_path,
         )
         data = run.Config(
             PreTrainingDataModule,
@@ -140,7 +136,7 @@ def main(args):
     import_model = run.Partial(
         llm.import_ckpt,
         model=model_module.model(),
-        source=f"hf://{args.model_name}",
+        source=f"hf://{args.model_id_or_path}",
         output_path=initial_model_out,
         overwrite=True,
     )
@@ -154,7 +150,7 @@ def main(args):
         nemo_checkpoint=initial_model_out,
         save_path=pruned_model_out,
     )
-    prune.tokenizer_path = args.hf_tokenizer
+    prune.tokenizer_path = args.model_id_or_path
     prune.pruning_config.target_num_layers = args.prune_target_num_layers
     prune.devices = 1
     prune.pp_size = 1
@@ -304,7 +300,7 @@ def main(args):

     # # # # # # # # # # # # # # # # # # # # # #
     # # # # # CONFIGURABLE PARAMETERS # # # # #
-    SEQUENCE_LENGTH = 8192
+    SEQUENCE_LENGTH = 4096
     PRUNE_MBS = 4
     DISTILL_MBS = 2
     VAL_BATCHES = 32
@@ -318,9 +314,9 @@ def main(args):
         DISTILL_STEPS = 20
         VAL_INTERVAL = 10
     else:
-        PRUNE_SAMPLES = 1024
+        PRUNE_SAMPLES = 512
         DISTILL_GBS = 768
-        _NUM_TOKENS = 89694564352
+        _NUM_TOKENS = int(90e9)
         DISTILL_STEPS = int(_NUM_TOKENS / DISTILL_GBS / SEQUENCE_LENGTH)
         VAL_INTERVAL = 1000
     # # # # # # # # # # # # # # # # # # # # # #
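Note: one net effect of the `nemo_prune_kd_flow.py` changes above is that a single `--model-id-or-path` value now drives the checkpoint import, the tokenizer, and the pruning tokenizer path (the separate `--hf-tokenizer` flag is gone), and non-mock runs must pass `--data-dir`. A small illustrative sketch; the argparse setup below is a stand-in, not the script's exact argument definitions:

```python
import argparse

# Stand-in parser mirroring the flags touched by this commit.
parser = argparse.ArgumentParser()
parser.add_argument("--model-id-or-path", type=str, default="Qwen/Qwen3-8B",
                    help="ID or path of the HF model")
parser.add_argument("--data-dir", type=str, default=None)
parser.add_argument("--mock-run", action="store_true")
args = parser.parse_args(["--mock-run"])

# One identifier now feeds all three places that previously could diverge
# between --model-name and --hf-tokenizer:
import_source = f"hf://{args.model_id_or_path}"   # llm.import_ckpt source
tokenizer_name = args.model_id_or_path            # get_nmt_tokenizer model_name
pruning_tokenizer_path = args.model_id_or_path    # prune.tokenizer_path

# Mirrors the new guard: real (non-mock) runs must point at preprocessed data.
if not args.mock_run and not args.data_dir:
    raise ValueError("--data-dir must be provided unless --mock-run is enabled.")

print(import_source, tokenizer_name, pruning_tokenizer_path)
```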
