From d6325788b3afd83c2b8a71af6f9b4c9eca2bb136 Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Tue, 26 Aug 2025 06:23:25 -0700 Subject: [PATCH 1/3] Squashed commit Signed-off-by: Asha Anoosheh --- examples/llm_distill/README.md | 1 + examples/nemo_run/common/process_climbmix.py | 85 ++++ examples/nemo_run/prune_distill/README.md | 124 +++--- .../prune_distill/nemo_prune_kd_flow.py | 374 ++++++++++-------- 4 files changed, 368 insertions(+), 216 deletions(-) create mode 100644 examples/nemo_run/common/process_climbmix.py diff --git a/examples/llm_distill/README.md b/examples/llm_distill/README.md index 3b44b4f77..dbb71fb26 100644 --- a/examples/llm_distill/README.md +++ b/examples/llm_distill/README.md @@ -16,6 +16,7 @@ This section focuses on demonstrating how to apply Model Optimizer to perform kn | Distillation with NeMo | Learn how to distill your models with NeMo Framework | \[[Link](#knowledge-distillation-kd-for-nvidia-nemo-models)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html)\] | | Distillation with Huggingface | Learn how to distill your models with Hugging Face | \[[Link](#knowledge-distillation-kd-for-huggingface-models)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html)\] | | Resources | Extra links to relevant resources | \[[Link](#resources)\] | | +| NeMo Prune + Distill Simplified Flow | Example script demonstrating end-to-end pruning plus distillation in NeMo | \[[Link](../nemo_run/prune_distill/README.md)\] | | diff --git a/examples/nemo_run/common/process_climbmix.py b/examples/nemo_run/common/process_climbmix.py new file mode 100644 index 000000000..27533d93f --- /dev/null +++ b/examples/nemo_run/common/process_climbmix.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +from pathlib import Path + +from huggingface_hub import snapshot_download + +from modelopt.torch.utils.plugins import megatron_preprocess_data + +SUBSET_IDX = [ + *[0, 1, 6, 10, 11], + *[12, 13, 14, 21, 24], + *[33, 35, 38, 40, 48], + *[49, 52, 66, 70, 76], + *[83, 88, 91, 94, 99], +] # 25% of total dataset + + +def get_args(): + parser = argparse.ArgumentParser(description="Process ClimbMix dataset") + parser.add_argument( + "--output-dir", + default=".", + help="Path to the directory to store the processed dataset", + ) + parser.add_argument( + "--tokenizer", + default="Qwen/Qwen3-8B", + help="Tokenizer to use for preprocessing", + ) + parser.add_argument( + "--subset-indices", + help="Comma-separated subset indices to download", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + Path(args.output_dir).mkdir(exist_ok=True) + + # create raw and processed directories + raw_dir = Path(args.output_dir) / "climbmix_raw" + proc_dir = Path(args.output_dir) / "climbmix_proc" + + # only download the subset of the data + if args.subset_indices: + subset_idx = [int(i) for i in args.subset_indices.split(",")] + else: + subset_idx = SUBSET_IDX + subset_filenames = [f"part_{i}.jsonl" for i in subset_idx] + + # download raw data + snapshot_download( + repo_id="OptimalScale/ClimbMix", + repo_type="dataset", + local_dir=raw_dir, + allow_patterns=subset_filenames, + ) + + # preprocess (tokenize) + print("Processing ClimbMix dataset...") + input_paths = [raw_dir / name for name in subset_filenames] + megatron_preprocess_data( + input_paths, + output_dir=proc_dir, + tokenizer_name_or_path=args.tokenizer, + append_eod=True, + max_sequence_length=32000, + workers=8, + log_interval=10000, + ) diff --git a/examples/nemo_run/prune_distill/README.md b/examples/nemo_run/prune_distill/README.md index a6e493aca..2f753da59 100644 --- a/examples/nemo_run/prune_distill/README.md +++ b/examples/nemo_run/prune_distill/README.md @@ -1,95 +1,101 @@ -# Pruning and Knowledge Distillation Nemo Run example +
+ +# NeMo Pruning + Knowledge Distillation Simplified Flow Example + +[Slurm Examples](ADVANCED.md) | +[Advanced Topics](ADVANCED.md) | +[NeMo Integration](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/modelopt) + +
## Overview -This directory contains the NeMo 2.0 Pruning + Knowledge Distillation flow implementation. The main script `nemo_prune_kd_flow.py` enables model compression through structured pruning followed by knowledge distillation to recover performance. +This directory contains an end-to-end Pruning + Knowledge Distillation Simplified Flow example using NeMo for model compression. It supports structured pruning followed by knowledge distillation to recover performance after compression. -## Usage +After structured pruning, the compressed model may show some accuracy degradation; the knowledge distillation stage aims to recover that loss by transferring knowledge from the full-precision teacher model to the pruned student model. -### Prerequisites +## Flow Stages -#### Install NeMo 2.0 and related dependencies +The Simplified Flow runs the following steps in order: -To run the example, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.04.01 or higher using Docker/Slurm. Mount your cloned `modelopt` repository to the container by adding this mount flag to your Docker/Slurm command: `-v :/workspace/modelopt -v /modelopt:/usr/local/lib/python3.12/dist-packages/modelopt`. +1. 01_import — Import HuggingFace model to NeMo format +1. 02_prune — Apply structured pruning to create a compressed student model +1. 03_distill — Knowledge distillation from teacher to pruned student model +1. 04_export — Export final compressed model to HuggingFace format +1. eval_teacher — Evaluate teacher model on 5% of MMLU benchmark +1. eval_student — Evaluate student model on 5% of MMLU benchmark -To run SFT properly you may also need to clone NeMo and Megatron-LM at the respective commits, and mount to `/opt/NeMo` and `/opt/megatron-lm`: +```mermaid +graph TD; +01_import-->02_prune; +01_import-->eval_teacher; +02_prune-->03_distill; +03_distill-->eval_student; +03_distill-->04_export; +``` -- `git clone https://github.com/NVIDIA-NeMo/NeMo && cd NeMo && git checkout d7b87b1` -- `git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout 8c15450` +## Results -### Data Preparation +Pruning + Knowledge Distillation of Qwen3-8B achieves significant model compression while recovering most of the accuracy through distillation. We depth-prune the model from 32 to 24 layers (reducing from 8B to 6B parameters) and distill for ~14,000 steps with a learning rate of 1e-4 and global batch size of 768 using a 25% subset of the [ClimbMix dataset](https://huggingface.co/datasets/OptimalScale/ClimbMix). (This is about 90 billion tokens and takes a total of ~6k H100 GPU hours) -The script supports chat datasets in ShareGPT or HuggingFace/OpenAI chat format. You can prepare your dataset in JSONL format with the required chat structure. To provide your own custom dataset, use the `--data-path` flag, otherwise the default [LIMA](https://huggingface.co/datasets/GAIR/lima) dataset will be used. +| | Tokens per Second | MMLU | +|---------------------------|-------------------|------| +| Qwen3-8B Original | 4420 | 74.9 | +| Qwen3-6B Pruned+Distilled | 6950 | 72.5 | -### Running the Flow +The resulting compressed model maintains competitive performance while being significantly faster with a smaller memory footprint. -#### Standard Usage +## Usage -From the `nemo_run` folder, run: +### Prerequisites -```bash -python prune_distill/nemo_prune_kd_flow.py --data_path your_dataset.jsonl -``` +You can run the example either locally or on a [Slurm cluster](ADVANCED.md). 
-#### Mock Run (for testing) +To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09 or higher. Clone the `TensorRT-Model-Optimizer` and `NeMo` repositories (check out a specific commit for NeMo), then mount them into your docker container. -To test the flow without actual data, run the following command from the `nemo_run` folder: +- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git` + +Example docker command: ```bash -python prune_distill/nemo_prune_kd_flow.py --mock_run +docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash ``` -### Flow Stages +You will also need to set your Huggingface token with `export HF_TOKEN=`. You may also need to enable write access to the docker container to the `examples/nemo_run` folder by doing `chmod 777 nemo_run` so that logs can be written. -The script executes the following stages in sequence: +### Dataset Preparation -1. Process LIMA data (if `--data-path` is not specified) -1. **Import Model**: Imports the HuggingFace model to NeMo format -1. **Fine-tuning**: Fine-tunes the model on the provided dataset -1. **Pruning**: Prunes the fine-tuned model to create a smaller student model -1. **Knowledge Distillation**: Distills knowledge from the teacher to the pruned student model -1. **Export**: Exports the final compressed model +Unlike the QAT flow, this workflow does not automatically download the dataset due to its large size and long tokenization time. +You must first prepare the dataset by running: -### Configuration Parameters +```bash +python ../common/process_climbmix.py --output-dir /path/to/save +``` -The script includes several configurable parameters: +This will download and process the ClimbMix dataset, creating the necessary data files for training. -- **GPUS**: Number of GPUs (default: 8) -- **SEQUENCE_LENGTH**: Maximum sequence length (default: 8192) -- **MBS**: Micro batch size (default: 2) -- **GBS**: Global batch size (default: 2048 for real runs, 8 for mock runs) -- **FINETUNE_STEPS**: Number of fine-tuning steps (default: 2500 for real runs, 20 for mock runs) -- **DISTILL_STEPS**: Number of distillation steps (default: 7500 for real runs, 20 for mock runs) -- **VAL_INTERVAL**: Validation interval (default: 500 for real runs, 10 for mock runs) -- **PRUNE_SAMPLES**: Number of samples for pruning calibration (default: 1024 for real runs, 3 for mock runs) +### Running the Flow via Slurm -### Pruning Configuration +After launching the NeMo container with the specified mounts, change the contents of the `SLURM_CONFIG` in `nemo_prune_kd_flow.py` +to reflect your environment, and then perform the following: -- **Target Hidden Size**: Default is 3072 (configurable via `--prune_target_hidden_size`) -- **Target FFN Hidden Size**: Automatically set to 3 × target_hidden_size -- **Pruning Method**: Structured pruning to reduce model dimensions +From the `nemo_run` folder, launch the example with the `nemo_prune_kd_flow.py` script. To use a different model than the default model (Qwen3-8B), you can add the `--model-name --base-recipe ` flags and use the model's HuggingFace name and NeMo recipe names listed [here](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/llm/recipes). Provide the processed dataset path using the `--data-dir` flag. 
-### Output +To perform Pruning + Knowledge Distillation, run: -The script generates the following outputs in the specified log directory: +```bash +python prune_distill/nemo_prune_kd_flow.py --log-dir /my/log/dir --data-dir /path/to/climbix_proc --use-slurm +``` -- `{model_name}_initial/`: Initial NeMo checkpoint -- `finetune_log_dir/`: Fine-tuning logs and checkpoints (teacher model) -- `{model_name}_pruned/`: Pruned student model -- `distill_log_dir/`: Knowledge distillation logs and checkpoints -- `{model_name}_final/`: Final compressed model after distillation +## Supported models -### Supported Models +Locally this script currently supports models that can be trained on 1 node with 8 x 80GB GPUs. On Slurm you can configure the number of nodes/gpus for training and pruning with the following flags: `--nodes`, `--train-gpus`. -Currently supports models that can be trained on 1 node with 8 x 80GB GPUs. The default configuration uses: +The default configuration works on 1 node with 8 H100 GPUs: -- **Model**: Meta-Llama-3.1-8B -- **Recipe**: llama31_8b -- **Pruning Strategy**: Structured pruning with knowledge distillation recovery +- **Model**: Qwen/Qwen3-8B +- **Recipe**: qwen3_8b -### Troubleshooting +### Dataset limitations -1. **GPU Memory Issues**: Reduce batch sizes (MBS, GBS) if encountering OOM errors -1. **Data Format**: Ensure your dataset follows the expected chat format -1. **NeMo Installation**: If encountering NeMo-related errors, use the recommended docker container -1. **Model Size**: Ensure your model fits within the 8-GPU configuration +The current pruning + knowledge distillation recipe has been tuned for the Qwen3-8B model to achieve significant speedup while maintaining performance. Pruning and distillation results are highly dependent on the specific model, dataset, and hyperparameters. There is no guarantee that a given dataset will recover the accuracy of the pruned model. Feel free to try your own model and dataset combinations and test which combination works best. diff --git a/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py b/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py index 042021fa2..19d4ab0f9 100644 --- a/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py +++ b/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py @@ -14,255 +14,315 @@ # limitations under the License. import argparse -from pathlib import Path +import os +import sys +from datetime import timedelta import nemo_run as run -from megatron.core.dist_checkpointing.validation import StrictHandling from nemo.collections import llm -from nemo.collections.llm.api import export_ckpt -from nemo.collections.llm.gpt.data import ChatDataModule, MockDataModule +from nemo.collections.llm.gpt.data import MockDataModule, PreTrainingDataModule from nemo.collections.llm.modelopt.recipes.distillation_recipe import distillation_recipe from nemo.collections.llm.modelopt.recipes.prune_recipe import prune_recipe from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from modelopt.torch.export.plugins.nemo_run import export_most_recent_ckpt + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "common"))) +from process_climbmix import SUBSET_IDX +from utils import SlurmConfig, create_slurm_executor + def get_args(): - parser = argparse.ArgumentParser( - description="NeMo 2.0 Pruning + Distillation flow. Currently supports models that fit on 1 node and 8 GPUs." 
- ) + parser = argparse.ArgumentParser(description="NeMo 2.0 Pruning + Distillation flow.") parser.add_argument( - "--experiment_name", + "--experiment", type=str, help="Experiment name", default="prune_distill_flow", ) parser.add_argument( - "--model_hf", + "--model-name", type=str, - help="Hugging Face model name or path", - default="meta-llama/Meta-Llama-3.1-8B", + help="Name of the HF model", + default="Qwen/Qwen3-8B", ) parser.add_argument( - "--recipe", + "--base-recipe", type=str, - default="llama31_8b", + default="qwen3_8b", help=( "Choose NeMo 2.0 recipe. Recipes are named in the format of " "_(_ or other special settings)" ), ) parser.add_argument( - "--data_path", + "--hf-tokenizer", type=str, - help="Path to the finetuning chat dataset. Can be either ShareGPT or HuggingFace/OpenAI chat format", + help="Name of HF model to use for tokenizer.", + default="Qwen/Qwen3-8B", + ) + parser.add_argument( + "--prune-target-num-layers", + type=int, + default=24, + help="Number of model layers to remain after pruning", ) parser.add_argument( - "--tokenizer_hf", + "--log-dir", type=str, - help="Hugging Face tokenizer name or path", - default="meta-llama/Meta-Llama-3.1-8B-Instruct", + help=( + "Path to the directory to store logs. Best to pass in a non-relative path so that " + "artifacts are stored in one location." + ), + default="logs", ) parser.add_argument( - "--chat_template", + "--data-dir", type=str, - help="Path to the custom chat template to replace the HF tokenizer default chat template.", - required=False, + help="Path the preprocessed dataset", + ) + parser.add_argument( + "--train-gpus", + type=int, + help="Number of GPUs for training", + default=8, ) parser.add_argument( - "--prune_target_hidden_size", + "--nodes", type=int, - default=3072, - help="Target hidden size for pruning (also sets ffn_hidden_size to 3 * target_hidden_size)", + help="Number of nodes", + default=1, ) parser.add_argument( - "--mock_run", + "--use-slurm", + action="store_true", + help="Run on slurm using run.SlurmExecutor", + ) + parser.add_argument( + "--mock-run", action="store_true", help="Run in mock mode", ) return parser.parse_args() -def get_finetune_recipe(recipe_name: str): - if not hasattr(getattr(llm, recipe_name), "finetune_recipe"): - raise ValueError(f"Recipe {recipe_name} does not have a Fine-Tuning recipe") - return getattr(llm, recipe_name).finetune_recipe(peft_scheme=None) - - -def get_most_recent_subdir(directory: Path): - # Get all subdirectories - subdirs = [d for d in directory.iterdir() if d.is_dir()] - if not subdirs: - raise ValueError(f"No subdirectories found in {directory}") - - # Sort by modification time (most recent first) - most_recent = max(subdirs, key=lambda x: x.stat().st_mtime) - - return most_recent - - -def get_most_recent_ckpt(directory: str): - """ - Find the most recent checkpoint subdirectory in a given directory. - - Args: - directory (str): Path to the directory to search in. - - Returns: - str: Path to the most recent subdirectory. 
- """ - exp_dir = Path(directory) / "default" - assert exp_dir.exists(), f"Experiment directory {exp_dir} does not exist" - - checkpoint_dir = exp_dir / "checkpoints" - if checkpoint_dir.exists(): - most_recent = get_most_recent_subdir(checkpoint_dir) - else: - most_recent = get_most_recent_subdir(exp_dir) - checkpoint_dir = most_recent / "checkpoints" - assert checkpoint_dir.exists(), f"Checkpoint directory {checkpoint_dir} does not exist" - most_recent = get_most_recent_subdir(checkpoint_dir) - - return str(most_recent) - - -def export_most_recent_ckpt(directory: str, output_path: str): - most_recent_ckpt = get_most_recent_ckpt(directory) - export_ckpt(most_recent_ckpt, "hf", output_path=output_path, overwrite=True) - - -def _read_chat_template(template_path: str): - with open(template_path) as f: - return f.read().strip() - - -if __name__ == "__main__": - args = get_args() - - # # # # # CONFIGURABLE PARAMETERS # # # # # - GPUS = 8 - SEQUENCE_LENGTH = 8192 - MBS = 2 - VAL_BATCHES = 32 - if args.mock_run: - GBS = 8 - FINETUNE_STEPS = 20 - DISTILL_STEPS = 20 - VAL_INTERVAL = 10 - PRUNE_SAMPLES = 3 - assert args.data_path is None, "Argument --data_path not used in mock mode." - else: - GBS = 2048 - FINETUNE_STEPS = 2500 - DISTILL_STEPS = 7500 - VAL_INTERVAL = 500 - PRUNE_SAMPLES = 1024 - args.data_path = args.data_path if args.data_path is not None else "lima_processed" - # # # # # # # # # # # # # # # # # # # # # # - +def main(args): # Common items - model_name = args.recipe - model_module = getattr(llm, model_name) + exp_dir = os.path.join(args.log_dir, args.experiment) + model_name = args.base_recipe + if args.mock_run: data = run.Config( MockDataModule, - global_batch_size=GBS, - micro_batch_size=MBS, + global_batch_size=DISTILL_GBS, + micro_batch_size=DISTILL_MBS, seq_length=SEQUENCE_LENGTH, ) else: tokenizer = run.Config( get_nmt_tokenizer, library="huggingface", - model_name=args.tokenizer_hf, - chat_template=_read_chat_template(args.chat_template) if args.chat_template else None, + model_name=args.hf_tokenizer, ) data = run.Config( - ChatDataModule, - dataset_root=args.data_path, + PreTrainingDataModule, + paths=[f"{args.data_dir}/part_{i}_text_document" for i in SUBSET_IDX], seq_length=SEQUENCE_LENGTH, tokenizer=tokenizer, - global_batch_size=GBS, - micro_batch_size=MBS, - use_hf_tokenizer_chat_template=True, + global_batch_size=DISTILL_GBS, + micro_batch_size=DISTILL_MBS, + split="99,1,0", ) - # 1. Process data and Import and save initial NeMo checkpoint - lima_data = run.Script("process_lima.py", entrypoint="python") - - initial_model_out = f"{args.experiment_name}/{model_name}_initial" + # 1. Import and save initial NeMo checkpoint + initial_model_out = f"{exp_dir}/{model_name}_initial" + model_module = getattr(llm, model_name) import_model = run.Partial( llm.import_ckpt, model=model_module.model(), - source=f"hf://{args.model_hf}", + source=f"hf://{args.model_name}", output_path=initial_model_out, + overwrite=True, ) - # 2. Finetune - finetune = get_finetune_recipe(args.recipe) - finetune.tokenizer = "data" - finetune.data = data - finetune.resume.restore_config.path = initial_model_out - finetune.log.log_dir = f"{args.experiment_name}/finetune_log_dir" - finetune.trainer.strategy.tensor_model_parallel_size = GPUS - finetune.trainer.max_steps = FINETUNE_STEPS - finetune.trainer.val_check_interval = VAL_INTERVAL - finetune.trainer.limit_val_batches = VAL_BATCHES - - # 3. Prune to obtain student + # 2. 
Prune to obtain student prune_data = data.clone() + prune_data.micro_batch_size = PRUNE_MBS prune_data.global_batch_size = prune_data.micro_batch_size - finetuned_teacher_path = run.Config(get_most_recent_ckpt, finetune.log.log_dir) - pruned_model_out = f"{args.experiment_name}/{model_name}_pruned" + pruned_model_out = f"{exp_dir}/{model_name}_pruned" prune = prune_recipe( - nemo_checkpoint=finetuned_teacher_path, + nemo_checkpoint=initial_model_out, save_path=pruned_model_out, ) - prune.pruning_config.target_hidden_size = args.prune_target_hidden_size - prune.pruning_config.target_ffn_hidden_size = args.prune_target_hidden_size * 3 - prune.devices = GPUS - prune.pp_size = GPUS + prune.tokenizer_path = args.hf_tokenizer + prune.pruning_config.target_num_layers = args.prune_target_num_layers + prune.devices = 1 + prune.pp_size = 1 prune.data = prune_data prune.num_train_samples = PRUNE_SAMPLES prune.legacy_ckpt = True - # 4. Distill + # 3. Distill distill = distillation_recipe( - teacher_model_path=finetuned_teacher_path, + teacher_model_path=initial_model_out, student_model_path=pruned_model_out, + num_nodes=args.nodes, + num_gpus_per_node=args.train_gpus, ) distill.data = data distill.tokenizer = "data" - distill.log.log_dir = f"{args.experiment_name}/distill_log_dir" + distill.log.log_dir = f"{exp_dir}/distill_log_dir" + distill.log.ckpt.train_time_interval = run.Config(timedelta, hours=SAVE_CKPT_EVERY_N_HOURS) + distill.log.ckpt.save_top_k = 2 + distill.optim.config.lr = MAX_LR + distill.optim.lr_scheduler.min_lr = MIN_LR + distill.optim.lr_scheduler.warmup_steps = WARMUP_STEPS distill.trainer.max_steps = DISTILL_STEPS distill.trainer.val_check_interval = VAL_INTERVAL distill.trainer.limit_val_batches = VAL_BATCHES - distill.trainer.strategy.ckpt_load_strictness = StrictHandling.LOG_ALL + distill.trainer.strategy.tensor_model_parallel_size = args.train_gpus + distill.trainer.strategy.ckpt_load_strictness = "log_all" + + # 4. Evaluate MMLU + if args.use_slurm: + mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py" + else: + mmlu_script_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../common/in_memory_mmlu.py") + ) + eval_teacher = run.Script( + mmlu_script_path, + entrypoint="python", + args=["--nemo_ckpt", initial_model_out, "--tensor_parallelism", f"{args.train_gpus}"], + ) + eval_student = run.Script( + mmlu_script_path, + entrypoint="python", + args=[ + "--finetuned_ckpt_dir", + distill.log.log_dir, + "--tensor_parallelism", + f"{args.train_gpus}", + ], + ) # 5. 
Export - export_path = f"{args.experiment_name}/{model_name}_final" + export_path = f"{exp_dir}/{model_name}_final" export_model = run.Partial( export_most_recent_ckpt, - directory=f"{args.experiment_name}/distill_log_dir", + directory=distill.log.log_dir, output_path=export_path, ) - # Run all - executor_single = run.LocalExecutor() - executor_multi = run.LocalExecutor(launcher="torchrun", ntasks_per_node=GPUS) - with run.Experiment(args.experiment_name, log_level="INFO") as exp: - if args.data_path == "lima_processed": - s0 = exp.add(lima_data, tail_logs=True, name="lima_data", executor=run.LocalExecutor()) - s1 = exp.add(import_model, executor=executor_single, tail_logs=True, name="import") + # Setup executors + if args.use_slurm: + gpu_executor = create_slurm_executor( + SLURM_CONFIG, + ntasks_per_node=1, + num_gpus=1, + nodes=1, + ) + multi_gpu_executor = create_slurm_executor( + SLURM_CONFIG, + ntasks_per_node=args.train_gpus, + num_gpus=args.train_gpus, + nodes=args.nodes, + ) + else: + gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=1) + multi_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus) + + # Execute + with run.Experiment(exp_dir, log_level="INFO") as exp: + s1 = exp.add( + import_model, + executor=gpu_executor, + tail_logs=True, + name="01_import", + ) s2 = exp.add( - finetune, executor=executor_multi, tail_logs=True, name="finetune", dependencies=[s1] + prune, + executor=gpu_executor, + tail_logs=True, + name="02_prune", + dependencies=[s1], ) s3 = exp.add( - prune, executor=executor_multi, tail_logs=True, name="prune", dependencies=[s2] + distill, + executor=multi_gpu_executor, + tail_logs=True, + name="03_distill", + dependencies=[s2], ) - s4 = exp.add( - distill, executor=executor_multi, tail_logs=True, name="distill", dependencies=[s3] + _ = exp.add( + eval_teacher, + executor=multi_gpu_executor, + tail_logs=True, + name="eval_teacher", + dependencies=[s1], ) - s5 = exp.add( - export_model, executor=executor_single, tail_logs=True, name="export", dependencies=[s4] + _ = exp.add( + eval_student, + executor=multi_gpu_executor, + tail_logs=True, + name="eval_student", + dependencies=[s3], ) - exp.run() + # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo + multi_gpu_executor.ntasks_per_node = 1 # will throw error if more than 1 task during export + _ = exp.add( + export_model, + executor=multi_gpu_executor, + tail_logs=True, + name="04_export", + dependencies=[s3], + ) + exp.run(detach=True) + + +if __name__ == "__main__": + args = get_args() + + # # # # # # # # SLURM SETUP # # # # # # + # # # # # # # MODIFY THIS # # # # # # # + if args.use_slurm: + SLURM_CONFIG = SlurmConfig( + account="", + partition_gpu="batch", + partition_cpu="cpu", + time="HH:MM:SS", + container_image="nvcr.io/nvidia/nemo:25.09", + env_vars={ + "HF_TOKEN": "", + }, + use_local_tunnel=False, + host="", + user="", + container_mounts=[], + job_dir="/path/to/logs", + ) + + # # # # # # # # # # # # # # # # # # # # # # + # # # # # CONFIGURABLE PARAMETERS # # # # # + SEQUENCE_LENGTH = 8192 + PRUNE_MBS = 4 + DISTILL_MBS = 2 + VAL_BATCHES = 32 + MAX_LR = 1e-4 + MIN_LR = 1e-5 + WARMUP_STEPS = 100 + SAVE_CKPT_EVERY_N_HOURS = 3.5 + if args.mock_run: + PRUNE_SAMPLES = 3 + DISTILL_GBS = 8 + DISTILL_STEPS = 20 + VAL_INTERVAL = 10 + else: + PRUNE_SAMPLES = 1024 + DISTILL_GBS = 768 + _NUM_TOKENS = 89694564352 + DISTILL_STEPS = int(_NUM_TOKENS / DISTILL_GBS / SEQUENCE_LENGTH) + VAL_INTERVAL = 1000 + # # # # # # # # # # # # # # # # # # # # 
# # + + main(args) From 3090a985516a3ae49168d9c93a82434b0d58dab1 Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Mon, 29 Sep 2025 11:37:55 -0700 Subject: [PATCH 2/3] Review suggestions Signed-off-by: Asha Anoosheh --- examples/nemo_run/common/process_climbmix.py | 14 +++-------- examples/nemo_run/prune_distill/README.md | 21 +++++++--------- .../prune_distill/nemo_prune_kd_flow.py | 24 ++++++++----------- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/examples/nemo_run/common/process_climbmix.py b/examples/nemo_run/common/process_climbmix.py index 27533d93f..18fd35f2d 100644 --- a/examples/nemo_run/common/process_climbmix.py +++ b/examples/nemo_run/common/process_climbmix.py @@ -41,27 +41,19 @@ def get_args(): default="Qwen/Qwen3-8B", help="Tokenizer to use for preprocessing", ) - parser.add_argument( - "--subset-indices", - help="Comma-separated subset indices to download", - ) return parser.parse_args() if __name__ == "__main__": args = get_args() - Path(args.output_dir).mkdir(exist_ok=True) + Path(args.output_dir).mkdir(parents=True, exist_ok=True) # create raw and processed directories raw_dir = Path(args.output_dir) / "climbmix_raw" proc_dir = Path(args.output_dir) / "climbmix_proc" # only download the subset of the data - if args.subset_indices: - subset_idx = [int(i) for i in args.subset_indices.split(",")] - else: - subset_idx = SUBSET_IDX - subset_filenames = [f"part_{i}.jsonl" for i in subset_idx] + subset_filenames = [f"part_{i}.jsonl" for i in SUBSET_IDX] # download raw data snapshot_download( @@ -72,7 +64,7 @@ def get_args(): ) # preprocess (tokenize) - print("Processing ClimbMix dataset...") + print("Tokenizing ClimbMix dataset...") input_paths = [raw_dir / name for name in subset_filenames] megatron_preprocess_data( input_paths, diff --git a/examples/nemo_run/prune_distill/README.md b/examples/nemo_run/prune_distill/README.md index 2f753da59..d533bf34f 100644 --- a/examples/nemo_run/prune_distill/README.md +++ b/examples/nemo_run/prune_distill/README.md @@ -2,10 +2,6 @@ # NeMo Pruning + Knowledge Distillation Simplified Flow Example -[Slurm Examples](ADVANCED.md) | -[Advanced Topics](ADVANCED.md) | -[NeMo Integration](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/modelopt) - ## Overview @@ -36,14 +32,15 @@ graph TD; ## Results -Pruning + Knowledge Distillation of Qwen3-8B achieves significant model compression while recovering most of the accuracy through distillation. We depth-prune the model from 32 to 24 layers (reducing from 8B to 6B parameters) and distill for ~14,000 steps with a learning rate of 1e-4 and global batch size of 768 using a 25% subset of the [ClimbMix dataset](https://huggingface.co/datasets/OptimalScale/ClimbMix). (This is about 90 billion tokens and takes a total of ~6k H100 GPU hours) +Pruning + Knowledge Distillation of Qwen3-8B achieves significant model compression while recovering most of the accuracy through distillation. We depth-prune the model from 32 to 24 layers (reducing from 8B to 6B parameters) and distill for ~28,000 steps (determined by sequence length, default 4096) with a learning rate of 1e-4 and global batch size of 768 using a 25% subset of the [ClimbMix dataset](https://huggingface.co/datasets/OptimalScale/ClimbMix). 
(This is about 90 billion tokens and takes a total of ~6k H100 GPU hours) -| | Tokens per Second | MMLU | -|---------------------------|-------------------|------| -| Qwen3-8B Original | 4420 | 74.9 | -| Qwen3-6B Pruned+Distilled | 6950 | 72.5 | +| | Tokens per Second | MMLU | +|-----------------------------------|-------------------|------| +| Qwen3-8B Original | 4420 | 74.9 | +| Qwen3-6B Pruned+Distilled from 8B | 6950 | 72.5 | +| Qwen3-4B Original (comparison) | 5210 | 70.0 | -The resulting compressed model maintains competitive performance while being significantly faster with a smaller memory footprint. +The resulting compressed student maintains competitive performance while being significantly faster with a smaller memory footprint than the teacher. It also happens to have both better performance and throughput than the existing Qwen3-4B model! ## Usage @@ -58,7 +55,7 @@ To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia Example docker command: ```bash -docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash +docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer:/opt/TensorRT-Model-Optimizer --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash ``` You will also need to set your Huggingface token with `export HF_TOKEN=`. You may also need to enable write access to the docker container to the `examples/nemo_run` folder by doing `chmod 777 nemo_run` so that logs can be written. @@ -84,7 +81,7 @@ From the `nemo_run` folder, launch the example with the `nemo_prune_kd_flow.py` To perform Pruning + Knowledge Distillation, run: ```bash -python prune_distill/nemo_prune_kd_flow.py --log-dir /my/log/dir --data-dir /path/to/climbix_proc --use-slurm +python prune_distill/nemo_prune_kd_flow.py --log-dir /my/log/dir --data-dir /path/to/climbmix_proc --use-slurm ``` ## Supported models diff --git a/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py b/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py index 19d4ab0f9..3bafb4fe5 100644 --- a/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py +++ b/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py @@ -41,9 +41,9 @@ def get_args(): default="prune_distill_flow", ) parser.add_argument( - "--model-name", + "--model-id-or-path", type=str, - help="Name of the HF model", + help="ID or path of the HF model", default="Qwen/Qwen3-8B", ) parser.add_argument( @@ -55,12 +55,6 @@ def get_args(): "_(_ or other special settings)" ), ) - parser.add_argument( - "--hf-tokenizer", - type=str, - help="Name of HF model to use for tokenizer.", - default="Qwen/Qwen3-8B", - ) parser.add_argument( "--prune-target-num-layers", type=int, @@ -119,10 +113,12 @@ def main(args): seq_length=SEQUENCE_LENGTH, ) else: + if not args.data_dir: + raise ValueError("--data-dir must be provided unless --mock-run is enabled.") tokenizer = run.Config( get_nmt_tokenizer, library="huggingface", - model_name=args.hf_tokenizer, + model_name=args.model_id_or_path, ) data = run.Config( PreTrainingDataModule, @@ -140,7 +136,7 @@ def main(args): import_model = run.Partial( llm.import_ckpt, model=model_module.model(), - source=f"hf://{args.model_name}", + source=f"hf://{args.model_id_or_path}", output_path=initial_model_out, overwrite=True, ) @@ -154,7 +150,7 @@ def main(args): nemo_checkpoint=initial_model_out, 
save_path=pruned_model_out, ) - prune.tokenizer_path = args.hf_tokenizer + prune.tokenizer_path = args.model_id_or_path prune.pruning_config.target_num_layers = args.prune_target_num_layers prune.devices = 1 prune.pp_size = 1 @@ -304,7 +300,7 @@ def main(args): # # # # # # # # # # # # # # # # # # # # # # # # # # # CONFIGURABLE PARAMETERS # # # # # - SEQUENCE_LENGTH = 8192 + SEQUENCE_LENGTH = 4096 PRUNE_MBS = 4 DISTILL_MBS = 2 VAL_BATCHES = 32 @@ -318,9 +314,9 @@ def main(args): DISTILL_STEPS = 20 VAL_INTERVAL = 10 else: - PRUNE_SAMPLES = 1024 + PRUNE_SAMPLES = 512 DISTILL_GBS = 768 - _NUM_TOKENS = 89694564352 + _NUM_TOKENS = int(90e9) DISTILL_STEPS = int(_NUM_TOKENS / DISTILL_GBS / SEQUENCE_LENGTH) VAL_INTERVAL = 1000 # # # # # # # # # # # # # # # # # # # # # # From 9269c342afe22eb93e47289a5a7dccaa8328703b Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Wed, 1 Oct 2025 06:16:07 -0700 Subject: [PATCH 3/3] More suggestions Signed-off-by: Asha Anoosheh --- examples/nemo_run/prune_distill/README.md | 38 ++++++++++--------- .../prune_distill/nemo_prune_kd_flow.py | 35 ++++++++++------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/examples/nemo_run/prune_distill/README.md b/examples/nemo_run/prune_distill/README.md index d533bf34f..314c6f5a0 100644 --- a/examples/nemo_run/prune_distill/README.md +++ b/examples/nemo_run/prune_distill/README.md @@ -12,35 +12,37 @@ After structured pruning, the compressed model may show some accuracy degradatio ## Flow Stages -The Simplified Flow runs the following steps in order: +The Simplified Flow runs the following steps: 1. 01_import — Import HuggingFace model to NeMo format -1. 02_prune — Apply structured pruning to create a compressed student model +1. 02a_eval_teacher — Evaluate teacher model on 5% of MMLU benchmark +1. 02b_prune — Apply structured pruning to create a compressed student model 1. 03_distill — Knowledge distillation from teacher to pruned student model -1. 04_export — Export final compressed model to HuggingFace format -1. eval_teacher — Evaluate teacher model on 5% of MMLU benchmark -1. eval_student — Evaluate student model on 5% of MMLU benchmark +1. 04a_eval_student — Evaluate student model on 5% of MMLU benchmark +1. 04b_export — Export final compressed model to HuggingFace format ```mermaid graph TD; -01_import-->02_prune; -01_import-->eval_teacher; -02_prune-->03_distill; -03_distill-->eval_student; -03_distill-->04_export; +01_import-->02a_eval_teacher; +01_import-->02b_prune; +02b_prune-->03_distill; +03_distill-->04a_eval_student; +03_distill-->04b_export; ``` ## Results Pruning + Knowledge Distillation of Qwen3-8B achieves significant model compression while recovering most of the accuracy through distillation. We depth-prune the model from 32 to 24 layers (reducing from 8B to 6B parameters) and distill for ~28,000 steps (determined by sequence length, default 4096) with a learning rate of 1e-4 and global batch size of 768 using a 25% subset of the [ClimbMix dataset](https://huggingface.co/datasets/OptimalScale/ClimbMix). 
(This is about 90 billion tokens and takes a total of ~6k H100 GPU hours) -| | Tokens per Second | MMLU | -|-----------------------------------|-------------------|------| -| Qwen3-8B Original | 4420 | 74.9 | -| Qwen3-6B Pruned+Distilled from 8B | 6950 | 72.5 | -| Qwen3-4B Original (comparison) | 5210 | 70.0 | +| | Tokens per Second * | MMLU | +|-----------------------------------|---------------------|------| +| Qwen3-8B Original | 4420 | 74.9 | +| Qwen3-6B Pruned+Distilled from 8B | 6950 | 72.5 | +| Qwen3-4B Original (comparison) | 5210 | 70.0 | -The resulting compressed student maintains competitive performance while being significantly faster with a smaller memory footprint than the teacher. It also happens to have both better performance and throughput than the existing Qwen3-4B model! +The resulting compressed student maintains competitive performance while being significantly faster with fewer parameters than the teacher. It also happens to have both better accuracy and higher throughput than the existing Qwen3-4B model! + +\* _Measured on H100 using TRT-LLM, FP8 precision_ ## Usage @@ -76,7 +78,7 @@ This will download and process the ClimbMix dataset, creating the necessary data After launching the NeMo container with the specified mounts, change the contents of the `SLURM_CONFIG` in `nemo_prune_kd_flow.py` to reflect your environment, and then perform the following: -From the `nemo_run` folder, launch the example with the `nemo_prune_kd_flow.py` script. To use a different model than the default model (Qwen3-8B), you can add the `--model-name --base-recipe ` flags and use the model's HuggingFace name and NeMo recipe names listed [here](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/llm/recipes). Provide the processed dataset path using the `--data-dir` flag. +Launch the example with the `nemo_prune_kd_flow.py` script. To use a different model than the default (Qwen3-8B), you can add the `--model-id-or-path <hf-model-id> --base-recipe <recipe-name>` flags, using the model's Hugging Face model ID and the NeMo recipe name listed [here](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/llm/recipes). Provide the processed dataset path using the `--data-dir` flag. To perform Pruning + Knowledge Distillation, run: @@ -84,6 +86,8 @@ To perform Pruning + Knowledge Distillation, run: python prune_distill/nemo_prune_kd_flow.py --log-dir /my/log/dir --data-dir /path/to/climbmix_proc --use-slurm ``` +> **_NOTE:_** You can omit the `--use-slurm` flag to run locally for testing, and optionally add `--mock-run` to use a mock dataset. + ## Supported models Locally this script currently supports models that can be trained on 1 node with 8 x 80GB GPUs. On Slurm you can configure the number of nodes/gpus for training and pruning with the following flags: `--nodes`, `--train-gpus`. 
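On Slurm, these flags feed the executors created from the `SLURM_CONFIG` block at the bottom of `nemo_prune_kd_flow.py`, which you edit before launching. A hypothetical filled-in version of that block is sketched below; every account, host, and path value is an illustrative placeholder, not a default shipped with this example.

```python
# Illustrative SLURM_CONFIG only -- the account, host, and path values are placeholders.
SLURM_CONFIG = SlurmConfig(
    account="my_slurm_account",                  # Slurm account to charge jobs against
    partition_gpu="batch",                       # partition used for GPU jobs (prune, distill, eval)
    partition_cpu="cpu",                         # partition used for CPU-only steps
    time="04:00:00",                             # per-job wall-clock limit (HH:MM:SS)
    container_image="nvcr.io/nvidia/nemo:25.09",
    env_vars={"HF_TOKEN": "<your-hf-token>"},    # forwarded into each job's container
    use_local_tunnel=False,                      # set True when submitting from the cluster itself
    host="login.my-cluster.example.com",         # SSH host used when submitting from outside
    user="my_username",
    container_mounts=["/lustre/my_workspace:/lustre/my_workspace"],
    job_dir="/lustre/my_workspace/nemo_run_jobs",  # where NeMo-Run stages job scripts and logs
)
```

With a config like this, `--nodes 2 --train-gpus 8` gives the distillation stage a 16-GPU executor, while the import and prune steps keep their single-GPU executor.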
diff --git a/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py b/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py index 3bafb4fe5..4501769f4 100644 --- a/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py +++ b/examples/nemo_run/prune_distill/nemo_prune_kd_flow.py @@ -73,7 +73,14 @@ def get_args(): parser.add_argument( "--data-dir", type=str, - help="Path the preprocessed dataset", + help="Path to the preprocessed dataset", + ) + parser.add_argument( + "--data-prefixes", + type=str, + nargs="*", + help="Prefixes of the .bin and .idx files in the data directory", + default=[f"part_{i}_text_document" for i in SUBSET_IDX], ) parser.add_argument( "--train-gpus", @@ -122,7 +129,7 @@ def main(args): ) data = run.Config( PreTrainingDataModule, - paths=[f"{args.data_dir}/part_{i}_text_document" for i in SUBSET_IDX], + paths=[f"{args.data_dir}/{prefix}" for prefix in args.data_prefixes], seq_length=SEQUENCE_LENGTH, tokenizer=tokenizer, global_batch_size=DISTILL_GBS, @@ -236,11 +243,18 @@ def main(args): tail_logs=True, name="01_import", ) + _ = exp.add( + eval_teacher, + executor=multi_gpu_executor, + tail_logs=True, + name="02a_eval_teacher", + dependencies=[s1], + ) s2 = exp.add( prune, executor=gpu_executor, tail_logs=True, - name="02_prune", + name="02b_prune", dependencies=[s1], ) s3 = exp.add( @@ -250,18 +264,11 @@ def main(args): name="03_distill", dependencies=[s2], ) - _ = exp.add( - eval_teacher, - executor=multi_gpu_executor, - tail_logs=True, - name="eval_teacher", - dependencies=[s1], - ) _ = exp.add( eval_student, executor=multi_gpu_executor, tail_logs=True, - name="eval_student", + name="04a_eval_student", dependencies=[s3], ) # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo @@ -270,7 +277,7 @@ def main(args): export_model, executor=multi_gpu_executor, tail_logs=True, - name="04_export", + name="04b_export", dependencies=[s3], ) exp.run(detach=True) @@ -316,8 +323,8 @@ def main(args): else: PRUNE_SAMPLES = 512 DISTILL_GBS = 768 - _NUM_TOKENS = int(90e9) - DISTILL_STEPS = int(_NUM_TOKENS / DISTILL_GBS / SEQUENCE_LENGTH) + NUM_TOKENS = int(90e9) + DISTILL_STEPS = int(NUM_TOKENS / DISTILL_GBS / SEQUENCE_LENGTH) VAL_INTERVAL = 1000 # # # # # # # # # # # # # # # # # # # # # #
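The `DISTILL_STEPS` value in the final hunk is simply the token budget divided by the tokens consumed per optimizer step. A standalone sanity check (plain Python, not part of the patch) reproduces the step counts quoted in the README:

```python
# Distillation step count = token budget / (global batch size * sequence length).
NUM_TOKENS = int(90e9)     # ~90B tokens in the 25% ClimbMix subset
DISTILL_GBS = 768          # sequences per global batch
SEQUENCE_LENGTH = 4096     # tokens per sequence (the default after this patch series)

tokens_per_step = DISTILL_GBS * SEQUENCE_LENGTH  # 3,145,728 tokens per optimizer step
distill_steps = NUM_TOKENS // tokens_per_step    # 28,610 -> the "~28,000 steps" cited in the README
print(distill_steps)

# With the 8192-token sequence length used before this patch series, the same budget
# yields roughly 14,300 steps, in line with the "~14,000 steps" quoted in the first commit.
```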