diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 801f7ee2..3d895694 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -11,6 +11,7 @@ Quantization Aware Training (QAT) helps to improve the model accuracy beyond pos | Support Matrix | View the support matrix to see quantization compatibility and feature availability across different models | \[[Link](#support-matrix)\] | | | End to End QAT | Example scripts demonstrating quantization techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qat-example)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | | End to End QAD | Example scripts demonstrating quantization aware distillation techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qad-example)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| NeMo QAT/QAD Simplified Flow | Example script demonstrating end-to-end QAT/QAD in NeMo | \[[Link](../nemo_run/qat/README.md)\] | | | Evaluate Accuracy | Evaluating model accuracy after QAT/QAD (with fake quantization) | \[[Link](#testing-qat-model-with-llm-benchmarks-for-accuracy-evaluation)\] | | | Deployment | Deploying the model after QAT/QAD | \[[Link](#deployment)\] | | | QLoRA | Model training with reduced GPU memory | \[[Link](#end-to-end-qlora-with-real-quantization)\] | | diff --git a/examples/nemo_run/common/in_memory_mmlu.py b/examples/nemo_run/common/in_memory_mmlu.py new file mode 100644 index 00000000..c9ab11e3 --- /dev/null +++ b/examples/nemo_run/common/in_memory_mmlu.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo.collections.llm.modelopt import setup_trainer_and_restore_model_with_modelopt_spec + +from modelopt.torch.export.plugins.nemo_run import _get_most_recent_ckpt +from modelopt.torch.utils.plugins.megatron_mmlu import megatron_mmlu + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + "Run MMLU evaluation with ModelOpt Megatron model. Provide either --nemo_ckpt" + "or --finetuned_ckpt_dir" + ) + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--nemo_ckpt", type=str, required=False, help="Path to NeMo checkpoint.") + group.add_argument( + "--finetuned_ckpt_dir", + required=False, + type=str, + help="Checkpoint directory of 1 or more finetuned models", + ) + parser.add_argument( + "--tensor_parallelism", type=int, default=1, help="Tensor parallelism size." + ) + parser.add_argument( + "--pipeline_parallelism", type=int, default=1, help="Pipeline parallelism size." 
+ ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.nemo_ckpt + if args.finetuned_ckpt_dir: + ckpt_path = _get_most_recent_ckpt(args.finetuned_ckpt_dir) + model, trainer = setup_trainer_and_restore_model_with_modelopt_spec( + ckpt_path, + tensor_model_parallel_size=args.tensor_parallelism, + pipeline_model_parallel_size=args.pipeline_parallelism, + devices=args.tensor_parallelism * args.pipeline_parallelism, + ) + tokenizer = model.tokenizer.tokenizer + megatron_mmlu(model.module, tokenizer) diff --git a/examples/nemo_run/llama_chat_template.txt b/examples/nemo_run/common/llama_chat_template.txt similarity index 100% rename from examples/nemo_run/llama_chat_template.txt rename to examples/nemo_run/common/llama_chat_template.txt diff --git a/examples/nemo_run/process_lima.py b/examples/nemo_run/common/process_lima.py similarity index 100% rename from examples/nemo_run/process_lima.py rename to examples/nemo_run/common/process_lima.py diff --git a/examples/nemo_run/common/process_openscience.py b/examples/nemo_run/common/process_openscience.py new file mode 100644 index 00000000..61172dc5 --- /dev/null +++ b/examples/nemo_run/common/process_openscience.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +from pathlib import Path + +from datasets import load_dataset + + +def get_parser(): + parser = argparse.ArgumentParser(description="Process nvidia/OpenScience dataset") + parser.add_argument("--output-dir", type=str, default=".") + return parser + + +def convert_row_oai(row: dict): + return { + "messages": [ + {"role": "user", "content": row["input"]}, + {"role": "assistant", "content": row["output"]}, + ] + } + + +def process_subset(raw_dir, proc_dir): + ds = load_dataset(raw_dir) + ds = ds.map(convert_row_oai, remove_columns=["input", "output"]) + + split_ds = ds["train"].train_test_split(test_size=0.1) + split_ds["train"].to_json(os.path.join(proc_dir, "training.jsonl")) + split_ds["test"].to_json(os.path.join(proc_dir, "validation.jsonl")) + + +if __name__ == "__main__": + args = get_parser().parse_args() + raw_dir = f"{args.output_dir}/openscience_raw" + proc_dir = f"{args.output_dir}/openscience_proc" + + if not os.path.exists(raw_dir): + q235_subset = load_dataset("nvidia/OpenScience", data_files="OS-Q3-235B-4.jsonl") + q235_subset.save_to_disk(raw_dir) + + if not os.path.exists(proc_dir): + Path(proc_dir).mkdir(exist_ok=True) + print("Processing OpenScience dataset") + process_subset(raw_dir, proc_dir) + else: + print(f"Processed OpenScience dataset exists in: {proc_dir}, skipped processing") diff --git a/examples/nemo_run/common/utils.py b/examples/nemo_run/common/utils.py new file mode 100644 index 00000000..3f1bf8fc --- /dev/null +++ b/examples/nemo_run/common/utils.py @@ -0,0 +1,139 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import subprocess +from dataclasses import dataclass, field + +import nemo_run as run +from nemo.collections import llm + + +@dataclass +class SlurmConfig: + """Configuration for SlurmExecutor.""" + + account: str = "" # Your Slurm account + partition_cpu: str = "" # Slurm CPU partition to use + partition_gpu: str = "" # Slurm GPU partition to use + time: str = "" # Job time limit (HH:MM:SS) + container_image: str = "" # Container image for jobs + env_vars: dict[str, str] = field(default_factory=dict) # Environment variables to set + container_mounts: list[str] = field(default_factory=list) # Container mounts + use_local_tunnel: bool = False # Set to True if running from within the cluster + host: str = "" # Required for SSH tunnel: Slurm cluster hostname + user: str = "" # Required for SSH tunnel: Your username + job_dir: str = "" # Required for SSH tunnel: Directory to store runs on cluster + identity: str | None = None # Optional for SSH tunnel: Path to SSH key for authentication + + def __post_init__(self): + """Validate the configuration and raise descriptive errors.""" + if not self.account: + raise ValueError("SlurmConfig.account must be set to your actual Slurm account") + if not self.partition_cpu: + raise ValueError("SlurmConfig.partition_cpu must be set") + if not self.partition_gpu: + raise ValueError("SlurmConfig.partition_gpu must be set") + if not self.time: + raise ValueError("SlurmConfig.time must be set to job time limit (e.g., '02:00:00')") + if not self.container_image: + raise ValueError("SlurmConfig.container_image must be set to container image for jobs") + if not self.use_local_tunnel: + # Only validate SSH tunnel settings if not using local tunnel + if not self.host: + raise ValueError( + "SlurmConfig.host must be set to your actual cluster hostname when using SSH tunnel" + ) + if not self.user: + raise ValueError( + "SlurmConfig.user must be set to your actual username when using SSH tunnel" + ) + if not self.job_dir: + raise ValueError( + "SlurmConfig.job_dir must be set to directory for storing runs on cluster" + ) + + self.env_vars |= { + "CUDA_DEVICE_MAX_CONNECTIONS": "1", # Disable GPU communication/computation overlap for performance + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + } + + +def create_slurm_executor( + slurm_cfg: SlurmConfig, nodes: int = 1, ntasks_per_node: int = 1, num_gpus: int = 0 +): + # Configure tunnel + if slurm_cfg.use_local_tunnel: + # Use LocalTunnel when already on the cluster + tunnel = run.LocalTunnel(job_dir=slurm_cfg.job_dir) + else: + # Use SSH tunnel when launching from local machine + tunnel = run.SSHTunnel( + host=slurm_cfg.host, + user=slurm_cfg.user, + job_dir=slurm_cfg.job_dir, + identity=slurm_cfg.identity, # can be None + ) + + if num_gpus > 0: + return run.SlurmExecutor( + account=slurm_cfg.account, + partition=slurm_cfg.partition_gpu, + ntasks_per_node=ntasks_per_node, + gpus_per_node=num_gpus, + nodes=nodes, + tunnel=tunnel, + container_image=slurm_cfg.container_image, + container_mounts=slurm_cfg.container_mounts, + time=slurm_cfg.time, + packager=run.GitArchivePackager(), + mem="0", + gres=f"gpu:{num_gpus}", + ) + else: + return run.SlurmExecutor( + account=slurm_cfg.account, + partition=slurm_cfg.partition_cpu, + nodes=nodes, + tunnel=tunnel, + container_image=slurm_cfg.container_image, + 
container_mounts=slurm_cfg.container_mounts,
+            time=slurm_cfg.time,
+            packager=run.GitArchivePackager(),
+            mem="0",
+        )
+
+
+def get_finetune_recipe(recipe_name: str):
+    if not hasattr(getattr(llm, recipe_name), "finetune_recipe"):
+        raise ValueError(f"Recipe {recipe_name} does not have a Fine-Tuning recipe")
+    return getattr(llm, recipe_name).finetune_recipe(peft_scheme=None)
+
+
+def read_chat_template(template_path: str):
+    with open(template_path) as f:
+        return f.read().strip()
+
+
+def download_hf_dataset(dataset_name: str, output_dir: str | None = None):
+    """Download a dataset from HuggingFace Hub using huggingface-cli."""
+    cmd = ["huggingface-cli", "download", dataset_name, "--repo-type", "dataset"]
+
+    if output_dir:
+        cmd.extend(["--local-dir", output_dir])
+
+    subprocess.run(cmd, check=True)
+    print(f"Successfully downloaded dataset: {dataset_name}")
diff --git a/examples/nemo_run/qat/ADVANCED.md b/examples/nemo_run/qat/ADVANCED.md
new file mode 100644
index 00000000..72629a60
--- /dev/null
+++ b/examples/nemo_run/qat/ADVANCED.md
@@ -0,0 +1,56 @@
+# NeMo QAT/QAD Flow: Advanced Topics
+
+If you need to run QAT/QAD on a Slurm cluster (for example, to use more than 1 node), this guide covers how to configure and launch the flow on Slurm.
+
+To run the example on Slurm, edit the `SLURM_CONFIG` at the bottom of `nemo_qat_flow.py` with the appropriate credentials, container, cluster name (host), and container mounts. Make sure you mount the NeMo and Megatron-LM repositories on the Slurm cluster (see the [README](README.md) prerequisites) and that you've checked out the correct commits.
+
+## Running the Flow on Slurm
+
+To launch the flow on a Slurm cluster, fill in your Slurm credentials at the bottom of `nemo_qat_flow.py` and add the `--use-slurm` flag to the command. From a machine outside the cluster (e.g. your local workstation), launch the NeMo container as described in the [README](README.md), then run `python qat/nemo_qat_flow.py --use-slurm --log-dir /slurm/log/dir`, which will `ssh` into the Slurm cluster, `rsync` your files over, and launch the tasks. After an experiment has run, the log directory on the Slurm cluster should look like this (assuming your experiment name is `qat_flow_ckpts`):
+ +```bash +├── 00_openscience_data +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.00_openscience_data_5345664_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.00_openscience_data_5345664.out +├── 01_import_model +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.01_import_model_5345665_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.01_import_model_5345665.out +├── 02_mmlu_bf16 +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.02_mmlu_bf16_5345666_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.02_mmlu_bf16_5345666.out +├── 03_ptq +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.03_ptq_5345667_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.03_ptq_5345667.out +├── 04_mmlu_ptq +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.04_mmlu_ptq_5345668_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.04_mmlu_ptq_5345668.out +├── 05_train +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.05_train_5345669_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.05_train_5345669.out +├── 06_mmlu_sft +│   ├── code +│   └── configs +├── 07_export_hf +│   ├── code +│   └── configs +``` + +**NOTE:** `rsync` may not currently be available in the NeMo container and will be added as a dependency. diff --git a/examples/nemo_run/qat/README.md b/examples/nemo_run/qat/README.md index 3cecf7c6..79715953 100644 --- a/examples/nemo_run/qat/README.md +++ b/examples/nemo_run/qat/README.md @@ -1,35 +1,86 @@ +
+ # NeMo QAT/QAD Simplified Flow Example +[Slurm Examples](ADVANCED.md) | +[Advanced Topics](ADVANCED.md) | +[NeMo Integration](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/modelopt) + +
+
 ## Overview
 
-This directory also contains an end-to-end NeMo QAT Simplified Flow example, which supports both QAT with cross-entropy loss and QAD (quantization-aware distillation) with knowledge-distillation loss between the full-precision teacher and quantized student models.
+This directory contains an end-to-end QAT Simplified Flow example that uses NeMo for model training. It supports both QAT with cross-entropy loss and QAD (quantization-aware distillation) with a knowledge-distillation loss between the BF16 teacher and the quantized student model.
+
+After PTQ (post-training quantization), the quantized model may show some accuracy degradation on tasks like MMLU; the QAT/QAD stages aim to recover that loss.
+
+## Flow Stages
+
+The Simplified Flow runs the following steps in order:
+
+1. 00_openscience_data — Process NVIDIA/OpenScience data (skipped if `--data-path` is given)
+1. 01_import_model — Import NeMo BF16 model checkpoint
+1. 02_mmlu_bf16 — Evaluate 5% MMLU on the BF16 checkpoint
+1. 03_ptq — Apply PTQ
+1. 04_mmlu_ptq — Evaluate 5% MMLU on the PTQ checkpoint
+1. 05_train — SFT/QAT (and optional QAD)
+1. 06_mmlu_sft — Evaluate 5% MMLU on the SFT/QAT checkpoint
+1. 07_export_hf — Export to Hugging Face (Unified) format
+
+```mermaid
+graph TD;
+00_openscience_data-->05_train;
+01_import_model-->02_mmlu_bf16;
+01_import_model-->03_ptq;
+03_ptq-->04_mmlu_ptq;
+03_ptq-->05_train;
+05_train-->06_mmlu_sft;
+05_train-->07_export_hf;
+```
+
+## Results
+
+QAT of Qwen3-8B NVFP4 recovers most of the MMLU accuracy lost after NVFP4 PTQ. We finetune the Qwen3-8B NVFP4 checkpoint for 200 steps with a learning rate of 1e-5 and a global batch size of 512 on one node with 8 x H100 GPUs.
+
+| Model | MMLU 5% |
+|---------------------------|---------|
+| Qwen3-8B FP16 | 73.8 |
+| Qwen3-8B NVFP4 | 70.3 |
+| Qwen3-8B NVFP4 after QAT | 72.8 |
+
+The resulting exported checkpoint is also much smaller (6.4 GB) than the original BF16 checkpoint (16.4 GB).
 
 ## Usage
 
 ### Prerequisites
 
-To run the example, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.04.01 or higher using Docker/Slurm. Mount your cloned `modelopt` repository to the container by adding this mount flag to your Docker/Slurm command: `-v :/workspace/modelopt -v /modelopt:/usr/local/lib/python3.12/dist-packages/modelopt`.
+You can run the example either locally or on a [Slurm cluster](ADVANCED.md).
 
-To run SFT properly you may also need to clone NeMo and Megatron-LM at the respective commits, and mount to `/opt/NeMo` and `/opt/megatron-lm`:
+To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and the `NeMo` repository (checking out a specific commit for NeMo), then mount them onto your Docker container.
-- `git clone https://github.com/NVIDIA-NeMo/NeMo && cd NeMo && git checkout d7b87b1`
-- `git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout 8c15450`
+- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
+- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
 
-### Running the Flow
+Example docker command:
+
+```bash
+docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
+```
+
+You will also need to set your Hugging Face token with `export HF_TOKEN=`. You may also need to give the container write access to the `examples/nemo_run` folder (e.g. `chmod 777 nemo_run`) so that logs can be written.
+
+### Running the Flow Locally
+
+After launching the NeMo container with the mounts described above, follow these examples to run the flow locally.
 
 #### QAT
 
-From the `nemo_run` folder, launch the example with `python qat/nemo_qat_flow.py --model-name --finetune-recipe `. Available NeMo recipe names are listed [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/recipes). To provide your own custom dataset, use the `--data-path` flag, otherwise the default [LIMA](https://huggingface.co/datasets/GAIR/lima) dataset will be used.
+From the `nemo_run` folder, launch the example with the `qat/nemo_qat_flow.py` script. To use a model other than the default (Qwen3-8B), add the `--model-name --finetune-recipe ` flags, passing the model's Hugging Face name and one of the NeMo recipe names listed [here](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/llm/recipes). To provide your own custom dataset, use the `--data-path` flag; otherwise the default [NVIDIA OpenScience](https://huggingface.co/datasets/nvidia/OpenScience) dataset will be used.
 
 To perform QAT, run:
 
 ```bash
-python qat/nemo_qat_flow.py \
-    --model-name meta-llama/Meta-Llama-3.1-8B-Instruct \
-    --finetune-recipe llama31_8b \
-    --algorithm fp8 \
-    --chat-template llama_chat_template.txt \
-    --experiment llama3_qat_nemo
+python qat/nemo_qat_flow.py --log-dir /my/log/dir --experiment qat_experiment
 ```
 
 > **_NOTE:_** To enable KV cache quantization, add `--enable-kv-cache` and specify qformat using `--kv-cache-qformat `.
 
@@ -41,31 +92,26 @@ In order to train using QAD, launch the example with `python qat/nemo_qat_flow.p
 To perform QAD training, run:
 
 ```bash
-python qat/nemo_qat_flow.py \
-    --model-name meta-llama/Meta-Llama-3.1-8B-Instruct \
-    --distill \
-    --algorithm fp8 \
-    --chat-template llama_chat_template.txt \
-    --experiment llama3_qad_nemo
+python qat/nemo_qat_flow.py --distill --log-dir /my/log/dir --experiment qad_experiment
 ```
 
-### Custom Chat Template
+## Supported models
 
-By default the script will use the model/tokenizer's chat template, which may not contain the `{% generation %}` and `{% endgeneration %}` tags around the assistant tokens which are needed to generate the assistant loss mask (see [this PR](https://github.com/huggingface/transformers/pull/30650)). To provide path to a custom chat template, use the `--chat-template ` flag.
+Locally, this script currently supports models that can be trained on 1 node with 8 x 80GB GPUs. On Slurm, you can configure the number of nodes/GPUs for training and PTQ with the `--train-nodes`, `--train-gpus`, and `--ptq-gpus` flags, as shown in the example below.
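+
+For example, to launch the flow on Slurm with these flags set explicitly (the values below are illustrative; the defaults are 1 training node, 8 training GPUs, and 4 PTQ GPUs):
+
+```bash
+python qat/nemo_qat_flow.py --use-slurm --log-dir /slurm/log/dir --experiment qat_experiment \
+    --train-nodes 1 --train-gpus 8 --ptq-gpus 4
+```
+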
-## Flow Stages +The default configuration works on 1 node with 4 H100 GPUs for PTQ and 8 H100 GPUs for training with the following model: -Currently the Simplified Flow runs the following steps in order: +- **Model**: Qwen3-8B +- **Recipe**: qwen3_8b -1. Process LIMA data (if `--data-path` is not specified) -1. Import NeMo model checkpoint -1. PTQ the model -1. SFT (finetune) the model -1. Export model to Unified checkpoint (HuggingFace) format +### Common Errors -## Supported models +Depending on the amount of memory your GPUs have, you may get an Out of Memory error. If that happens, add flags for `--tensor_parallelism` or `--pipeline_parallelism` (e.g. `--tensor_parallelism 2`). + +### Custom Chat Template + +By default the script will use the model/tokenizer's chat template, which may not contain the `{% generation %}` and `{% endgeneration %}` tags around the assistant tokens which are needed to generate the assistant loss mask (see [this PR](https://github.com/huggingface/transformers/pull/30650)). To provide path to a custom chat template, use the `--chat-template ` flag. -Currently supports models that can be trained on 1 node with 8 x 80GB GPUs. The default configuration uses: +### Dataset limitations -- **Model**: Meta-Llama-3.1-8B-Instruct -- **Recipe**: llama31_8b +The current QAT recipe has been tuned for the Qwen3-8B model to improve accuracy on the MMLU benchmark after PTQ degradation. QAT/QAD results are highly dependent on the specific model, dataset, and hyperparameters. There is no guarantee that the same dataset will recover the accuracy of the PTQ model. Feel free to try your own model and dataset combinations and test which combination works best. diff --git a/examples/nemo_run/qat/nemo_qat_flow.py b/examples/nemo_run/qat/nemo_qat_flow.py index 5b189410..df921bd1 100644 --- a/examples/nemo_run/qat/nemo_qat_flow.py +++ b/examples/nemo_run/qat/nemo_qat_flow.py @@ -15,21 +15,26 @@ import argparse import os -from pathlib import Path +import sys import nemo_run as run from nemo.collections import llm -from nemo.collections.llm.api import export_ckpt from nemo.collections.llm.gpt.data.chat import ChatDataModule from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices from nemo.collections.llm.modelopt.recipes.distillation_recipe import distillation_recipe from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.utils import logging +from modelopt.torch.export.plugins.nemo_run import export_most_recent_ckpt -def get_parser(): +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "common"))) +from utils import SlurmConfig, create_slurm_executor, get_finetune_recipe, read_chat_template + + +def get_args(): parser = argparse.ArgumentParser( - description="NeMo2.0 QAT/QAD simplified flow. Currently supports running model locally on 1 node with 8 GPUs." + description="""NeMo2.0 QAT/QAD simplified flow. Supports running model locally on 1 node with 8 GPUs + or on a Slurm cluster with 1 or more nodes. Runs QAT on Qwen3-8B NVFP4 with the + nvidia/OpenScience dataset by default.""" ) quant_cfg_choices_list = ["no_quant", *get_quant_cfg_choices()] @@ -37,12 +42,12 @@ def get_parser(): "--model-name", type=str, help="Name of the HF model", - default="meta-llama/Meta-Llama-3.1-8B-Instruct", + default="Qwen/Qwen3-8B", ) parser.add_argument( "--finetune-recipe", type=str, - default="llama31_8b", + default="qwen3_8b", help=( "Choose NeMo 2.0 recipe. 
Recipes are named in the format of " "_(_ or other special settings)" @@ -53,6 +58,12 @@ def get_parser(): type=str, help="Path to the finetuning chat dataset. Can be either ShareGPT or HuggingFace/OpenAI chat format", ) + parser.add_argument( + "--learning-rate", + type=float, + help="Learning rate", + default=1e-5, + ) parser.add_argument( "--distill", action="store_true", @@ -71,15 +82,14 @@ def get_parser(): required=False, ) parser.add_argument( - "-algo", "--algorithm", type=str, - default="fp8", + default="nvfp4", choices=quant_cfg_choices_list, help="TensorRT-Model-Optimizer quantization algorithm", ) parser.add_argument( - "--slurm", + "--use-slurm", action="store_true", help="Run on slurm using run.SlurmExecutor", default=False, @@ -91,11 +101,32 @@ def get_parser(): default="qat_flow_ckpts", ) parser.add_argument( - "--ptq_gpus", + "--log-dir", + type=str, + help=( + "Path to the directory to store logs. Best to pass in a non-relative path so that " + "artifacts are stored in one location." + ), + default="logs", + ) + parser.add_argument( + "--ptq-gpus", type=int, help="Number of GPUs for quantization. Some models require a different number of GPUs for PTQ vs training.", + default=4, + ) + parser.add_argument( + "--train-gpus", + type=int, + help="Number of GPUs for training", default=8, ) + parser.add_argument( + "--train-nodes", + type=int, + help="Number of nodes for training. Does not apply to PTQ (assumes model will fit in 1 node)", + default=1, + ) parser.add_argument( "--kv-cache-qformat", type=str, @@ -104,91 +135,58 @@ def get_parser(): help="KV-cache quantization format", ) parser.add_argument( - "--enable_kv_cache", help="Enables KV-cache quantization", action="store_true" - ) - parser.add_argument("--disable_kv_cache", dest="enable_kv_cache", action="store_false") - parser.set_defaults(enable_kv_cache=None) - return parser - - -def get_finetune_recipe(recipe): - assert hasattr(llm, recipe), ( - f"Recipe named {recipe} not found. General format is _(_ " - "or other special settings)" + "--enable_kv_cache", + help="Enables KV-cache quantization", + action="store_true", + default=False, ) - finetune_recipe = getattr(llm, recipe).finetune_recipe - return finetune_recipe(peft_scheme=None) # TODO add dir - - -def get_most_recent_subdir(directory: str): - """ - Find the most recent subdirectory in a given directory. 
- - Args: - directory (str): Path to the directory to search in - - Returns: - str: Path to the most recent subdirectory, or None if no subdirectories exist - """ - dir_path = Path(directory) - # Get all subdirectories - subdirs = [d for d in dir_path.iterdir() if d.is_dir()] - if not subdirs: - return None + parser.add_argument("--tensor_parallelism", type=int, default=2) + parser.add_argument("--pipeline_parallelism", type=int, default=1) + return parser.parse_args() - # Sort by modification time (most recent first) - most_recent = max(subdirs, key=lambda x: x.stat().st_mtime) - return str(most_recent) - -def export_most_recent_ckpt(exp_dir: str, output_path: str): - """ - Args: - exp_dir: experiment directory - output_path: path to write exported model - """ - most_recent_exp = get_most_recent_subdir(f"{exp_dir}/default/") - if "checkpoints" in most_recent_exp: - most_recent_ckpt = most_recent_exp - else: - most_recent_ckpt = get_most_recent_subdir(f"{most_recent_exp}/checkpoints/") - logging.info(f"Exporting checkpoint from {most_recent_ckpt}") - export_ckpt(most_recent_ckpt, "hf", output_path) - - -def _read_chat_template(template_path: str): - with open(template_path) as f: - return f.read().strip() - - -if __name__ == "__main__": - args = get_parser().parse_args() +def main(args): if not args.distill and not args.finetune_recipe: raise ValueError("If distillation is not used, --finetune-recipe must be specified") model_name = args.finetune_recipe model_module = getattr(llm, model_name) if not model_name: model_name = os.path.basename(args.model_name) + exp_dir = f"{args.log_dir.rstrip('/')}/{args.experiment}" # 1. Process data - lima_data = run.Script("process_lima.py", entrypoint="python") + # TODO figure out path + # LOCALLY common/process.py works + # On slurm examples/nemo_run/common/process.py works + + openscience_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../common/process_openscience.py") + ) + openscience_data = run.Script( + openscience_path + if not args.use_slurm + else "examples/nemo_run/common/process_openscience.py", + entrypoint="python", + args=["--output-dir", exp_dir], + ) # 2. Import Model - nemo_ckpt_path = f"{model_name}-nemo" + bf16_ckpt_path = f"{exp_dir}/{model_name}-nemo" import_model = run.Partial( llm.import_ckpt, model=model_module.model(), source=f"hf://{args.model_name}", - output_path=nemo_ckpt_path, + output_path=bf16_ckpt_path, + overwrite=True, ) # 3. PTQ - ptq_model_out = f"{model_name}-{args.algorithm}" + ptq_model_out = f"{exp_dir}/{model_name}-{args.algorithm}" ptq = run.Script( "/opt/NeMo/scripts/llm/ptq.py", args=[ "-nc", - nemo_ckpt_path, + bf16_ckpt_path, "-out", ptq_model_out, "--export_format", @@ -206,64 +204,179 @@ def _read_chat_template(template_path: str): # 4. 
Train if not args.hf_tokenizer: - tokenizer_path = os.path.join(nemo_ckpt_path, "context/nemo_tokenizer") + tokenizer_path = os.path.join(bf16_ckpt_path, "context/nemo_tokenizer") tokenizer = run.Config( get_nmt_tokenizer, library="huggingface", model_name=tokenizer_path, - chat_template=_read_chat_template(args.chat_template) if args.chat_template else None, + chat_template=read_chat_template(args.chat_template) if args.chat_template else None, ) else: tokenizer = run.Config( get_nmt_tokenizer, library="huggingface", model_name=args.hf_tokenizer, - chat_template=_read_chat_template(args.chat_template) if args.chat_template else None, + chat_template=read_chat_template(args.chat_template) if args.chat_template else None, ) - data_path = args.data_path if args.data_path is not None else "lima_processed" + data_path = args.data_path if args.data_path is not None else f"{exp_dir}/openscience_proc" data = run.Config( ChatDataModule, dataset_root=data_path, - seq_length=4096, + seq_length=SEQUENCE_LENGTH, tokenizer=tokenizer, - global_batch_size=64, - micro_batch_size=1, + global_batch_size=GBS, + micro_batch_size=MBS, use_hf_tokenizer_chat_template=True, + num_workers=2, + persistent_workers=True, ) if args.distill: - train = distillation_recipe(ptq_model_out, nemo_ckpt_path) + train = distillation_recipe(ptq_model_out, bf16_ckpt_path) else: train = get_finetune_recipe(args.finetune_recipe) train.resume.restore_config.path = ptq_model_out + train.optim.config.lr = args.learning_rate train.tokenizer = "data" train.data = data - train.log.log_dir = args.experiment - train.trainer.val_check_interval = 200 - train.trainer.max_steps = 200 + train.log.log_dir = exp_dir + train.trainer.val_check_interval = VAL_INTERVAL + train.trainer.max_steps = TRAIN_STEPS + train.trainer.devices = args.train_gpus + train.trainer.num_nodes = args.train_nodes + train.trainer.limit_val_batches = 32 + train.trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism + train.trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism # 5. Export export = run.Partial( - export_most_recent_ckpt, exp_dir=train.log.log_dir, output_path=f"{model_name}_hf" + export_most_recent_ckpt, train.log.log_dir, output_path=f"{exp_dir}/{model_name}_hf" + ) + # 6. 
Evaluate MMLU + + mmlu_script_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../common/in_memory_mmlu.py") + ) + if args.use_slurm: + mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py" + eval_ptq = run.Script( + mmlu_script_path, + args=["--nemo_ckpt", ptq_model_out, "--tensor_parallelism", f"{args.ptq_gpus}"], + entrypoint="python", + ) + eval_bf16 = run.Script( + mmlu_script_path, + args=["--nemo_ckpt", bf16_ckpt_path, "--tensor_parallelism", f"{args.ptq_gpus}"], + entrypoint="python", + ) + eval_sft = run.Script( + mmlu_script_path, + args=["--finetuned_ckpt_dir", exp_dir, "--tensor_parallelism", f"{args.ptq_gpus}"], + entrypoint="python", ) - with run.Experiment(args.experiment, log_level="INFO") as exp: - ptq_executor = run.LocalExecutor(ntasks_per_node=args.ptq_gpus, launcher="torchrun") + if args.use_slurm: + cpu_executor = create_slurm_executor(SLURM_CONFIG) + ptq_gpu_executor = create_slurm_executor( + SLURM_CONFIG, num_gpus=args.ptq_gpus, ntasks_per_node=args.ptq_gpus + ) + train_gpu_executor = create_slurm_executor( + SLURM_CONFIG, num_gpus=args.train_gpus, ntasks_per_node=args.train_gpus + ) + single_gpu_executor = create_slurm_executor(SLURM_CONFIG, num_gpus=1, ntasks_per_node=1) + else: + cpu_executor = single_gpu_executor = run.LocalExecutor() + ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus) + train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus) + + with run.Experiment(exp_dir, log_level="INFO") as exp: if not args.data_path: - s0 = exp.add(lima_data, tail_logs=True, name="lima_data", executor=run.LocalExecutor()) + s0 = exp.add( + openscience_data, tail_logs=True, name="00_openscience_data", executor=cpu_executor + ) + # 1. Import BF16 model and evaluate MMLU s1 = exp.add( - import_model, tail_logs=True, name="import_model", executor=run.LocalExecutor() + import_model, tail_logs=True, name="01_import_model", executor=single_gpu_executor + ) + exp.add( + eval_bf16, + tail_logs=True, + name="02_mmlu_bf16", + executor=ptq_gpu_executor, + dependencies=[s1], + ) + + # 2. PTQ model and evaluate PTQ model + s2 = exp.add( + ptq, tail_logs=True, name="03_ptq", executor=ptq_gpu_executor, dependencies=[s1] ) - s2 = exp.add(ptq, tail_logs=True, name="ptq", executor=ptq_executor, dependencies=[s1]) - train_executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") s3 = exp.add( - train, tail_logs=True, name="train", executor=train_executor, dependencies=[s2] + eval_ptq, + tail_logs=True, + name="04_mmlu_ptq", + executor=ptq_gpu_executor, + dependencies=[s2], ) + # 3. 
Train PTQ model (QAT or QAD) + train_dep = [s3] + if not args.data_path: + train_dep.append(s0) s4 = exp.add( + train, + tail_logs=True, + name="05_train", + executor=train_gpu_executor, + dependencies=train_dep, + ) + s5 = exp.add( + eval_sft, + tail_logs=True, + name="06_mmlu_sft", + executor=ptq_gpu_executor, + dependencies=[s4], + ) + # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo + train_gpu_executor.ntasks_per_node = 1 # will throw error if more than 1 task during export + exp.add( export, tail_logs=True, - name="export_hf", - executor=run.LocalExecutor(), - dependencies=[s3], + name="07_export_hf", + executor=train_gpu_executor, + dependencies=[s5], + ) + exp.run(detach=True) + + +if __name__ == "__main__": + args = get_args() + + # # # # # # # # SLURM SETUP # # # # # # + # # # # # # MODIFY THIS # # # # # # # + if args.use_slurm: + SLURM_CONFIG = SlurmConfig( + account="", + partition_gpu="batch", + partition_cpu="cpu", + time="04:00:00", + container_image="nvcr.io/nvidia/nemo:25.07", + env_vars={ + "HF_TOKEN": "", + }, + use_local_tunnel=False, + host="", + user="", + container_mounts=[], + job_dir="/path/to/logs", + identity=None, ) - exp.run(detach=False) + + # # # # # # # # # # # # # # # # # # # # # # + # # # # # CONFIGURABLE PARAMETERS # # # # # + SEQUENCE_LENGTH = 4096 + MBS = 1 + GBS = 512 + TRAIN_STEPS = 200 + VAL_INTERVAL = 50 + # # # # # # # # # # # # # # # # # # # # # # + + main(args) diff --git a/modelopt/torch/export/plugins/nemo_run.py b/modelopt/torch/export/plugins/nemo_run.py new file mode 100644 index 00000000..63cd7fbe --- /dev/null +++ b/modelopt/torch/export/plugins/nemo_run.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Export functions for NeMo Run.""" + +from pathlib import Path + +from nemo.collections.llm.api import export_ckpt +from nemo.utils import logging + + +def export_most_recent_ckpt(directory: str, output_path: str): + """Export most recent checkpoint from a NeMo Run experiment directory.""" + most_recent_ckpt = _get_most_recent_ckpt(directory) + logging.info(f"Exporting most recent NeMo Run checkpoint: {most_recent_ckpt}") + export_ckpt( + most_recent_ckpt, + "hf", + output_path=output_path, + overwrite=True, + ) + + +def _get_most_recent_subdir(directory: Path): + # Get all subdirectories + subdirs = [d for d in directory.iterdir() if d.is_dir()] + if not subdirs: + raise ValueError(f"No subdirectories found in {directory}") + + # Sort by modification time (most recent first) + most_recent = max(subdirs, key=lambda x: x.stat().st_mtime) + + return most_recent + + +def _get_most_recent_ckpt(directory: str): + """Find the most recent checkpoint subdirectory in a given NeMo Run experiment directory. + + Args: + directory (str): Path to the directory to search in. + + Returns: + str: Path to the most recent subdirectory. 
+ """ + exp_dir = Path(directory) / "default" + if not exp_dir.exists(): + raise FileNotFoundError(f"Experiment directory {exp_dir} does not exist") + + checkpoint_dir = exp_dir / "checkpoints" + if checkpoint_dir.exists(): + most_recent = _get_most_recent_subdir(checkpoint_dir) + else: + most_recent = _get_most_recent_subdir(exp_dir) + checkpoint_dir = most_recent / "checkpoints" + if not checkpoint_dir.exists(): + raise FileNotFoundError(f"Checkpoint directory {checkpoint_dir} does not exist") + most_recent = _get_most_recent_subdir(checkpoint_dir) + + return str(most_recent)