diff --git a/examples/pruning/README.md b/examples/pruning/README.md
index e5abb4f3e..840f6d054 100644
--- a/examples/pruning/README.md
+++ b/examples/pruning/README.md
@@ -55,8 +55,8 @@ model = GPTModel(
 # For Megatron-LM framework, you can use the following utility function
 from megatron.training.training import evaluate_and_print_results

-def forward_loop(model):
-    evaluate_and_print_results(model, ...)
+def forward_loop(_):
+    evaluate_and_print_results(prefix, forward_step, train_iterator, model, ...)

 # Specify the pruning constraints (Check Support Matrix for available pruning dimensions)

@@ -66,7 +66,7 @@ export_config = {
 }

-# Run the pruning process
+# Run the pruning process (if model is a list, pass model[0] to the prune API)
 # Save minitron scores at scores_path so we can re-run pruning with different export configs without running the forward loop again
 # NOTE: Skip scores_path on re-running if you want to change the dataset and re-calibrate
 model, pruning_scores = mtp.prune(
@@ -81,7 +81,7 @@ model, pruning_scores = mtp.prune(
 If your model parameters are already sorted, you can skip the sorting step by setting `"skip_sorting": True` in `config` instead of passing `forward_loop`.

 > [!Note]
-> Fine-tuning / distillation is required after pruning to recover the accuracy. Please refer to pruning [fine-tuning](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/3_pruning.html#pruning-fine-tuning) for more details.
+> Fine-tuning / distillation is required after pruning to recover accuracy. Please refer to the [end-to-end pruning and distillation tutorial](https://github.com/NVIDIA-NeMo/NeMo/tree/main/tutorials/llm/qwen/pruning-distillation) for more details.

 ## Support Matrix

diff --git a/modelopt/torch/nas/conversion.py b/modelopt/torch/nas/conversion.py
index be5aff0c1..a93c209fb 100644
--- a/modelopt/torch/nas/conversion.py
+++ b/modelopt/torch/nas/conversion.py
@@ -26,7 +26,9 @@
 NASModeRegistry = _ModeRegistryCls("nas")


-def convert(model: ModelLike, mode: ModeLike) -> nn.Module:
+def convert(
+    model: ModelLike, mode: ModeLike, registry: _ModeRegistryCls = NASModeRegistry
+) -> nn.Module:
     """Convert a regular PyTorch model into a model that supports design space optimization.

     Args:
@@ -84,7 +86,7 @@ def convert(model: ModelLike, mode: ModeLike) -> nn.Module:
     #. Use ``*`` as a wildcard matching any layer.
     """
     # apply mode and handle model-like object with wrapper
-    return apply_mode(model, mode, registry=NASModeRegistry)
+    return apply_mode(model, mode, registry=registry)


 def export(model: nn.Module, strict: bool = True, calib: bool = False) -> nn.Module:
diff --git a/modelopt/torch/prune/pruning.py b/modelopt/torch/prune/pruning.py
index 322e7e0c0..cdc4e7d8f 100644
--- a/modelopt/torch/prune/pruning.py
+++ b/modelopt/torch/prune/pruning.py
@@ -20,7 +20,6 @@
 from torch import nn

 import modelopt.torch.nas as mtn
-from modelopt.torch.opt.conversion import apply_mode
 from modelopt.torch.opt.mode import ModeLike, _ModeRegistryCls
 from modelopt.torch.opt.searcher import ConstraintsDict, SearchConfig

@@ -199,8 +198,8 @@ def prune(
        search algorithm. The returned subnet is thus a reference to the same model instance as
        the input model.
    """
-    # apply prune mode(s) to model
-    model = apply_mode(model, mode, registry=PruneModeRegistry)
+    # apply prune mode(s) to model and convert it to a DynamicModule
+    model = mtn.convert(model, mode, registry=PruneModeRegistry)

     # now run the search and return the result
     return mtn.search(
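
Reviewer note: for context on how the pieces compose after this change, a minimal sketch follows. Only the `convert` signature and the `PruneModeRegistry` routing are taken from this diff; the toy model, the `"fastnas"` mode name, and the `PruneModeRegistry` import path are illustrative assumptions, not verified against the repo.

```python
import torch.nn as nn

import modelopt.torch.nas as mtn
from modelopt.torch.prune.pruning import PruneModeRegistry  # import path assumed

# Toy model purely for illustration.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))

# registry defaults to NASModeRegistry, so existing NAS callers such as
# mtn.convert(model, mode=...) behave exactly as before this change.
# prune() can now route its own modes through the same entry point:
model = mtn.convert(model, mode="fastnas", registry=PruneModeRegistry)  # mode name assumed
```

This keeps mode resolution in a single code path (`mtn.convert` delegating to `apply_mode`) and removes prune's direct import of `apply_mode`.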