diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index 8a8d70494..52a7b7ed6 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -13,12 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Plugin to add NAS/Pruning support for megatron-core GPT model."""
+"""Plugin to add NAS/Pruning support for megatron-core Language models like GPT and Mamba."""
 
 import types
 from collections.abc import Callable, Sequence
 from typing import Any
-from warnings import warn
 
 import torch
 import torch.nn as nn
@@ -98,7 +97,7 @@ except ImportError:
     HAS_MAMBA = False
 
 
-__all__ = ["drop_mcore_gpt_layers", "drop_mcore_language_model_layers"]
+__all__ = ["drop_mcore_language_model_layers"]
 
 
 class _DynamicParallelLinear(DynamicModule):
@@ -1457,15 +1456,6 @@ def drop_mcore_language_model_layers(model: nn.Module, *, layers_to_drop: list[i
     model.config.num_layers = new_num_layers
 
 
-def drop_mcore_gpt_layers(model: nn.Module, *, layers_to_drop: list[int]) -> None:
-    """[DEPRECATED] Remove given layers (1-indexed) of the model (works with TP and/or PP)."""
-    warn(
-        "`drop_mcore_gpt_layers` is deprecated in favor of `drop_mcore_language_model_layers`.",
-        DeprecationWarning,
-    )
-    drop_mcore_language_model_layers(model, layers_to_drop=layers_to_drop)
-
-
 class MegatronConstraintsFunc(ConstraintsFunc):
     """A Functor class to check if sub-net satisfied all provided constraints.
 
diff --git a/modelopt/torch/prune/plugins/mcore_minitron.py b/modelopt/torch/prune/plugins/mcore_minitron.py
index 59e4ddb39..6d3dfe6eb 100644
--- a/modelopt/torch/prune/plugins/mcore_minitron.py
+++ b/modelopt/torch/prune/plugins/mcore_minitron.py
@@ -27,23 +27,35 @@
 import copy
 
 import torch
+import torch.nn as nn
 from pydantic import create_model
 
 # isort: off
-# import nas plugin to check if it is enabled else raises an Exception
+# import nas plugin to check if it is enabled else raises an Exception and disables the plugin
 from modelopt.torch.nas.plugins.megatron import *  # noqa: F403
-from modelopt.torch.nas.plugins.megatron import HAS_MAMBA, _DynamicMCoreLanguageModel
+from modelopt.torch.nas.plugins.megatron import (
+    HAS_MAMBA,
+    _DynamicMCoreLanguageModel,
+    SUPPORTED_MODELS,
+)
 # isort: on
 
 from modelopt.torch.nas.conversion import NASModeRegistry
 from modelopt.torch.nas.registry import DMRegistry
-from modelopt.torch.nas.utils import sort_parameters
+from modelopt.torch.nas.utils import get_subnet_config, sort_parameters
 from modelopt.torch.opt.config import ModeloptBaseConfig, get_kwargs_for_create_model_with_rules
+from modelopt.torch.opt.conversion import ApplyModeError
+from modelopt.torch.opt.dynamic import DynamicSpace
+from modelopt.torch.opt.mode import (
+    ConvertEntrypoint,
+    ConvertReturnType,
+    ModeDescriptor,
+    RestoreEntrypoint,
+)
 from modelopt.torch.opt.searcher import BaseSearcher, SearchConfig, SearchStateDict
 from modelopt.torch.opt.utils import named_hparams
 from modelopt.torch.utils import print_rank_0
 
-from ..fastnas import FastNASModeDescriptor
 from ..pruning import PruneModeRegistry
 
 SUPPORTED_HPARAMS = {
@@ -58,6 +70,8 @@
     "num_layers",
 }
 
+__all__ = ["MCoreMinitronConfig", "MCoreMinitronModeDescriptor", "MCoreMinitronSearcher"]
+
 
 class MCoreMinitronSearcher(BaseSearcher):
     """Searcher for Minitron pruning algorithm."""
@@ -218,9 +232,48 @@ def run_search(self) -> None:
         )
 
 
+def _convert_model_to_dynamic_space(
+    model: nn.Module, config: ModeloptBaseConfig | None = None
+) -> DynamicSpace:
+    """Create a dynamic space for the model (in-place)."""
+    dynamic_space = DynamicSpace(model)
+    dynamic_space._should_be_converted = lambda mod: isinstance(mod, tuple(SUPPORTED_MODELS.keys()))
+    dynamic_space.convert_to_dynamic(config.model_dump() if config else None, DMRegistry)
+    if not dynamic_space.is_configurable():
+        raise ApplyModeError(
+            "The model does not contain any configurable hyperparameters! Please check the"
+            " documentation for modules and config and how to get a configurable model."
+        )
+
+    return dynamic_space
+
+
+def convert_mcore_minitron(model: nn.Module, config: ModeloptBaseConfig) -> ConvertReturnType:
+    """Convert the model to the dynamic search space (in-place) and return the converted model and metadata.
+
+    This is a simplified version of convert_fastnas_searchspace that removes the automated recursive tracing
+    and instead directly converts the top-level model to a DynamicModule. Submodules should not need to be explicitly
+    converted as that happens from the top-level model.
+    """
+    _convert_model_to_dynamic_space(model, config)
+
+    # store current config in metadata
+    metadata = {"subnet_config": get_subnet_config(model)}
+
+    # return converted model as well as metadata
+    return model, metadata
+
+
+def restore_mcore_minitron(
+    model: nn.Module, config: ModeloptBaseConfig, metadata: dict
+) -> nn.Module:
+    """Restore the model to the original state."""
+    return convert_mcore_minitron(model, config)[0]
+
+
 @NASModeRegistry.register_mode
 @PruneModeRegistry.register_mode
-class MCoreMinitronModeDescriptor(FastNASModeDescriptor):
+class MCoreMinitronModeDescriptor(ModeDescriptor):
     """Class to describe the ``"mcore_minitron"`` mode.
 
     The properties of this mode can be inspected via the source code.
@@ -236,7 +289,27 @@ def config_class(self) -> type[ModeloptBaseConfig]:
         """Specifies the config class for the mode."""
         return MCoreMinitronConfig
 
+    @property
+    def next_modes(self) -> set[str] | None:
+        """Modes that must immediately follow this mode."""
+        return {"export", "kd_loss", "quantize", "sparse_magnitude", "sparse_gpt"}
+
+    @property
+    def export_mode(self) -> str | None:
+        """The mode that corresponds to the export mode of this mode."""
+        return "export"
+
     @property
     def search_algorithm(self) -> type[BaseSearcher]:
-        """Specifies the search algorithm to use for this mode (if any)."""
+        """Specifies the search algorithm to use for this mode."""
         return MCoreMinitronSearcher
+
+    @property
+    def convert(self) -> ConvertEntrypoint:
+        """The mode's entrypoint for converting a model to a search space."""
+        return convert_mcore_minitron
+
+    @property
+    def restore(self) -> RestoreEntrypoint:
+        """The mode's entrypoint for restoring a model with the modelopt_state."""
+        return restore_mcore_minitron
diff --git a/modelopt/torch/trace/plugins/__init__.py b/modelopt/torch/trace/plugins/__init__.py
index e0e03e9fa..d5ffe3d7c 100644
--- a/modelopt/torch/trace/plugins/__init__.py
+++ b/modelopt/torch/trace/plugins/__init__.py
@@ -15,20 +15,7 @@
 
 """Handles tracing plugins for third-party modules."""
 
-import warnings as _warnings
+from modelopt.torch.utils import import_plugin
 
-try:
-    from .megatron import *
-
-except ImportError:
-    pass
-except Exception as e:
-    _warnings.warn(f"Failed to import megatron plugin due to: {e!r}")
-
-try:
+with import_plugin("transformers"):
     from .transformers import *
-
-except ImportError:
-    pass
-except Exception as e:
-    _warnings.warn(f"Failed to import transformers plugin due to: {e!r}")
diff --git a/modelopt/torch/trace/plugins/megatron.py b/modelopt/torch/trace/plugins/megatron.py
deleted file mode 100644
index b385e5426..000000000
--- a/modelopt/torch/trace/plugins/megatron.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Plugins for tracing Megatron modules."""
-
-from megatron.core.models.gpt import GPTModel
-
-from ..symbols import Symbol, SymInfo, SymMap
-
-try:
-    from megatron.core.models.mamba import MambaModel
-
-    HAS_MAMBA = True
-except ImportError:
-    HAS_MAMBA = False
-
-
-# NOTE: No need to register symbols for VocabParallelEmbedding, SelfAttention, MLP, LayerNorm, Row/Col Parallel Linear,
-# etc. as they are not traced and manually handled in the _DynamicMCoreLanguageModel class
-@SymMap.register([GPTModel] + ([MambaModel] if HAS_MAMBA else []))
-def get_megatron_language_model_sym_info(mod) -> SymInfo:
-    """Get symbol information for ``GPTModel`` and ``MambaModel`` layers."""
-    hidden_size = Symbol(is_searchable=True)
-    return SymInfo(is_shape_preserving=True, hidden_size=hidden_size)
diff --git a/tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py b/tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
index 7ddf59f3b..6a4641f32 100644
--- a/tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
+++ b/tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
@@ -48,8 +48,9 @@
     _DynamicVocabParallelEmbedding,
     expand_head_indices,
 )
-from modelopt.torch.nas.search_space import generate_search_space
+from modelopt.torch.nas.registry import DMRegistry
 from modelopt.torch.opt.utils import named_dynamic_modules, search_space_size
+from modelopt.torch.prune.plugins.mcore_minitron import _convert_model_to_dynamic_space
 from modelopt.torch.utils import flatten_tree
 from modelopt.torch.utils.random import centroid
 
@@ -178,7 +179,7 @@ def _test_gpt_parameter_sorting(activation_func, rank, size):
             m.weight.data = torch.randn_like(m.weight)
     model.eval()
 
-    search_space = generate_search_space(model)
+    dynamic_space = _convert_model_to_dynamic_space(model)
 
     # Compute activations for sorting
     for _ in range(5):
@@ -188,18 +189,18 @@
     prompt_tokens = torch.randint(0, vocab_size, (batch_size, max_sequence_length)).cuda()
     y1 = run_mcore_inference(model, prompt_tokens)
 
-    search_space.sort_parameters()
+    mtn.utils.sort_parameters(model)
 
     # check if all ffn_hidden_size, num_heads_per_group, num_query_groups, hidden_size have been sorted
     sortable_per_pp = [
-        n for n, hp in search_space.named_hparams(configurable=True) if hp.importance is not None
+        n for n, hp in dynamic_space.named_hparams(configurable=True) if hp.importance is not None
     ]
     # 3 hps per layer + 1 for hidden_size (num_layers is not sorted!)
     assert len(sortable_per_pp) == 3 * num_layers // size + 1
 
     # Export since sorting force reassigns SelfAttention weights which we dont want to re-sort!
     # TODO: ideally we shouldn't need this
-    search_space.export()
+    dynamic_space.export(DMRegistry)
 
     # sanity check if the model functionality is preserved after sorting
     y2 = run_mcore_inference(model, prompt_tokens)
diff --git a/tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py b/tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
index 179d3249d..70ca72f17 100644
--- a/tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
+++ b/tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
@@ -43,9 +43,9 @@
     _DynamicRowParallelLinear,
     _DynamicVocabParallelEmbedding,
 )
-from modelopt.torch.nas.search_space import generate_search_space
 from modelopt.torch.nas.traced_hp import TracedHp
 from modelopt.torch.opt.utils import named_dynamic_modules, search_space_size
+from modelopt.torch.prune.plugins.mcore_minitron import _convert_model_to_dynamic_space
 from modelopt.torch.utils import flatten_tree
 from modelopt.torch.utils.random import centroid
 
@@ -163,7 +163,7 @@ def _test_mamba_parameter_sorting(rank, size):
             m.weight.data = torch.randn_like(m.weight)
     model.eval()
 
-    search_space = generate_search_space(model)
+    dynamic_space = _convert_model_to_dynamic_space(model)
 
     # Compute activations for sorting
     for _ in range(5):
@@ -173,11 +173,11 @@
     prompt_tokens = torch.randint(0, vocab_size, (batch_size, max_sequence_length)).cuda()
    y1 = run_mcore_inference(model, prompt_tokens)
 
-    search_space.sort_parameters()
+    dynamic_space.sort_parameters()
 
     # check if all mamba_num_heads, mamba_head_dim, hidden_size have been sorted
     sortable_per_pp = [
-        n for n, hp in search_space.named_hparams(configurable=True) if hp.importance is not None
+        n for n, hp in dynamic_space.named_hparams(configurable=True) if hp.importance is not None
     ]
     # 2 mamba hps per layer + 1 for hidden_size (num_layers is not sorted!)
     assert len(sortable_per_pp) == 2 * num_layers // size + 1
diff --git a/tests/unit/torch/trace/test_symbol.py b/tests/unit/torch/trace/test_symbol.py
index 1396912fc..1eaec3c24 100644
--- a/tests/unit/torch/trace/test_symbol.py
+++ b/tests/unit/torch/trace/test_symbol.py
@@ -139,20 +139,6 @@ def test_sym_map_registry():
     except ImportError:
         pass
 
-    try:
-        from megatron.core.models.gpt import GPTModel
-
-        mods_in_registry.add(GPTModel)
-    except ImportError:
-        pass
-
-    try:
-        from megatron.core.models.mamba import MambaModel
-
-        mods_in_registry.add(MambaModel)
-    except ImportError:
-        pass
-
     not_a_leaf = {nn.Sequential}
     dependent_registry = set()
 
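
Migration note for downstream callers: `drop_mcore_gpt_layers` is deleted here without a further deprecation cycle. Its body was already a plain pass-through, so the replacement is a one-line rename; in the sketch below, `model` is a placeholder for a TP/PP-sharded MCore language model built elsewhere.

    from modelopt.torch.nas.plugins.megatron import drop_mcore_language_model_layers

    model = ...  # placeholder: an MCore GPTModel or MambaModel instance

    # Previously: drop_mcore_gpt_layers(model, layers_to_drop=[1, 4, 5])
    # layers_to_drop stays 1-indexed and the function still works with TP and/or PP.
    drop_mcore_language_model_layers(model, layers_to_drop=[1, 4, 5])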
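
A minimal usage sketch (not part of the patch) of how the new convert/restore entrypoints wired into `MCoreMinitronModeDescriptor` fit together. Again `model` is a placeholder for an already-built MCore `GPTModel`/`MambaModel`, and constructing `MCoreMinitronConfig()` with its default rules is an assumption.

    from modelopt.torch.prune.plugins.mcore_minitron import (
        MCoreMinitronConfig,
        convert_mcore_minitron,
        restore_mcore_minitron,
    )

    model = ...  # placeholder: an MCore GPTModel or MambaModel instance
    config = MCoreMinitronConfig()  # assumption: default-constructed rules config

    # convert: wraps the top-level model as a DynamicModule (no recursive tracing)
    # and records the current subnet config in the returned metadata.
    model, metadata = convert_mcore_minitron(model, config)

    # restore: per this patch, simply re-runs the same in-place conversion;
    # metadata["subnet_config"] recorded by convert is handled by the mode framework.
    model = restore_mcore_minitron(model, config, metadata)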