
Commit e34c05c

ananthsub authored and eagle705 committed
Nemotron model provider + bridge (NVIDIA-NeMo#485)
* nemotron model provider
  Signed-off-by: Ananth Subramaniam <[email protected]>
* nemotron bridge
  Signed-off-by: Ananth Subramaniam <[email protected]>
* add specific providers
  Signed-off-by: Ananth Subramaniam <[email protected]>
* update imports and rebase
  Signed-off-by: Ananth Subramaniam <[email protected]>

---------

Signed-off-by: Ananth Subramaniam <[email protected]>
1 parent 85707ce commit e34c05c

File tree

8 files changed: +868 additions, 0 deletions


src/megatron/bridge/models/__init__.py

Lines changed: 17 additions & 0 deletions
@@ -73,6 +73,15 @@
     NemotronNano9Bv2Provider,
     NemotronNano12Bv2Provider,
 )
+from megatron.bridge.models.nemotron import (
+    Nemotron3ModelProvider4B,
+    Nemotron3ModelProvider8B,
+    Nemotron3ModelProvider22B,
+    Nemotron4ModelProvider15B,
+    Nemotron4ModelProvider340B,
+    NemotronBridge,
+    NemotronModelProvider,
+)
 from megatron.bridge.models.qwen import (
     Qwen2ModelProvider,
     Qwen2ModelProvider1P5B,
@@ -184,6 +193,14 @@
     "NVIDIAMambaProvider8B",
     "MistralModelProvider",
     "MistralSmall3ModelProvider24B",
+    # Nemotron Models
+    "NemotronBridge",
+    "NemotronModelProvider",
+    "Nemotron3ModelProvider4B",
+    "Nemotron3ModelProvider8B",
+    "Nemotron3ModelProvider22B",
+    "Nemotron4ModelProvider15B",
+    "Nemotron4ModelProvider340B",
     # VL Models
     "Qwen25VLModel",
     "Qwen25VLBridge",
src/megatron/bridge/models/nemotron/__init__.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from megatron.bridge.models.nemotron.nemotron_bridge import NemotronBridge
+from megatron.bridge.models.nemotron.nemotron_provider import (
+    Nemotron3ModelProvider4B,
+    Nemotron3ModelProvider8B,
+    Nemotron3ModelProvider22B,
+    Nemotron4ModelProvider15B,
+    Nemotron4ModelProvider340B,
+    NemotronModelProvider,
+)
+
+
+__all__ = [
+    "NemotronBridge",
+    "NemotronModelProvider",
+    "Nemotron3ModelProvider4B",
+    "Nemotron3ModelProvider8B",
+    "Nemotron3ModelProvider22B",
+    "Nemotron4ModelProvider15B",
+    "Nemotron4ModelProvider340B",
+]
src/megatron/bridge/models/nemotron/nemotron_bridge.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from megatron.core.models.gpt.gpt_model import GPTModel
+from transformers import NemotronForCausalLM
+
+from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
+from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge
+from megatron.bridge.models.conversion.param_mapping import (
+    AutoMapping,
+    QKVMapping,
+)
+from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM
+from megatron.bridge.models.nemotron.nemotron_provider import NemotronModelProvider
+
+
+@MegatronModelBridge.register_bridge(source=NemotronForCausalLM, target=GPTModel)
+class NemotronBridge(MegatronModelBridge):
+    """
+    Megatron Bridge for Nemotron Causal LM.
+
+    As a user you would not use this bridge directly, but through `AutoBridge`.
+
+    Example:
+        >>> from megatron.bridge import AutoBridge
+        >>> bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-4-340B-Instruct")
+        >>> provider = bridge.to_megatron_provider()
+    """
+
+    def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> NemotronModelProvider:
+        hf_config = hf_pretrained.config
+
+        provider = NemotronModelProvider(
+            num_layers=hf_config.num_hidden_layers,
+            hidden_size=hf_config.hidden_size,
+            ffn_hidden_size=hf_config.intermediate_size,
+            num_attention_heads=hf_config.num_attention_heads,
+            init_method_std=hf_config.initializer_range,
+            layernorm_epsilon=hf_config.norm_eps,
+            num_query_groups=hf_config.num_key_value_heads,
+            seq_length=hf_config.max_position_embeddings,
+            rotary_base=hf_config.rope_theta,
+            rotary_percent=hf_config.partial_rotary_factor,
+            kv_channels=getattr(hf_config, "head_dim", None),
+            make_vocab_size_divisible_by=self.make_vocab_size_divisible_by(hf_config.vocab_size),
+            share_embeddings_and_output_weights=getattr(hf_config, "tie_word_embeddings", False),
+            fp16=(self.dtype_from_hf(hf_config, default=torch.float32) == torch.float16),
+            bf16=(self.dtype_from_hf(hf_config, default=torch.float32) == torch.bfloat16),
+            params_dtype=self.dtype_from_hf(hf_config, default=torch.float32),
+            generation_config=hf_pretrained.generation_config,
+            vocab_size=hf_config.vocab_size,
+        )
+
+        return provider
+
+    def mapping_registry(self) -> MegatronMappingRegistry:
+        # Return MegatronMappingRegistry containing parameter mappings from Megatron to HF format.
+        # First create simple 1:1 parameter mappings using a dictionary for readability.
+
+        # The dictionary maps Megatron parameter names -> HF parameter names and
+        # supports wildcard (*) patterns for layer-specific parameters.
+        param_mappings = {
+            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
+            "output_layer.weight": "lm_head.weight",
+            "decoder.final_layernorm.weight": "model.norm.weight",
+            "decoder.final_layernorm.bias": "model.norm.bias",
+            "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
+            "decoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "model.layers.*.input_layernorm.bias",
+            "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
+            "decoder.layers.*.mlp.linear_fc1.layer_norm_bias": "model.layers.*.post_attention_layernorm.bias",
+            "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
+            "decoder.layers.*.mlp.linear_fc1.weight": "model.layers.*.mlp.up_proj.weight",
+            "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
+        }
+
+        mapping_list = []
+        # Convert each dictionary entry to AutoMapping(megatron_param, hf_param)
+        for megatron_param, hf_param in param_mappings.items():
+            mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))
+
+        # Add special mappings that require parameter concatenation/transformation
+        mapping_list.extend(
+            [
+                # QKV: combine separate Q, K, V matrices into a single QKV matrix
+                QKVMapping(
+                    megatron_param="decoder.layers.*.self_attention.linear_qkv.weight",
+                    q="model.layers.*.self_attn.q_proj.weight",
+                    k="model.layers.*.self_attn.k_proj.weight",
+                    v="model.layers.*.self_attn.v_proj.weight",
+                ),
+            ]
+        )
+
+        return MegatronMappingRegistry(*mapping_list)
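
As the class docstring notes, users reach this bridge through AutoBridge rather than instantiating it directly. A hedged sketch of that flow, expanding the docstring example (assumes megatron.bridge and transformers are installed and the Hugging Face checkpoint is accessible):

from megatron.bridge import AutoBridge

# AutoBridge dispatches to NemotronBridge because register_bridge ties it to
# NemotronForCausalLM (source) and GPTModel (target).
bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-4-340B-Instruct")

# provider_bridge() maps HF config fields (num_hidden_layers, intermediate_size,
# rope_theta, partial_rotary_factor, ...) onto a NemotronModelProvider.
provider = bridge.to_megatron_provider()
print(type(provider).__name__, provider.num_layers, provider.rotary_percent)

The mapping_registry() above then drives weight conversion: each AutoMapping handles a 1:1 tensor rename, while the single QKVMapping fuses the separate q_proj/k_proj/v_proj weights into Megatron's packed linear_qkv parameter.
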
src/megatron/bridge/models/nemotron/nemotron_provider.py

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Callable, Optional
+
+import torch
+
+from megatron.bridge.models.gpt_provider import GPTModelProvider
+from megatron.bridge.utils import fusions
+
+
+logger = logging.getLogger(__name__)
+
+
+def squared_relu(x):
+    """Squared ReLU activation function."""
+    return torch.pow(torch.nn.functional.relu(x), 2)
+
+
+@dataclass
+class NemotronModelProvider(GPTModelProvider):
+    """Configuration class for Nemotron models."""
+
+    # configs that are common across model sizes
+    normalization: str = "LayerNorm"
+    activation_func: Callable = squared_relu
+    position_embedding_type: str = "rope"
+    share_embeddings_and_output_weights: bool = False
+    add_bias_linear: bool = False
+
+    hidden_dropout: float = 0.0
+    attention_dropout: float = 0.0
+    rotary_percent: float = 0.5
+    masked_softmax_fusion: bool = field(default_factory=fusions.can_enable_masked_softmax_fusion)
+    persist_layer_norm: bool = True
+    bias_dropout_add_fusion: bool = False
+    layernorm_zero_centered_gamma: bool = True
+    cross_entropy_loss_fusion: bool = True
+    apply_rope_fusion: bool = field(default_factory=fusions.can_enable_apply_rope_fusion)
+
+    # Nemotron3Config4B as default configs
+    num_layers: int = 32
+    seq_length: int = 4096
+    hidden_size: int = 3072
+    ffn_hidden_size: int = 9216
+    num_attention_heads: int = 24
+    num_query_groups: Optional[int] = 8
+    kv_channels: Optional[int] = 128
+    init_method_std: float = 0.0134
+
+    # Data type settings to match HF models
+    bf16: bool = True
+    fp16: bool = False
+    params_dtype: torch.dtype = torch.bfloat16
+    autocast_dtype: torch.dtype = torch.bfloat16
+
+
+@dataclass
+class Nemotron3ModelProvider4B(NemotronModelProvider):
+    """
+    Configuration class for the Nemotron3 4B model, inheriting from NemotronModelProvider.
+    """
+
+    num_layers: int = 32
+    seq_length: int = 4096
+    hidden_size: int = 3072
+    ffn_hidden_size: int = 9216
+    num_attention_heads: int = 24
+    num_query_groups: int = 8
+    kv_channels: Optional[int] = 128
+    init_method_std: float = 0.0134
+
+
+@dataclass
+class Nemotron3ModelProvider8B(NemotronModelProvider):
+    """
+    Configuration class for the Nemotron3 8B model, inheriting from NemotronModelProvider.
+    """
+
+    num_layers: int = 32
+    seq_length: int = 4096
+    hidden_size: int = 4096
+    ffn_hidden_size: int = 16384
+    num_attention_heads: int = 32
+    num_query_groups: Optional[int] = None
+    kv_channels: Optional[int] = None
+    init_method_std: float = 0.010
+
+
+@dataclass
+class Nemotron3ModelProvider22B(NemotronModelProvider):
+    """
+    Configuration class for the Nemotron3 22B model, inheriting from NemotronModelProvider.
+    """

+    num_layers: int = 40
+    seq_length: int = 4096
+    hidden_size: int = 6144
+    ffn_hidden_size: int = 24576
+    num_attention_heads: int = 48
+    num_query_groups: Optional[int] = None
+    kv_channels: Optional[int] = None
+    init_method_std: float = 0.008
+
+
+@dataclass
+class Nemotron4ModelProvider15B(NemotronModelProvider):
+    """
+    Configuration class for the Nemotron4 15B model, inheriting from NemotronModelProvider.
+    """
+
+    num_layers: int = 32
+    seq_length: int = 4096
+    hidden_size: int = 6144
+    ffn_hidden_size: int = 24576
+    num_attention_heads: int = 48
+    num_query_groups: Optional[int] = 8
+    kv_channels: Optional[int] = None
+    init_method_std: float = 0.0134
+
+
+@dataclass
+class Nemotron4ModelProvider340B(NemotronModelProvider):
+    """
+    Configuration class for the Nemotron4 340B model, inheriting from NemotronModelProvider.
+    """
+
+    num_layers: int = 96
+    seq_length: int = 4096
+    hidden_size: int = 18432
+    ffn_hidden_size: int = 73728
+    num_attention_heads: int = 96
+    num_query_groups: Optional[int] = 8
+    kv_channels: Optional[int] = None
+    init_method_std: float = 0.0063
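
The base NemotronModelProvider defaults mirror the Nemotron3 4B shape, and each subclass overrides only the size-specific fields; the activation shared by every size is squared ReLU. A small illustrative check (assumes torch and the package from this commit are importable; plain dataclass defaults are read as class attributes, so nothing has to be instantiated):

import torch

from megatron.bridge.models.nemotron.nemotron_provider import (
    Nemotron3ModelProvider8B,
    NemotronModelProvider,
    squared_relu,
)

# squared_relu(x) == relu(x) ** 2, as defined above.
x = torch.tensor([-1.0, 0.5, 2.0])
print(squared_relu(x))  # tensor([0.0000, 0.2500, 4.0000])

# Defaults follow the Nemotron3 4B shape; subclasses override only what differs.
print(NemotronModelProvider.hidden_size, NemotronModelProvider.ffn_hidden_size)        # 3072 9216
print(Nemotron3ModelProvider8B.hidden_size, Nemotron3ModelProvider8B.ffn_hidden_size)  # 4096 16384
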
