@@ -6,7 +6,7 @@
 import torch
 from torch import Tensor

-from megatron.core import tensor_parallel
+from megatron.core import parallel_state, tensor_parallel
 from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
 from megatron.core.dist_checkpointing.mapping import ShardedStateDict
 from megatron.core.inference.contexts import BaseInferenceContext
@@ -26,9 +26,11 @@
 from megatron.core.tensor_parallel import gather_from_sequence_parallel_region
 from megatron.core.transformer.enums import CudaGraphScope, ModelType
 from megatron.core.transformer.multi_token_prediction import (
+    MTPLossAutoScaler,
+    MTPLossLoggingHelper,
     MultiTokenPredictionBlock,
-    mtp_on_this_rank,
-    process_mtp_loss,
+    roll_tensor,
+    tie_word_embeddings_state_dict,
 )
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_block import TransformerBlock
@@ -142,9 +144,7 @@ def __init__(
         self.rotary_base = rotary_base
         self.rotary_scaling = rope_scaling
         self.mtp_block_spec = mtp_block_spec
-        self.mtp_process = mtp_block_spec is not None and mtp_on_this_rank(
-            self.config, ignore_virtual=False, vp_stage=vp_stage
-        )
+        self.mtp_process = mtp_block_spec is not None

         if self.pre_process or self.mtp_process:
             self.embedding = LanguageModelEmbedding(
@@ -609,19 +609,56 @@ def _postprocess(
             return hidden_states

         if self.config.mtp_num_layers is not None:
-            hidden_states = process_mtp_loss(
-                hidden_states=hidden_states,
-                labels=labels,
-                loss_mask=loss_mask,
-                output_layer=self.output_layer,
-                output_weight=output_weight,
-                runtime_gather_output=runtime_gather_output,
-                is_training=self.training,
-                compute_language_model_loss=self.compute_language_model_loss,
-                config=self.config,
-                cp_group=self.pg_collection.cp,
-                packed_seq_params=packed_seq_params,
-            )
+            mtp_labels = labels.clone()
+            hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0)
+            hidden_states = hidden_states_list[0]
+            if loss_mask is None:
+                # If no loss_mask is provided, use all ones.
+                loss_mask = torch.ones_like(mtp_labels)
+            for mtp_layer_number in range(self.config.mtp_num_layers):
+                # Compute logits for the current Multi-Token Prediction (MTP) layer.
+                mtp_logits, _ = self.output_layer(
+                    hidden_states_list[mtp_layer_number + 1],
+                    weight=output_weight,
+                    runtime_gather_output=runtime_gather_output,
+                )
+                # Roll labels and loss mask to align them with this MTP layer's predictions.
+                mtp_labels, _ = roll_tensor(
+                    mtp_labels,
+                    shifts=-1,
+                    dims=-1,
+                    cp_group=self.cp_group,
+                    packed_seq_params=packed_seq_params,
+                )
+                loss_mask, num_tokens = roll_tensor(
+                    loss_mask,
+                    shifts=-1,
+                    dims=-1,
+                    cp_group=self.cp_group,
+                    packed_seq_params=packed_seq_params,
+                )
+                mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits)
+                mtp_loss = loss_mask * mtp_loss
+                if self.training:
+                    # TODO(shifangx): remove the use of parallel_state here
+                    # after moving loss logging to loss_func in pretrain_gpt.py
+                    MTPLossLoggingHelper.save_loss_to_tracker(
+                        torch.sum(mtp_loss) / num_tokens,
+                        mtp_layer_number,
+                        self.config.mtp_num_layers,
+                        avg_group=parallel_state.get_data_parallel_group(
+                            with_context_parallel=True
+                        ),
+                    )
+                mtp_loss_scale = self.config.mtp_loss_scaling_factor / self.config.mtp_num_layers
+                if self.config.calculate_per_token_loss:
+                    hidden_states = MTPLossAutoScaler.apply(
+                        hidden_states, mtp_loss_scale * mtp_loss
+                    )
+                else:
+                    hidden_states = MTPLossAutoScaler.apply(
+                        hidden_states, mtp_loss_scale * mtp_loss / num_tokens
+                    )
         sequence_parallel_override = False

         if in_inference_mode and inference_context.materialize_only_last_token_logits:
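The key mechanism above is `MTPLossAutoScaler.apply`: it returns `hidden_states` unchanged in the forward pass and injects the scaled gradient of the auxiliary MTP loss during backward, so each MTP head trains the shared trunk without a separate backward pass. A minimal sketch of that autograd pattern, assuming a scale attribute set by the training loop (the real class lives in `megatron/core/transformer/multi_token_prediction.py`; details here are illustrative):

```python
import torch


class AuxLossAutoScaler(torch.autograd.Function):
    """Illustrative stand-in for MTPLossAutoScaler: identity on the
    activations in forward, auxiliary-loss gradient added in backward."""

    # Assumed hook for matching the main loss's backward scaling.
    main_loss_backward_scale: torch.Tensor = torch.tensor(1.0)

    @staticmethod
    def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor) -> torch.Tensor:
        ctx.save_for_backward(aux_loss)
        return output  # activations pass through unchanged

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        (aux_loss,) = ctx.saved_tensors
        # d(aux_loss)/d(aux_loss) == 1, scaled to match the main loss.
        aux_loss_grad = torch.ones_like(aux_loss) * AuxLossAutoScaler.main_loss_backward_scale
        return grad_output, aux_loss_grad
```

With this pattern, calling `backward()` on the main loss alone also propagates the MTP gradients, because they ride along on `hidden_states`.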
@@ -678,6 +715,27 @@ def _postprocess(

         return loss

+    def shared_embedding_or_output_weight(self) -> Tensor:
+        """Gets the embedding weight or output logit weight when the input embedding and
+        output weights are shared, or when the Multi-Token Prediction (MTP) feature is used.
+
+        Returns:
+            Tensor: During pre-processing or MTP processing, returns the input embedding
+            weight. Otherwise, during post-processing, returns the final output layer weight.
+        """
+        if self.pre_process or self.mtp_process:
+            # Multi-Token Prediction (MTP) needs both the embedding layer and the output
+            # layer, so both exist in the MTP process stage. In that case, if
+            # share_embeddings_and_output_weights is True, the shared weights are stored
+            # in the embedding layer and the output layer has no weight of its own.
+            assert hasattr(
+                self, 'embedding'
+            ), "embedding is needed in this pipeline stage, but it is not initialized."
+            return self.embedding.word_embeddings.weight
+        elif self.post_process:
+            return self.output_layer.weight
+        return None
+
     def build_schedule_plan(
         self,
         input_ids: Tensor,
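For reference, the new `shared_embedding_or_output_weight` is consumed the way Megatron's usual tied-weight helper is: the caller fetches the shared weight and passes it to the output layer explicitly. A hypothetical call site (everything except the method name is an assumption):

```python
# Hypothetical call site; `model` is an instance of this GPT model class.
output_weight = None
if model.share_embeddings_and_output_weights or model.mtp_process:
    # On pre-process/MTP stages the shared weight lives in the embedding;
    # on a pure post-process stage it lives in the output layer.
    output_weight = model.shared_embedding_or_output_weight()
logits, _ = model.output_layer(hidden_states, weight=output_weight)
```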
@@ -768,4 +826,20 @@ def sharded_state_dict(
                 output_extra_state and output_extra_state.data
             ), f'Expected output layer extra state to be empty, got: {output_extra_state}'

+        # Multi-Token Prediction (MTP) needs the embedding layer in the MTP process stage.
+        # If MTP is not placed in the pre-processing stage, we must keep a copy of the
+        # embedding layer in the MTP process stage and tie it to the embedding in the
+        # pre-processing stage. The MTP loss is now computed in the post-processing
+        # stage, so the output_layer is not needed in the MTP process stage.
+        if self.mtp_process and not self.pre_process:
+            emb_weight_key = f'{prefix}embedding.word_embeddings.weight'
+            emb_weight = self.embedding.word_embeddings.weight
+            tie_word_embeddings_state_dict(
+                sharded_state_dict,
+                emb_weight,
+                emb_weight_key,
+                tp_group=self.tp_group,
+                dp_cp_group=metadata['dp_cp_group'],
+            )
+
         return sharded_state_dict
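Conceptually, tying the state dict means the MTP stage's embedding copy is re-registered under the canonical embedding key with a replica id, so the checkpoint stores a single shared tensor. A rough sketch of that idea, assuming Megatron's `make_tp_sharded_tensor_for_checkpoint` utility (the replica-id layout is an assumption; `tie_word_embeddings_state_dict` in `multi_token_prediction.py` is the real helper):

```python
from torch import Tensor

from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint


def tie_word_embeddings_sketch(
    sharded_state_dict: ShardedStateDict, weight: Tensor, weight_key: str
) -> None:
    """Illustrative only: point this stage's embedding entry at the canonical key."""
    assert weight_key in sharded_state_dict
    del sharded_state_dict[weight_key]
    # Register the local copy under the canonical key. A nonzero replica id
    # marks it as a duplicate, so only one stage's copy is written (assumption).
    sharded_state_dict[weight_key] = make_tp_sharded_tensor_for_checkpoint(
        tensor=weight,
        key='embedding.word_embeddings.weight',
        replica_id=(1, 0, 0),
    )
```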