Commit 1c063fc

yuzhongw-nvidia authored and ko3n1g committed

re-apply following commit properly post merge with main

[dev] feat(moe): Cherry-pick NVIDIA#1989 back to dev (NVIDIA#3011)

Signed-off-by: oliver könig <[email protected]>
Co-authored-by: oliver könig <[email protected]>
1 parent 08357d8 commit 1c063fc

File tree

3 files changed: +62 −23 lines changed


gpt_builders.py

Lines changed: 14 additions & 7 deletions
@@ -11,6 +11,7 @@
 )
 from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
     get_transformer_block_with_experimental_attention_variant_spec,
+    get_transformer_layer_with_experimental_attention_variant_spec,
 )
 from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import (
     get_gpt_heterogeneous_layer_spec,
@@ -76,13 +77,19 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
             mtp_transformer_layer_spec = import_module(args.spec)
         else:
             # Define the decoder block spec
-            decoder_layer_specs = get_gpt_decoder_layer_specs(
-                config,
-                use_transformer_engine=use_te,
-                normalization=args.normalization,
-                qk_l2_norm=args.qk_l2_norm,
-                vp_stage=vp_stage,
-            )
+            if args.experimental_attention_variant is not None:
+                decoder_layer_specs = (
+                    get_transformer_layer_with_experimental_attention_variant_spec(
+                        config=config
+                    )
+                )
+            else:
+                decoder_layer_specs = get_gpt_decoder_layer_specs(
+                    config,
+                    use_transformer_engine=use_te,
+                    normalization=args.normalization,
+                    qk_l2_norm=args.qk_l2_norm,
+                )
             mtp_transformer_layer_spec = decoder_layer_specs[-1]
         # Use spec of the last layer in decoder block as spec of the transformer layer in MTP
         mtp_block_spec = get_gpt_mtp_block_spec(
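
To illustrate the dispatch this hunk introduces, here is a minimal, self-contained sketch. The two spec builders below are hypothetical stand-ins for get_transformer_layer_with_experimental_attention_variant_spec and get_gpt_decoder_layer_specs (not the real Megatron helpers), and "linear_attention" is an illustrative value; the point is only that both branches yield a per-layer list, so the existing decoder_layer_specs[-1] lookup for the MTP layer spec keeps working unchanged.

# Hedged sketch of the branch above; the spec builders are stand-ins, not the
# real Megatron helpers, and "linear_attention" is an illustrative value.
from types import SimpleNamespace

def experimental_layer_specs(config):  # stand-in for the new layer-spec helper
    return [f"experimental_layer_{i}" for i in range(config.num_layers)]

def standard_layer_specs(config):      # stand-in for get_gpt_decoder_layer_specs
    return [f"standard_layer_{i}" for i in range(config.num_layers)]

def build_decoder_layer_specs(args, config):
    # Route to the experimental-attention path only when a variant is requested.
    if args.experimental_attention_variant is not None:
        return experimental_layer_specs(config)
    return standard_layer_specs(config)

config = SimpleNamespace(num_layers=4)
args = SimpleNamespace(experimental_attention_variant="linear_attention")
specs = build_decoder_layer_specs(args, config)
mtp_transformer_layer_spec = specs[-1]  # last layer's spec feeds the MTP block
print(mtp_transformer_layer_spec)       # -> experimental_layer_3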

megatron/core/models/gpt/experimental_attention_variant_module_specs.py

Lines changed: 47 additions & 12 deletions
@@ -149,12 +149,12 @@ def get_experimental_attention_variant_module_spec(
 ##########
 
 
-def get_transformer_block_with_experimental_attention_variant_spec(
-    config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
-) -> TransformerBlockSubmodules:
-    """Build transformer block spec with experimental attention variants (e.g., linear attention).
+def get_transformer_layer_with_experimental_attention_variant_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> List[ModuleSpec]:
+    """Build transformer layer specs with experimental attention variants (e.g., linear attention).
 
-    This function constructs a heterogeneous transformer block that supports mixing different
+    This function is for constructing a heterogeneous transformer that supports mixing different
     attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers.
     **Note that, this API is a experimental API in the short term, and might be deprecated in the
     future. In the long run, we will move to a new design that better support hybrid models.**
@@ -170,22 +170,19 @@ def get_transformer_block_with_experimental_attention_variant_spec(
     2. Per-Layer Spec Construction: Iterates through layers, constructing transformer
        layer specs based on attention and MLP patterns.
 
-    3. Pipeline Slicing: Extracts layer specs for the current pipeline stage.
-
     Args:
         config: Transformer configuration containing model hyperparameters and feature flags.
-        vp_stage: Virtual pipeline stage index for interleaved pipeline parallelism.
-        pp_rank: Pipeline model parallel rank.
 
     Returns:
-        TransformerBlockSubmodules containing per-layer specs and final layer norm.
+        List[ModuleSpec] containing per-layer specs.
 
     Note:
         Currently only supports transformer_engine backend. Kitchen backend can be used as a
         wrapper with TE fallback for unsupported operations.
     """
 
-    backend = _get_backend_spec_provider(config=config)
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
 
     # Get attention patterns and specs
     experimental_attention_pattern = [0] * config.num_layers
@@ -257,6 +254,42 @@ def get_transformer_block_with_experimental_attention_variant_spec(
             )
         )
 
+    return layer_specs
+
+
+def get_transformer_block_with_experimental_attention_variant_spec(
+    config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
+) -> TransformerBlockSubmodules:
+    """Build transformer block spec with experimental attention variants (e.g., linear attention).
+
+    This function constructs a heterogeneous transformer block that supports mixing different
+    attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers.
+    **Note that, this API is a experimental API in the short term, and might be deprecated in the
+    future. In the long run, we will move to a new design that better support hybrid models.**
+
+    Constructing transformer layer specs by
+    `get_transformer_layer_with_experimental_attention_variant_spec` and then slicing the
+    layer specs to only include the layers that are built in this pipeline stage.
+
+    Args:
+        config: Transformer configuration containing model hyperparameters and feature flags.
+        vp_stage: Virtual pipeline stage index for interleaved pipeline parallelism.
+        pp_rank: Pipeline model parallel rank.
+
+    Returns:
+        TransformerBlockSubmodules containing per-layer specs and final layer norm.
+
+    Note:
+        Currently only supports transformer_engine backend. Kitchen backend can be used as a
+        wrapper with TE fallback for unsupported operations.
+    """
+
+    backend = _get_backend_spec_provider(config=config)
+
+    layer_specs = get_transformer_layer_with_experimental_attention_variant_spec(
+        config=config, backend=backend
+    )
+
     # Slice the layer specs to only include the layers that are built in this pipeline stage.
     if config.pipeline_model_parallel_layout is not None:
         local_layer_ids = config.pipeline_model_parallel_layout.get_layer_id_list(
@@ -270,6 +303,7 @@ def get_transformer_block_with_experimental_attention_variant_spec(
         layer_specs = [layer_specs[layer_id] for layer_id in local_layer_ids]
 
     # Get GPT decoder block spec
+    rms_norm = config.normalization == "RMSNorm"
     gpt_decoder_block_spec = TransformerBlockSubmodules(
         layer_specs=layer_specs, layer_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False)
     )
@@ -359,7 +393,7 @@ def _get_backend_spec_provider(config: TransformerConfig) -> BackendSpecProvider
         )
     backend: BackendSpecProvider = (
         KitchenSpecProvider(
-            fallback=TESpecProvider(),
+            fallback=TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn),
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
         )
@@ -396,6 +430,7 @@ def _get_self_attention_module_spec(
         qk_l2_norm=config.qk_l2_norm,
         use_kitchen=config.use_kitchen,
         use_te_activation_func=config.use_te_activation_func,
+        fallback_to_eager_attn=config.fallback_to_eager_attn,
         use_kitchen_attention=config.use_kitchen_attention,
         kitchen_attention_backend=config.kitchen_attention_backend,
     )
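
The refactor in this file splits per-layer spec construction (get_transformer_layer_with_experimental_attention_variant_spec, which now accepts an optional pre-resolved backend) from block assembly (get_transformer_block_with_experimental_attention_variant_spec, which slices the layer list down to the current pipeline stage and attaches the final layer norm). Below is a minimal, self-contained sketch of that structure; the names are illustrative stand-ins, not Megatron's real BackendSpecProvider, ModuleSpec, or TransformerBlockSubmodules types.

# Illustrative stand-ins only; the real signatures live in
# experimental_attention_variant_module_specs.py.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class BlockSubmodules:                 # stand-in for TransformerBlockSubmodules
    layer_specs: List[str]
    layer_norm: str

def resolve_backend(config) -> str:    # stand-in for _get_backend_spec_provider
    return "te"

def layer_specs_fn(config, backend: Optional[str] = None) -> List[str]:
    # Layer-level helper: resolve the backend only if the caller did not pass
    # one, then build one spec per layer; no pipeline slicing happens here.
    if backend is None:
        backend = resolve_backend(config)
    return [f"{backend}_layer_{i}" for i in range(config["num_layers"])]

def block_spec_fn(config, local_layer_ids: Optional[List[int]] = None) -> BlockSubmodules:
    # Block-level helper: delegate to the layer-level helper with a shared
    # backend, keep only this pipeline stage's layers, attach the final norm.
    backend = resolve_backend(config)
    specs = layer_specs_fn(config, backend=backend)
    if local_layer_ids is not None:
        specs = [specs[i] for i in local_layer_ids]
    rms_norm = config["normalization"] == "RMSNorm"
    return BlockSubmodules(specs, layer_norm=f"{backend}_norm(rms={rms_norm})")

cfg = {"num_layers": 8, "normalization": "RMSNorm"}
print(block_spec_fn(cfg, local_layer_ids=[4, 5, 6, 7]).layer_specs)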

megatron/core/models/gpt/gpt_layer_specs.py

Lines changed: 1 addition & 4 deletions
@@ -618,6 +618,7 @@ def get_gpt_decoder_block_spec(
     layer_specs = get_gpt_decoder_layer_specs(
         config, use_transformer_engine, normalization, qk_l2_norm
     )
+
     # Slice the layer specs to only include the layers that are built in this pipeline stage.
     # Note: MCore layer_number starts at 1
     num_layers_to_build = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank)
@@ -637,10 +638,6 @@ def get_gpt_decoder_block_spec(
     offset = get_transformer_layer_offset(config, vp_stage=vp_stage, pp_rank=pp_rank)
     local_layer_specs = layer_specs[offset : offset + num_layers_to_build]
 
-    if use_transformer_engine:
-        layer_norm_impl = TENorm
-    else:
-        layer_norm_impl = LNImpl
     # Block spec.
     if use_transformer_engine:
         layer_norm_impl = TENorm
