     get_tensor_model_parallel_world_size,
 )
 from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.tensor_parallel.mappings import all_gather_last_dim_from_tensor_parallel_region
+from megatron.core.tensor_parallel.mappings import (
+    all_gather_last_dim_from_tensor_parallel_region,
+)
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
@@ -168,14 +170,49 @@ def __call__(
     ) -> LinearLayer: ...
 
 
+class CoreAttention(Protocol):
+    """Protocol for core_attention modules."""
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        attention_mask: Optional[Tensor],
+        /,
+        *,
+        attn_mask_type: AttnMaskType,
+        attention_bias: Optional[Tensor],
+        packed_seq_params: Optional[PackedSeqParams],
+    ) -> Tensor:
+        """Applies dot product attention."""
+        ...
+
+
+class CoreAttentionBuilder(Protocol):
+    """Protocol for building core_attention layers."""
+
+    def __call__(
+        self,
+        *,
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        cp_comm_type: Optional[str],
+        softmax_scale: Optional[float],
+        pg_collection: Optional[ProcessGroupCollection],
+    ) -> CoreAttention: ...
+
+
 @dataclass
 class SelfAttentionSubmodules:
     """
     Configuration class for specifying the submodules of a self-attention.
     """
 
     linear_qkv: LinearQkvBuilder
-    core_attention: Union[ModuleSpec, type] = None
+    core_attention: CoreAttentionBuilder
     linear_proj: Union[ModuleSpec, type] = None
     q_layernorm: Union[ModuleSpec, type] = None
     k_layernorm: Union[ModuleSpec, type] = None
@@ -189,7 +226,7 @@ class CrossAttentionSubmodules:
 
     linear_q: LinearLayerBuilder
     linear_kv: LinearLayerBuilder
-    core_attention: Union[ModuleSpec, type] = None
+    core_attention: CoreAttentionBuilder
     linear_proj: Union[ModuleSpec, type] = None
 
 
@@ -273,8 +310,7 @@ def __init__(
             tmp_config.num_query_groups = world_size
         else:
             tmp_config = self.config
-        self.core_attention = build_module(
-            submodules.core_attention,
+        self.core_attention = submodules.core_attention(
             config=tmp_config,
             layer_number=self.layer_number,
             attn_mask_type=self.attn_mask_type,
@@ -342,7 +378,7 @@ def custom_forward(*inputs):
             attention_mask = inputs[3]
             attn_mask_type = inputs[5]
             attn_mask_type = AttnMaskType(attn_mask_type.item())
-            output_ = self.core_attention(
+            output_ = apply_module(self.core_attention)(
                 query,
                 key,
                 value,
@@ -381,7 +417,9 @@ def _get_pp_layer_offset_for_inference(self):
         ), "Virtual pipeline parallelism is not supported for inference"
 
         # Import here to avoid circular imports
-        from megatron.core.transformer.transformer_layer import get_transformer_layer_offset
+        from megatron.core.transformer.transformer_layer import (
+            get_transformer_layer_offset,
+        )
 
         return get_transformer_layer_offset(
             self.config, vp_stage=None, pp_rank=get_pg_rank(self.pg_collection.pp)
@@ -400,7 +438,7 @@ def _adjust_key_value_for_inference(
         sequence_len_offset: Optional[int] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
-    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, AttnMaskType, Tensor]:
         """
         Saves the generated key and value tensors to the end of the buffers in inference_context.
         Returns the full size keys and values from the provided inference_context, as well as
@@ -1017,7 +1055,7 @@ def forward(
         else:
             if inference_context is None or inference_context.is_static_batching():
                 # Static batching attention kernel.
-                core_attn_out = self.core_attention(
+                core_attn_out = apply_module(self.core_attention)(
                     query,
                     key,
                     value,
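For context on how the new protocols are meant to be consumed: the hunks above replace the ModuleSpec-based core_attention field with a CoreAttentionBuilder callable that Attention.__init__ invokes directly. Below is a minimal sketch, not part of this diff, of a builder satisfying the declared signatures; SimpleCoreAttention and build_simple_core_attention are hypothetical names, and the forward body is a layout-naive placeholder rather than Megatron's real core attention.

# Hypothetical sketch of a CoreAttentionBuilder-compatible builder.
# SimpleCoreAttention / build_simple_core_attention are illustrative only.
from typing import Optional

import torch
from torch import Tensor

from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.process_groups_config import ProcessGroupCollection
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.transformer_config import TransformerConfig


class SimpleCoreAttention(torch.nn.Module):
    """Toy module whose forward matches the CoreAttention protocol."""

    def __init__(self, config: TransformerConfig, layer_number: int) -> None:
        super().__init__()
        self.config = config
        self.layer_number = layer_number

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        attention_mask: Optional[Tensor],
        /,
        *,
        attn_mask_type: AttnMaskType,
        attention_bias: Optional[Tensor],
        packed_seq_params: Optional[PackedSeqParams],
    ) -> Tensor:
        # Placeholder: ignores the mask/bias/packed-seq arguments and does not
        # handle Megatron's [sq, b, np, hn] layout; a real implementation would.
        return torch.nn.functional.scaled_dot_product_attention(query, key, value)


def build_simple_core_attention(
    *,
    config: TransformerConfig,
    layer_number: int,
    attn_mask_type: AttnMaskType,
    attention_type: str,
    cp_comm_type: Optional[str],
    softmax_scale: Optional[float],
    pg_collection: Optional[ProcessGroupCollection],
) -> SimpleCoreAttention:
    """Keyword-only callable with the signature required by CoreAttentionBuilder."""
    return SimpleCoreAttention(config=config, layer_number=layer_number)

With a builder like this, SelfAttentionSubmodules(linear_qkv=..., core_attention=build_simple_core_attention, ...) type-checks against the new annotation, and the __init__ change above can call submodules.core_attention(config=..., layer_number=..., ...) directly instead of routing through build_module.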