Skip to content

Commit 00d6e6a

Browse files
authored
[Module] Convert remaining nn.Module classes to Module protocol (#2565)
Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.14.0) (oldest at bottom): * __->__ #2565 **Why** It is hard to do the remaining changes, 1) state initialization and 2) sharding spec/local map spec change with some modules being nn.Module. The logic will need several fallback plans. This PR introduces a minimal change to convert all the nn.Module classes to Module. 1. Don't define Config for most of the Modules. 2. Make init_weights() a materialized method with empty logic. While 2 makes it not possible to detect if a Module accidentally forgets to define it, it is fine because we are going to change init_weights() in the next PR. **Summary** Convert 35 classes from plain nn.Module to the torchtitan Module protocol across core models (19) and experiments (16). Key design decisions: 1. Module without Config for non-configurable classes: If all constructor args come from the parent module (its Config or runtime), the class inherits Module without defining Config -- just a direct constructor. Config + build() is reserved for classes with independently user-configurable fields. 2. init_weights is a default no-op in base Module: Changed from abstractmethod + raise NotImplementedError to a default pass implementation. Subclasses with learnable parameters override it; all others inherit the no-op. This eliminates boilerplate empty init_weights methods. 3. ModuleContainer for namespace grouping: Added ModuleContainer(Module) in protocols/module.py to replace bare nn.Module() instances used as attribute namespace containers (e.g., self.mid = ModuleContainer() in Flux autoencoder). 4. Container types not converted: nn.ModuleDict/nn.ModuleList subclasses (e.g., SliceableModuleDict) are left as-is. 
Core models: GroupedExperts, TokenChoiceTopKRouter, TokenReorderer, GptOssGroupedExperts, VarlenAttentionWrapper, FlexAttentionWrapper, ScaledDotProductAttentionWrapper, QKNorm, SelfAttention, Modulation, AttnBlock, ResnetBlock, Downsample, Upsample, Encoder, Decoder, DiagonalGaussian, AutoEncoder, FluxEmbedder. Experiments: VLM siglip2 (5), Projector, RL/vLLM attention wrappers (3), RL/vLLM Qwen3 components (4), graph trainer (2), vLLM model wrapper.
1 parent 4801b18 commit 00d6e6a

File tree

16 files changed

+152
-134
lines changed

16 files changed

+152
-134
lines changed

tests/unit_tests/test_module.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,22 @@
1313

1414

1515
class TestModuleInitWeights(unittest.TestCase):
16-
"""Tests for Module.init_weights enforcement.
16+
"""Tests for Module.init_weights behavior.
1717
18-
Module.init_weights uses ``raise NotImplementedError`` because
19-
nn.Module's metaclass is plain ``type`` (not ABCMeta), so
20-
@abstractmethod alone does not prevent instantiation of subclasses
21-
that forget to implement init_weights.
18+
Module.init_weights provides a default no-op implementation so that
19+
subclasses without learnable parameters (or loaded from checkpoints)
20+
do not need to override it.
2221
"""
2322

24-
def test_missing_init_weights_raises_on_call(self):
25-
"""Subclass without init_weights gets NotImplementedError at call time."""
23+
def test_default_init_weights_is_noop(self):
24+
"""Subclass without init_weights gets the default no-op."""
2625

27-
class BadModule(Module):
26+
class SimpleModule(Module):
2827
def __init__(self):
2928
super().__init__()
3029

31-
m = BadModule()
32-
with self.assertRaises(NotImplementedError):
33-
m.init_weights()
30+
m = SimpleModule()
31+
m.init_weights() # should not raise
3432

3533
def test_init_weights_implemented(self):
3634
"""Subclass with init_weights works normally."""
@@ -99,16 +97,15 @@ def test_isinstance_checks(self):
9997
self.assertIsInstance(emb, nn.Module)
10098
self.assertIsInstance(emb, Module)
10199

102-
def test_missing_init_weights_raises(self):
103-
"""Diamond class without init_weights raises on call."""
100+
def test_default_init_weights_noop_diamond(self):
101+
"""Diamond class without init_weights gets the default no-op."""
104102

105-
class BadEmbedding(nn.Embedding, Module):
103+
class SimpleEmbedding(nn.Embedding, Module):
106104
def __init__(self, num_embeddings, embedding_dim):
107105
super().__init__(num_embeddings, embedding_dim)
108106

109-
emb = BadEmbedding(10, 4)
110-
with self.assertRaises(NotImplementedError):
111-
emb.init_weights()
107+
emb = SimpleEmbedding(10, 4)
108+
emb.init_weights() # should not raise
112109

113110
def test_module_hierarchy_is_flat(self):
114111
"""Diamond embedding adds no extra layer to the module tree."""

torchtitan/experiments/graph_trainer/graph_utils.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
end_with_pass,
2929
get_extra_fsdp_pg_name,
3030
)
31+
from torchtitan.protocols.module import Module
3132
from torchtitan.tools.logging import logger
3233

3334

@@ -182,7 +183,7 @@ def wrapper_fn(args, kwargs):
182183
return wrapper_fn
183184

184185

185-
class CompiledModule(torch.nn.Module):
186+
class CompiledModule(Module):
186187
def __init__(
187188
self,
188189
inner: torch.nn.Module,
@@ -225,6 +226,14 @@ def __delattr__(self, name: str) -> None:
225226
else:
226227
super().__delattr__(name)
227228

229+
def init_weights(self, **kwargs) -> None:
230+
# Explicitly delegate to inner model. Without this override,
231+
# Module.init_weights (a no-op) would be found via MRO before
232+
# the overwritten __getattr__ is triggered, silently skipping
233+
# weight initialization.
234+
# This is similar to state_dict, load_state_dict, ...
235+
self.inner.init_weights(**kwargs)
236+
228237
def state_dict(self, *args, **kwargs) -> Any:
229238
return self.inner.state_dict(*args, **kwargs)
230239

torchtitan/experiments/graph_trainer/simple_fsdp.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from torch.distributed.tensor._redistribute import redistribute_local_tensor
2424
from torch.distributed.tensor.placement_types import _StridedShard, Placement
2525

26+
from torchtitan.protocols.module import Module
27+
2628
_active_parametrization = True
2729

2830

@@ -150,7 +152,7 @@ def _register_parametrization(
150152
module.__class__ = module_cls
151153

152154

153-
class ReplicateComputation(torch.nn.Module):
155+
class ReplicateComputation(Module):
154156
def __init__(
155157
self,
156158
device_mesh: DeviceMesh,

torchtitan/experiments/rl/unified/models/attention.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@
1111
from torchtitan.experiments.rl.vllm_compat.models.attention import (
1212
VLLMCompatibleFlashAttention,
1313
)
14+
from torchtitan.protocols.module import Module
1415
from vllm.model_executor.layers.attention import Attention
1516

1617
logger = logging.getLogger(__name__)
1718

1819

19-
class VLLMAttention(torch.nn.Module):
20+
class VLLMAttention(Module):
2021
"""Adapter from TorchTitan tensor layout to ``vllm.Attention``.
2122
2223
vLLM's ``Attention`` layer manages KV-cache and paged attention internally,

torchtitan/experiments/rl/unified/models/vllm_wrapper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import torch
1717
import torch.distributed as dist
1818
import torch.distributed.checkpoint as dcp
19-
import torch.nn as nn
2019
from torch.distributed._tensor import DTensor, Replicate
2120
from torch.distributed.checkpoint.state_dict import (
2221
set_model_state_dict,
@@ -29,6 +28,7 @@
2928
replace_with_vllm_attention,
3029
)
3130
from torchtitan.protocols.model_spec import ModelSpec
31+
from torchtitan.protocols.module import Module
3232
from vllm.compilation.decorators import support_torch_compile
3333
from vllm.config import VllmConfig
3434
from vllm.logger import init_logger
@@ -118,7 +118,7 @@ def create_torchtitan_config_from_vllm_config(
118118
"positions": 0,
119119
}
120120
)
121-
class TorchTitanVLLMModelWrapper(nn.Module):
121+
class TorchTitanVLLMModelWrapper(Module):
122122
"""
123123
Generic vLLM-compatible model wrapper for TorchTitan models. Implemented
124124
required interface required by vLLM Engine.

torchtitan/experiments/rl/vllm_compat/models/attention.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010

1111
import torch
1212
from torch.distributed._tensor import DTensor
13+
14+
from torchtitan.protocols.module import Module
1315
from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
1416

1517

16-
class VLLMCompatibleFlashAttention(torch.nn.Module):
18+
class VLLMCompatibleFlashAttention(Module):
1719
"""Wrapper around FlashAttention as used by VLLM"""
1820

1921
def __init__(self) -> None:

torchtitan/experiments/rl/vllm_compat/models/qwen3/model_vllm_compat.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
# Import from main torchtitan
2424
from torchtitan.models.qwen3.model import Qwen3Model
2525
from torchtitan.protocols.model import BaseModel
26+
from torchtitan.protocols.module import Module
2627

2728
# Import from local experiment's models
2829
from ..attention import VLLMCompatibleFlashAttention
@@ -82,7 +83,7 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
8283
)
8384

8485

85-
class VLLMRMSNorm(nn.Module):
86+
class VLLMRMSNorm(Module):
8687
"""
8788
RMSNorm using vLLM's exact Triton kernel for bitwise determinism.
8889
Compatible with PyTorch's nn.RMSNorm interface but uses vLLM's implementation.
@@ -104,7 +105,7 @@ def reset_parameters(self):
104105
nn.init.ones_(self.weight)
105106

106107

107-
class FeedForwardVLLMCompat(nn.Module):
108+
class FeedForwardVLLMCompat(Module):
108109
"""
109110
FeedForward module compatible with vLLM implementation.
110111
Uses merged gate_up projection like vLLM.
@@ -132,13 +133,14 @@ def forward(self, x):
132133
output = self.down_proj(activated)
133134
return output
134135

135-
def init_weights(self, init_std: float):
136-
# Initialize like vLLM
136+
def init_weights(self, **kwargs) -> None:
137+
init_std = kwargs.get("init_std")
138+
assert init_std is not None
137139
nn.init.trunc_normal_(self.gate_up_proj.weight, mean=0.0, std=0.02)
138140
nn.init.trunc_normal_(self.down_proj.weight, mean=0.0, std=init_std)
139141

140142

141-
class Attention(nn.Module):
143+
class Attention(Module):
142144
"""
143145
Multi-head attention module compatible with vLLM.
144146
"""
@@ -172,7 +174,9 @@ def __init__(self, model_args: Qwen3Model.Config):
172174
# Always use vLLM compatible flash attention
173175
self.inner_attention = VLLMCompatibleFlashAttention()
174176

175-
def init_weights(self, init_std: float):
177+
def init_weights(self, **kwargs) -> None:
178+
init_std = kwargs.get("init_std")
179+
assert init_std is not None
176180
for linear in (self.wq, self.wk, self.wv):
177181
nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
178182
nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)
@@ -226,7 +230,7 @@ def forward(
226230
return self.wo(output)
227231

228232

229-
class TransformerBlock(nn.Module):
233+
class TransformerBlock(Module):
230234
"""
231235
TransformerBlock with vLLM-compatible FFN.
232236
"""
@@ -267,11 +271,11 @@ def forward(
267271

268272
return x
269273

270-
def init_weights(self, buffer_device: torch.device):
274+
def init_weights(self, **kwargs) -> None:
271275
for norm in (self.attention_norm, self.ffn_norm):
272276
norm.reset_parameters()
273-
self.attention.init_weights(self.weight_init_std)
274-
self.feed_forward.init_weights(self.weight_init_std)
277+
self.attention.init_weights(init_std=self.weight_init_std)
278+
self.feed_forward.init_weights(init_std=self.weight_init_std)
275279

276280

277281
class Qwen3VLLMCompatModel(BaseModel):
@@ -318,7 +322,7 @@ def init_weights(
318322
nn.init.normal_(self.tok_embeddings.weight)
319323
for layer in self.layers.values():
320324
if layer is not None:
321-
layer.init_weights(buffer_device)
325+
layer.init_weights(buffer_device=buffer_device)
322326
if self.norm is not None:
323327
self.norm.reset_parameters()
324328
final_out_std = self.config.dim**-0.5

torchtitan/experiments/vlm/model/model.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from torchtitan.components.tokenizer import BaseTokenizer
1515
from torchtitan.models.common.attention import AttentionMasksType
1616
from torchtitan.models.llama3 import Llama3Model as Llama3
17+
from torchtitan.protocols.module import Module
1718

1819
from .args import Siglip2Config, SpecialTokens
1920
from .siglip2 import VisionTransformer
@@ -34,7 +35,7 @@ def _scatter_img_tokens(h_BSD, tokens_BS, i_NLD, i_mask_NL, img_id):
3435
return h_BSD
3536

3637

37-
class Projector(nn.Module):
38+
class Projector(Module):
3839
"""Project the Encoder embedding to the LLM embedding."""
3940

4041
def __init__(self, in_dim: int, out_dim: int) -> None:
@@ -49,7 +50,7 @@ def forward(self, x_NLD: torch.Tensor):
4950
x_NLD = self.w2(x_NLD)
5051
return x_NLD
5152

52-
def init_weights(self):
53+
def init_weights(self, **kwargs) -> None:
5354
nn.init.xavier_uniform_(self.w1.weight)
5455
if self.w1.bias is not None:
5556
nn.init.zeros_(self.w1.bias)

torchtitan/experiments/vlm/model/siglip2.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
get_causal_mask_mod,
1919
get_document_mask_mod,
2020
)
21+
from torchtitan.protocols.module import Module
2122

2223
from .args import Siglip2Config
2324

@@ -71,7 +72,7 @@ def resize_positional_embeddings(
7172
return resized_embs_BLD
7273

7374

74-
class VisionEmbeddings(nn.Module):
75+
class VisionEmbeddings(Module):
7576
def __init__(self, args: Siglip2Config):
7677
super().__init__()
7778
self.patch_embedding = nn.Linear(
@@ -81,7 +82,7 @@ def __init__(self, args: Siglip2Config):
8182
self.position_embedding = nn.Embedding(args.n_pos_embs**2, args.dim)
8283
self.n_pos_embs = args.n_pos_embs
8384

84-
def init_weights(self):
85+
def init_weights(self, **kwargs) -> None:
8586
nn.init.trunc_normal_(self.patch_embedding.weight, mean=0.0, std=0.02)
8687
nn.init.normal_(self.position_embedding.weight)
8788

@@ -106,7 +107,7 @@ def forward(self, pixels_NLD: torch.Tensor, grid_hw: torch.Tensor) -> torch.Tens
106107
return embeddings
107108

108109

109-
class Attention(nn.Module):
110+
class Attention(Module):
110111
"""
111112
Multi-head attention module.
112113
@@ -151,12 +152,12 @@ def forward(self, x: torch.Tensor, attention_masks: AttentionMasksType):
151152

152153
return self.out_proj(output)
153154

154-
def init_weights(self):
155+
def init_weights(self, **kwargs) -> None:
155156
for linear in (self.q_proj, self.k_proj, self.v_proj, self.out_proj):
156157
nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
157158

158159

159-
class FeedForward(nn.Module):
160+
class FeedForward(Module):
160161
def __init__(self, args: Siglip2Config):
161162
super().__init__()
162163
self.fc1 = nn.Linear(args.dim, args.ffn_dim)
@@ -168,12 +169,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
168169
x = self.fc2(x)
169170
return x
170171

171-
def init_weights(self):
172+
def init_weights(self, **kwargs) -> None:
172173
nn.init.trunc_normal_(self.fc1.weight, mean=0.0, std=0.02)
173174
nn.init.trunc_normal_(self.fc2.weight, mean=0.0, std=0.02)
174175

175176

176-
class TransformerLayer(nn.Module):
177+
class TransformerLayer(Module):
177178
def __init__(self, args: Siglip2Config):
178179
super().__init__()
179180
self.layer_norm1 = nn.LayerNorm(args.dim, eps=args.layer_norm_eps)
@@ -188,14 +189,14 @@ def forward(
188189
x = x + self.mlp(self.layer_norm2(x))
189190
return x
190191

191-
def init_weights(self):
192+
def init_weights(self, **kwargs) -> None:
192193
self.layer_norm1.reset_parameters()
193194
self.layer_norm2.reset_parameters()
194195
self.self_attn.init_weights()
195196
self.mlp.init_weights()
196197

197198

198-
class VisionTransformer(nn.Module):
199+
class VisionTransformer(Module):
199200
def __init__(self, args: Siglip2Config):
200201
super().__init__()
201202
self.args = args
@@ -251,7 +252,7 @@ def forward(
251252

252253
return h
253254

254-
def init_weights(self):
255+
def init_weights(self, **kwargs) -> None:
255256
self.embeddings.init_weights()
256257
for layer in self.layers.values():
257258
layer.init_weights()

0 commit comments

Comments
 (0)