Skip to content

Commit a230cb5

Browse files
feginmori360
authored and committed
[Module][2/2] Convert remaining nn.Module classes to Module protocol -- (pytorch#2608)
Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.14.0) (oldest at bottom): * __->__ pytorch#2608 **Motivation** The previous PR, pytorch#2565, doesn't fully convert all the nn.Module classes to Module. This PR converts the leftovers. **Design Summary** 1. Introduce create_class_module so that users can easily create a Module class for an nn.Module class. For example, users can do ` Conv2d = create_class_module(nn.Conv2d)` to get a new Conv2d Module class that inherits from both nn.Conv2d and Module. 2. Introduce a verify_module_protocol method on BaseModel to check that all sub-nn.Modules are Module instances. 3. Create container Module classes for nn.ModuleDict, nn.ModuleList, and nn.Sequential.
1 parent 43f1283 commit a230cb5

File tree

19 files changed

+434
-127
lines changed

19 files changed

+434
-127
lines changed

tests/unit_tests/test_activation_checkpoint.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,33 @@
77
import unittest
88

99
import torch
10-
import torch.nn as nn
1110

1211
from torch.utils.flop_counter import FlopCounterMode
1312
from torchtitan.config import ActivationCheckpointConfig as ACConfig
1413
from torchtitan.distributed.activation_checkpoint import apply_ac
14+
from torchtitan.models.common.linear import Linear
15+
from torchtitan.protocols.module import Module, ModuleDict
1516

1617

17-
class ToyModule(nn.Module):
18+
class ToyModule(Module):
1819
def __init__(self):
1920
super().__init__()
20-
self.layers = nn.ModuleDict({"0": TransformerBlock()})
21+
self.layers = ModuleDict({"0": TransformerBlock()})
2122

2223
def forward(self, x):
2324
return self.layers["0"](x)
2425

2526

26-
class TransformerBlock(nn.Module):
27+
class TransformerBlock(Module):
2728
def __init__(self):
2829
super().__init__()
29-
self.moe = nn.Module()
30-
self.moe.router = nn.Module()
31-
self.moe.router.gate = nn.Linear(512, 512, bias=False)
32-
self.attention = nn.Module()
33-
self.attention.wq = nn.Linear(512, 512, bias=False)
34-
self.output = nn.Linear(512, 1024, bias=False)
30+
linear_config = Linear.Config(bias=False)
31+
self.moe = Module()
32+
self.moe.router = Module()
33+
self.moe.router.gate = linear_config.build(in_features=512, out_features=512)
34+
self.attention = Module()
35+
self.attention.wq = linear_config.build(in_features=512, out_features=512)
36+
self.output = linear_config.build(in_features=512, out_features=1024)
3537

3638
def forward(self, x):
3739
gate_out = self.moe.router.gate(x)

tests/unit_tests/test_compile_moe.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,19 @@
77
import unittest
88

99
import torch
10-
import torch.nn as nn
1110

1211
from torchtitan.config import CompileConfig
12+
from torchtitan.models.common.linear import Linear
1313
from torchtitan.models.llama4.parallelize import apply_compile
14+
from torchtitan.protocols.module import Module, ModuleDict
1415

1516

16-
class TransformerBlock(nn.Module):
17+
class TransformerBlock(Module):
1718
def __init__(self, dim=512):
1819
super().__init__()
19-
self.attention = nn.Linear(dim, dim, bias=False)
20-
self.mlp = nn.Linear(dim, dim, bias=False)
20+
linear_config = Linear.Config(bias=False)
21+
self.attention = linear_config.build(in_features=dim, out_features=dim)
22+
self.mlp = linear_config.build(in_features=dim, out_features=dim)
2123
self.moe_enabled = False
2224

2325
def forward(self, x):
@@ -26,10 +28,10 @@ def forward(self, x):
2628
return x
2729

2830

29-
class TinyModel(nn.Module):
31+
class TinyModel(Module):
3032
def __init__(self, num_layers=2, dim=512):
3133
super().__init__()
32-
self.layers = nn.ModuleDict(
34+
self.layers = ModuleDict(
3335
{str(i): TransformerBlock(dim) for i in range(num_layers)}
3436
)
3537

tests/unit_tests/test_module.py

Lines changed: 177 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import unittest
8+
from dataclasses import dataclass
89

910
import torch
1011
import torch.nn as nn
1112

12-
from torchtitan.protocols.module import Module
13+
from torchtitan.models.common.linear import Linear
14+
from torchtitan.protocols.module import Module, ModuleDict, ModuleList, Sequential
1315

1416

1517
class TestModuleInitWeights(unittest.TestCase):
@@ -36,7 +38,9 @@ def test_init_weights_implemented(self):
3638
class GoodModule(Module):
3739
def __init__(self):
3840
super().__init__()
39-
self.linear = nn.Linear(4, 4)
41+
self.linear = Linear.Config(bias=True).build(
42+
in_features=4, out_features=4
43+
)
4044

4145
def init_weights(self, **kwargs):
4246
nn.init.zeros_(self.linear.weight)
@@ -110,11 +114,13 @@ def __init__(self, num_embeddings, embedding_dim):
110114
def test_module_hierarchy_is_flat(self):
111115
"""Diamond embedding adds no extra layer to the module tree."""
112116

113-
class Model(nn.Module):
117+
class Model(Module):
114118
def __init__(self):
115119
super().__init__()
116120
self.embed = TestDiamondInheritance.TestEmbedding(100, 32)
117-
self.linear = nn.Linear(32, 16)
121+
self.linear = Linear.Config(bias=True).build(
122+
in_features=32, out_features=16
123+
)
118124

119125
model = Model()
120126
param_names = {name for name, _ in model.named_parameters()}
@@ -138,5 +144,172 @@ def counting_init(self, *args, **kwargs):
138144
nn.Module.__init__ = orig_init
139145

140146

147+
class TestFromNnModule(unittest.TestCase):
148+
"""Tests for Module.from_nn_module utility."""
149+
150+
def test_is_subclass(self):
151+
"""Created class is subclass of both original and Module."""
152+
Conv2d = Module.from_nn_module(nn.Conv2d)
153+
self.assertTrue(issubclass(Conv2d, nn.Conv2d))
154+
self.assertTrue(issubclass(Conv2d, Module))
155+
156+
def test_isinstance(self):
157+
"""Instance satisfies isinstance checks for both original and Module."""
158+
Conv2d = Module.from_nn_module(nn.Conv2d)
159+
m = Conv2d(3, 16, 3)
160+
self.assertIsInstance(m, nn.Conv2d)
161+
self.assertIsInstance(m, Module)
162+
163+
def test_init_weights_calls_reset_parameters(self):
164+
"""For classes with reset_parameters, init_weights delegates to it."""
165+
LayerNorm = Module.from_nn_module(nn.LayerNorm)
166+
m = LayerNorm(32)
167+
# Manually set weight to zeros, then init_weights should reset
168+
nn.init.zeros_(m.weight)
169+
m.init_weights()
170+
# After reset_parameters, weight should be ones for LayerNorm
171+
self.assertTrue(torch.allclose(m.weight, torch.ones(32)))
172+
173+
def test_init_weights_noop_for_parameterless(self):
174+
"""For classes without reset_parameters, init_weights is a no-op."""
175+
GELU = Module.from_nn_module(nn.GELU)
176+
m = GELU()
177+
m.init_weights() # should not raise
178+
179+
def test_cache(self):
180+
"""Repeated calls return the same class object."""
181+
cls1 = Module.from_nn_module(nn.Conv2d)
182+
cls2 = Module.from_nn_module(nn.Conv2d)
183+
self.assertIs(cls1, cls2)
184+
185+
def test_forward_unchanged(self):
186+
"""Forward output is identical to original class."""
187+
LayerNorm = Module.from_nn_module(nn.LayerNorm)
188+
torch.manual_seed(42)
189+
orig = nn.LayerNorm(16)
190+
wrapped = LayerNorm(16)
191+
# Copy weights
192+
wrapped.load_state_dict(orig.state_dict())
193+
x = torch.randn(2, 16)
194+
torch.testing.assert_close(orig(x), wrapped(x))
195+
196+
def test_state_dict_unchanged(self):
197+
"""state_dict keys and values match the original class."""
198+
Conv2d = Module.from_nn_module(nn.Conv2d)
199+
orig = nn.Conv2d(3, 16, 3)
200+
wrapped = Conv2d(3, 16, 3)
201+
wrapped.load_state_dict(orig.state_dict())
202+
for key in orig.state_dict():
203+
self.assertIn(key, wrapped.state_dict())
204+
torch.testing.assert_close(
205+
orig.state_dict()[key], wrapped.state_dict()[key]
206+
)
207+
208+
209+
class TestContainerInitWeights(unittest.TestCase):
210+
"""Tests for ModuleList, ModuleDict, Sequential init_weights."""
211+
212+
def test_module_list_init_weights(self):
213+
"""ModuleList.init_weights calls init_weights on each child."""
214+
LayerNorm = Module.from_nn_module(nn.LayerNorm)
215+
norms = ModuleList([LayerNorm(8) for _ in range(3)])
216+
for n in norms:
217+
nn.init.zeros_(n.weight)
218+
norms.init_weights()
219+
for n in norms:
220+
self.assertTrue(torch.allclose(n.weight, torch.ones(8)))
221+
222+
def test_module_dict_init_weights(self):
223+
"""ModuleDict.init_weights calls init_weights on each child."""
224+
LayerNorm = Module.from_nn_module(nn.LayerNorm)
225+
norms = ModuleDict({"a": LayerNorm(8), "b": LayerNorm(8)})
226+
for n in norms.values():
227+
nn.init.zeros_(n.weight)
228+
norms.init_weights()
229+
for n in norms.values():
230+
self.assertTrue(torch.allclose(n.weight, torch.ones(8)))
231+
232+
def test_sequential_init_weights(self):
233+
"""Sequential.init_weights calls init_weights on each child."""
234+
linear = Linear.Config(bias=False).build(in_features=4, out_features=4)
235+
GELU = Module.from_nn_module(nn.GELU)
236+
seq = Sequential(linear, GELU())
237+
seq.init_weights() # should not raise
238+
239+
def test_containers_are_module(self):
240+
"""Container instances satisfy Module protocol."""
241+
self.assertIsInstance(ModuleList(), Module)
242+
self.assertIsInstance(ModuleDict(), Module)
243+
self.assertIsInstance(Sequential(), Module)
244+
245+
246+
class TestVerifyModuleProtocol(unittest.TestCase):
247+
"""Tests for BaseModel.verify_module_protocol."""
248+
249+
def test_passes_for_all_module(self):
250+
"""No error when all submodules are Module instances."""
251+
from torchtitan.protocols.model import BaseModel
252+
253+
class GoodModel(BaseModel):
254+
@dataclass(kw_only=True, slots=True)
255+
class Config(BaseModel.Config):
256+
def update_from_config(self, *, trainer_config, **kwargs):
257+
pass
258+
259+
def get_nparams_and_flops(self, model, seq_len):
260+
return (0, 0)
261+
262+
def __init__(self):
263+
super().__init__()
264+
self.linear = Linear.Config().build(in_features=4, out_features=4)
265+
266+
model = GoodModel()
267+
model.verify_module_protocol() # should not raise
268+
269+
def test_default_raises_for_plain_nn_module(self):
270+
"""Default verify_module_protocol raises when plain nn.Module child exists."""
271+
from torchtitan.protocols.model import BaseModel
272+
273+
class BadModel(BaseModel):
274+
@dataclass(kw_only=True, slots=True)
275+
class Config(BaseModel.Config):
276+
def update_from_config(self, *, trainer_config, **kwargs):
277+
pass
278+
279+
def get_nparams_and_flops(self, model, seq_len):
280+
return (0, 0)
281+
282+
def __init__(self):
283+
super().__init__()
284+
self.plain = nn.Linear(4, 4)
285+
286+
model = BadModel()
287+
with self.assertRaises(RuntimeError):
288+
model.verify_module_protocol()
289+
290+
def test_override_skips_verification(self):
291+
"""Subclass can override verify_module_protocol to skip verification."""
292+
from torchtitan.protocols.model import BaseModel
293+
294+
class ThirdPartyModel(BaseModel):
295+
@dataclass(kw_only=True, slots=True)
296+
class Config(BaseModel.Config):
297+
def update_from_config(self, *, trainer_config, **kwargs):
298+
pass
299+
300+
def get_nparams_and_flops(self, model, seq_len):
301+
return (0, 0)
302+
303+
def __init__(self):
304+
super().__init__()
305+
self.plain = nn.Linear(4, 4) # third-party module
306+
307+
def verify_module_protocol(self) -> None:
308+
pass # skip for third-party internals
309+
310+
model = ThirdPartyModel()
311+
model.verify_module_protocol() # should not raise
312+
313+
141314
if __name__ == "__main__":
142315
unittest.main()

tests/unit_tests/test_train_spec.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from torchtitan.components.loss import build_cross_entropy_loss
1313
from torchtitan.components.optimizer import OptimizersContainer
1414
from torchtitan.distributed.parallel_dims import ParallelDims
15+
from torchtitan.models.common.linear import Linear
1516
from torchtitan.models.llama3 import model_registry, parallelize_llama
1617
from torchtitan.protocols import BaseModel
1718
from torchtitan.protocols.model_spec import ModelSpec
@@ -30,13 +31,15 @@ def get_nparams_and_flops(self, model, seq_len):
3031

3132
def __init__(self, config: Config):
3233
super().__init__()
33-
self.linear = nn.Linear(config.hidden, config.hidden)
34+
self.linear = Linear.Config().build(
35+
in_features=config.hidden, out_features=config.hidden
36+
)
3437

3538
def forward(self, x: torch.Tensor) -> torch.Tensor:
3639
return self.linear(x)
3740

38-
def init_weights(self, buffer_device: torch.device | None = None) -> None:
39-
nn.init.normal_(self.linear.weight, mean=0.0, std=0.02)
41+
def init_weights(self, buffer_device: torch.device | None = None, **kwargs) -> None:
42+
self.linear.init_weights()
4043

4144

4245
def fake_post_optimizer_build_fn(

torchtitan/distributed/pipeline_parallel.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from torchtitan.protocols.model import BaseModel
3636
from torchtitan.protocols.model_converter import ModelConvertersContainer
3737
from torchtitan.protocols.model_spec import ParallelizeFunction
38+
from torchtitan.protocols.module import ModuleDict, ModuleList
3839
from torchtitan.tools.logging import logger
3940

4041
__all__ = [
@@ -437,7 +438,7 @@ def _build_stage_from_modules(
437438
indices_to_keep = {
438439
int(idx) for idx in layers_to_keep if idx.isdigit()
439440
}
440-
new_layers = nn.ModuleList(
441+
new_layers = ModuleList(
441442
[
442443
layer
443444
for i, layer in enumerate(module_value)
@@ -448,9 +449,9 @@ def _build_stage_from_modules(
448449
else:
449450
# No layers from this structure needed, set to empty structure
450451
if isinstance(module_value, nn.ModuleDict):
451-
setattr(model, module_name, nn.ModuleDict())
452+
setattr(model, module_name, ModuleDict())
452453
elif isinstance(module_value, nn.ModuleList):
453-
setattr(model, module_name, nn.ModuleList())
454+
setattr(model, module_name, ModuleList())
454455
# Handle simple module attributes (e.g., "linear", "norm")
455456
elif module_name not in modules_to_keep:
456457
# Replace with None

torchtitan/experiments/ft/diloco/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from torchtitan.distributed.pipeline_parallel import generate_llm_fqn_per_model_part
1010
from torchtitan.experiments.ft.config import FaultTolerance as FTConfig
11+
from torchtitan.protocols.module import ModuleList
1112
from torchtitan.tools.logging import logger
1213

1314

@@ -72,7 +73,7 @@ def _build_fragment_from_modules(
7273
indices_to_keep = {
7374
int(idx) for idx in layers_to_keep if idx.isdigit()
7475
}
75-
new_layers = nn.ModuleList(
76+
new_layers = ModuleList(
7677
[
7778
layer
7879
for i, layer in enumerate(module_value)

torchtitan/experiments/ft/trainer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ def __init__(self, config: Config):
129129
)
130130
model_converters.convert(model)
131131

132+
# Verify all submodules satisfy the Module protocol
133+
model.verify_module_protocol()
134+
132135
# metrics logging (FT addition: ft_enable, ft_replica_id)
133136
self.metrics_processor = config.metrics.build(
134137
parallel_dims=parallel_dims,

0 commit comments

Comments
 (0)