Skip to content

Commit a3b640d

Browse files
committed
Merge remote-tracking branch 'upstream/main' into fix-pos-id
2 parents 84a110c + 0187d5f commit a3b640d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

76 files changed

+2270
-1419
lines changed

.ci/docker/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ datasets >= 3.6.0
33
tensorboard
44
wandb
55
fsspec
6-
tyro
6+
tyro >= 1.0.5
77
tokenizers >= 0.15.0
88
safetensors
99
einops

.github/workflows/integration_test_8gpu_features.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ jobs:
6666
fi
6767
python -m pip install --force-reinstall --pre \
6868
"${TORCH_SPEC}" --index-url ${{ matrix.index-url }}
69+
if [[ "${{ matrix.gpu-arch-type }}" == "cuda" ]]; then
70+
python -m pip install --pre torchcomms --index-url ${{ matrix.index-url }}
71+
fi
6972
end=$(date +%s)
7073
echo "pip install torch took $((end - start)) seconds"
7174

.github/workflows/integration_test_8gpu_torchcomms.yaml

Lines changed: 0 additions & 54 deletions
This file was deleted.

tests/integration_tests/features.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,34 @@ def build_features_test_list() -> list[OverrideDefinitions]:
559559
"Float8 emulation test",
560560
"float8_emulation",
561561
),
562+
OverrideDefinitions(
563+
[
564+
[
565+
"--comm.mode torchcomms",
566+
"--parallelism.context_parallel_degree 2",
567+
"--parallelism.pipeline_parallel_degree 2",
568+
"--compile.enable",
569+
],
570+
],
571+
"FSDP+CP+PP+compile with torchcomms",
572+
"torchcomms_3d_dp+cp+pp+compile",
573+
ngpu=8,
574+
skip_rocm_test=True,
575+
),
576+
OverrideDefinitions(
577+
[
578+
[
579+
"--comm.mode torchcomms",
580+
"--parallelism.tensor_parallel_degree 2",
581+
"--parallelism.pipeline_parallel_degree 2",
582+
"--compile.enable",
583+
],
584+
],
585+
"FSDP+TP+PP+compile with torchcomms",
586+
"torchcomms_3d_dp+tp+pp+compile",
587+
ngpu=8,
588+
skip_rocm_test=True,
589+
),
562590
]
563591

564592
return integration_tests_flavors

tests/integration_tests/flux.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,15 @@ def run_single_test(test_flavor: OverrideDefinitions, output_dir: str):
5353
dump_folder_arg = f"--dump_folder {output_dir}/{test_name}"
5454

5555
# Random init encoder for offline testing
56-
random_init_encoder_arg = "--encoder.test_mode --dataloader.encoder.test_mode"
56+
random_init_arg = "--tokenizer.test_mode --encoder.random_init"
5757
clip_encoder_version_arg = (
5858
"--encoder.clip_encoder tests/assets/flux_test_encoders/clip-vit-large-patch14/"
5959
)
6060
t5_encoder_version_arg = (
6161
"--encoder.t5_encoder tests/assets/flux_test_encoders/t5-v1_1-xxl/"
6262
)
63+
t5_tokenizer_path_arg = "--tokenizer.t5_tokenizer_path tests/assets/tokenizer"
64+
clip_tokenizer_path_arg = "--tokenizer.clip_tokenizer_path tests/assets/tokenizer"
6365
hf_assets_path_arg = "--hf_assets_path tests/assets/tokenizer"
6466

6567
all_ranks = ",".join(map(str, range(test_flavor.ngpu)))
@@ -78,9 +80,11 @@ def run_single_test(test_flavor: OverrideDefinitions, output_dir: str):
7880
)
7981

8082
cmd += " " + dump_folder_arg
81-
cmd += " " + random_init_encoder_arg
83+
cmd += " " + random_init_arg
8284
cmd += " " + clip_encoder_version_arg
8385
cmd += " " + t5_encoder_version_arg
86+
cmd += " " + t5_tokenizer_path_arg
87+
cmd += " " + clip_tokenizer_path_arg
8488
cmd += " " + hf_assets_path_arg
8589
if override_arg:
8690
cmd += " " + " ".join(override_arg)

tests/unit_tests/test_configurable.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,30 @@ def __init__(self, config: Config):
200200
self.assertEqual(d2["inner"]["a"], 1)
201201
self.assertEqual(d2["inner"]["b"], 256)
202202

203+
def test_repr_with_unset_init_false(self):
204+
"""repr() must not crash when field(init=False) slots are unset."""
205+
cfg = self.NewStyleComponent.Config(x=10)
206+
# Before build: dim and hidden are unset
207+
r = repr(cfg)
208+
self.assertIn("x=10", r)
209+
self.assertIn("dim=<UNSET>", r)
210+
self.assertIn("hidden=<UNSET>", r)
211+
212+
# After build: all fields set
213+
obj = cfg.build(dim=64, hidden=128)
214+
r2 = repr(obj.config)
215+
self.assertIn("x=10", r2)
216+
self.assertIn("dim=64", r2)
217+
self.assertIn("hidden=128", r2)
218+
self.assertNotIn("UNSET", r2)
219+
220+
def test_repr_no_init_false_fields(self):
221+
"""repr() works normally when there are no field(init=False) fields."""
222+
cfg = self.NoKwargsComponent.Config(x=42)
223+
r = repr(cfg)
224+
self.assertIn("x=42", r)
225+
self.assertNotIn("UNSET", r)
226+
203227
def test_init_false_with_inheritance(self):
204228
"""Child config can redeclare field with default."""
205229

tests/unit_tests/test_dataset_flux.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,21 +65,29 @@ def test_load_dataset(self):
6565
str(256),
6666
"--dataloader.dataset",
6767
dataset_name,
68-
"--dataloader.classifier_free_guidance_prob",
68+
"--dataloader.prompt_dropout_prob",
6969
"0.447",
70-
"--dataloader.encoder.test_mode",
71-
"--encoder.test_mode",
70+
"--tokenizer.test_mode",
71+
"--tokenizer.t5_tokenizer_path",
72+
"tests/assets/tokenizer",
73+
"--tokenizer.clip_tokenizer_path",
74+
"tests/assets/tokenizer",
75+
"--encoder.random_init",
7276
"--encoder.t5_encoder",
7377
"tests/assets/flux_test_encoders/t5-v1_1-xxl",
7478
"--encoder.clip_encoder",
7579
"tests/assets/flux_test_encoders/clip-vit-large-patch14",
7680
]
7781
)
7882

83+
# Build the tokenizer container from config
84+
tokenizer = config.tokenizer.build(tokenizer_path=config.hf_assets_path)
85+
7986
dl = config.dataloader.build(
8087
dp_world_size=world_size,
8188
dp_rank=rank,
8289
local_batch_size=batch_size,
90+
tokenizer=tokenizer,
8391
)
8492

8593
it = iter(dl)
@@ -91,11 +99,11 @@ def test_load_dataset(self):
9199
len(input_data) == 3
92100
) # (clip_encodings, t5_encodings, prompt)
93101
assert labels.shape == (batch_size, 3, 256, 256)
94-
assert input_data["clip_tokens"].shape == (
102+
assert input_data["clip"].shape == (
95103
batch_size,
96104
77,
97105
)
98-
assert input_data["t5_tokens"].shape == (
106+
assert input_data["t5"].shape == (
99107
batch_size,
100108
256,
101109
)
@@ -107,6 +115,7 @@ def test_load_dataset(self):
107115
dp_world_size=world_size,
108116
dp_rank=rank,
109117
local_batch_size=batch_size,
118+
tokenizer=tokenizer,
110119
)
111120
dl_resumed.load_state_dict(state)
112121
it_resumed = iter(dl_resumed)
@@ -119,10 +128,6 @@ def test_load_dataset(self):
119128
torch.manual_seed(i)
120129
input_ids, labels = next(it_resumed)
121130

122-
assert torch.equal(
123-
input_ids["clip_tokens"], expected_input_ids["clip_tokens"]
124-
)
125-
assert torch.equal(
126-
input_ids["t5_tokens"], expected_input_ids["t5_tokens"]
127-
)
131+
assert torch.equal(input_ids["clip"], expected_input_ids["clip"])
132+
assert torch.equal(input_ids["t5"], expected_input_ids["t5"])
128133
assert torch.equal(labels, expected_labels)

tests/unit_tests/test_module.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,22 @@
1313

1414

1515
class TestModuleInitWeights(unittest.TestCase):
16-
"""Tests for Module.init_weights enforcement.
16+
"""Tests for Module.init_weights behavior.
1717
18-
Module.init_weights uses ``raise NotImplementedError`` because
19-
nn.Module's metaclass is plain ``type`` (not ABCMeta), so
20-
@abstractmethod alone does not prevent instantiation of subclasses
21-
that forget to implement init_weights.
18+
Module.init_weights provides a default no-op implementation so that
19+
subclasses without learnable parameters (or loaded from checkpoints)
20+
do not need to override it.
2221
"""
2322

24-
def test_missing_init_weights_raises_on_call(self):
25-
"""Subclass without init_weights gets NotImplementedError at call time."""
23+
def test_default_init_weights_is_noop(self):
24+
"""Subclass without init_weights gets the default no-op."""
2625

27-
class BadModule(Module):
26+
class SimpleModule(Module):
2827
def __init__(self):
2928
super().__init__()
3029

31-
m = BadModule()
32-
with self.assertRaises(NotImplementedError):
33-
m.init_weights()
30+
m = SimpleModule()
31+
m.init_weights() # should not raise
3432

3533
def test_init_weights_implemented(self):
3634
"""Subclass with init_weights works normally."""
@@ -99,16 +97,15 @@ def test_isinstance_checks(self):
9997
self.assertIsInstance(emb, nn.Module)
10098
self.assertIsInstance(emb, Module)
10199

102-
def test_missing_init_weights_raises(self):
103-
"""Diamond class without init_weights raises on call."""
100+
def test_default_init_weights_noop_diamond(self):
101+
"""Diamond class without init_weights gets the default no-op."""
104102

105-
class BadEmbedding(nn.Embedding, Module):
103+
class SimpleEmbedding(nn.Embedding, Module):
106104
def __init__(self, num_embeddings, embedding_dim):
107105
super().__init__(num_embeddings, embedding_dim)
108106

109-
emb = BadEmbedding(10, 4)
110-
with self.assertRaises(NotImplementedError):
111-
emb.init_weights()
107+
emb = SimpleEmbedding(10, 4)
108+
emb.init_weights() # should not raise
112109

113110
def test_module_hierarchy_is_flat(self):
114111
"""Diamond embedding adds no extra layer to the module tree."""

tests/unit_tests/test_rope.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import unittest
8+
9+
import torch
10+
11+
from torchtitan.models.common.rope import apply_rotary_emb_cos_sin
12+
13+
14+
class TestApplyRotaryEmbCosSin(unittest.TestCase):
15+
def setUp(self):
16+
torch.manual_seed(42)
17+
self.bsz = 2
18+
self.seqlen = 16
19+
self.n_heads = 4
20+
self.head_dim = 64
21+
self.xq = torch.randn(
22+
self.bsz, self.seqlen, self.n_heads, self.head_dim, dtype=torch.bfloat16
23+
)
24+
self.xk = torch.randn(
25+
self.bsz, self.seqlen, self.n_heads, self.head_dim, dtype=torch.bfloat16
26+
)
27+
self.rope_cache = torch.randn(
28+
self.seqlen, self.head_dim * 2, dtype=torch.float32
29+
)
30+
31+
def test_output_dtype_matches_input(self):
32+
xq_out, xk_out = apply_rotary_emb_cos_sin(self.xq, self.xk, self.rope_cache)
33+
self.assertEqual(xq_out.dtype, self.xq.dtype)
34+
self.assertEqual(xk_out.dtype, self.xk.dtype)
35+
36+
def test_output_shape_matches_input(self):
37+
xq_out, xk_out = apply_rotary_emb_cos_sin(self.xq, self.xk, self.rope_cache)
38+
self.assertEqual(xq_out.shape, self.xq.shape)
39+
self.assertEqual(xk_out.shape, self.xk.shape)
40+
41+
def test_computes_in_fp32(self):
42+
"""Output must match a reference computed entirely in float32.
43+
44+
Ensures inductor cannot fuse away the fp32 upcast when compiling
45+
adjacent ops (e.g. q_norm/k_norm) with the RoPE computation.
46+
"""
47+
xq_out, xk_out = apply_rotary_emb_cos_sin(self.xq, self.xk, self.rope_cache)
48+
49+
cos = self.rope_cache[..., : self.head_dim].unsqueeze(0).unsqueeze(2)
50+
sin = self.rope_cache[..., self.head_dim :].unsqueeze(0).unsqueeze(2)
51+
52+
def rotate_half(x):
53+
half = x.shape[-1] // 2
54+
return torch.cat([-x[..., half:], x[..., :half]], dim=-1)
55+
56+
xq_ref = (
57+
(self.xq.float() * cos) + (rotate_half(self.xq.float()) * sin)
58+
).bfloat16()
59+
xk_ref = (
60+
(self.xk.float() * cos) + (rotate_half(self.xk.float()) * sin)
61+
).bfloat16()
62+
63+
self.assertEqual((xq_out - xq_ref).abs().max().item(), 0.0)
64+
self.assertEqual((xk_out - xk_ref).abs().max().item(), 0.0)
65+
66+
67+
if __name__ == "__main__":
68+
unittest.main()

0 commit comments

Comments
 (0)