Skip to content

Commit ceda986

Browse files
committed
Add Llama 3.2 1B and 3B to model registry, clean up test imports
Llama 3.2 1B and 3B are the only Llama variants with weight tying, so they belong in the registry. Without them the feature has no real entry point. Also dropped the try/except guard in test_weight_tying.py, which was inconsistent with every other unit test here and silently skips on broken imports.
1 parent 365773c commit ceda986

File tree

2 files changed

+67
-18
lines changed

2 files changed

+67
-18
lines changed

tests/unit_tests/test_weight_tying.py

Lines changed: 7 additions & 18 deletions
Original file line number · Diff line number · Diff line change
@@ -6,23 +6,13 @@
66

77
import unittest
88

9-
try:
10-
from torchtitan.models.common.attention import GQAttention
11-
from torchtitan.models.common.embedding import Embedding
12-
from torchtitan.models.common.feed_forward import (
13-
compute_ffn_hidden_dim,
14-
FeedForward,
15-
)
16-
from torchtitan.models.common.linear import Linear
17-
from torchtitan.models.common.rmsnorm import RMSNorm
18-
from torchtitan.models.common.rope import RoPE
19-
from torchtitan.models.llama3.model import Llama3Model, Llama3TransformerBlock
20-
21-
HAS_TORCHTITAN_MODELS = True
22-
except Exception:
23-
HAS_TORCHTITAN_MODELS = False
24-
25-
_SKIP_MSG = "torchtitan model imports not available (missing triton or other deps)"
9+
from torchtitan.models.common.attention import GQAttention
10+
from torchtitan.models.common.embedding import Embedding
11+
from torchtitan.models.common.feed_forward import compute_ffn_hidden_dim, FeedForward
12+
from torchtitan.models.common.linear import Linear
13+
from torchtitan.models.common.rmsnorm import RMSNorm
14+
from torchtitan.models.common.rope import RoPE
15+
from torchtitan.models.llama3.model import Llama3Model, Llama3TransformerBlock
2616

2717

2818
def _make_config(enable_weight_tying: bool = False):
@@ -56,7 +46,6 @@ def _make_config(enable_weight_tying: bool = False):
5646
)
5747

5848

59-
@unittest.skipUnless(HAS_TORCHTITAN_MODELS, _SKIP_MSG)
6049
class TestLlama3WeightTying(unittest.TestCase):
6150
def test_weights_are_shared_when_tying_enabled(self):
6251
"""tok_embeddings.weight and output.weight should share the same storage."""

torchtitan/models/llama3/__init__.py

Lines changed: 60 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -113,6 +113,66 @@
113113
scaling="llama",
114114
),
115115
),
116+
"1B": Llama3Model.Config(
117+
dim=2048,
118+
n_layers=16,
119+
enable_weight_tying=True,
120+
tok_embeddings=Embedding.Config(),
121+
norm=RMSNorm.Config(),
122+
output=Linear.Config(),
123+
layer=Llama3TransformerBlock.Config(
124+
attention_norm=RMSNorm.Config(),
125+
ffn_norm=RMSNorm.Config(),
126+
feed_forward=FeedForward.Config(
127+
hidden_dim=compute_ffn_hidden_dim(
128+
2048, multiple_of=1024, ffn_dim_multiplier=1.5
129+
),
130+
),
131+
attention=GQAttention.Config(
132+
n_heads=32,
133+
n_kv_heads=8,
134+
attn_backend="sdpa",
135+
rope_backend="complex",
136+
),
137+
),
138+
rope=RoPE.Config(
139+
dim=2048 // 32,
140+
max_seq_len=131072,
141+
theta=500000,
142+
backend="complex",
143+
scaling="llama",
144+
),
145+
),
146+
"3B": Llama3Model.Config(
147+
dim=3072,
148+
n_layers=28,
149+
enable_weight_tying=True,
150+
tok_embeddings=Embedding.Config(),
151+
norm=RMSNorm.Config(),
152+
output=Linear.Config(),
153+
layer=Llama3TransformerBlock.Config(
154+
attention_norm=RMSNorm.Config(),
155+
ffn_norm=RMSNorm.Config(),
156+
feed_forward=FeedForward.Config(
157+
hidden_dim=compute_ffn_hidden_dim(
158+
3072, multiple_of=1024, ffn_dim_multiplier=1.0
159+
),
160+
),
161+
attention=GQAttention.Config(
162+
n_heads=24,
163+
n_kv_heads=8,
164+
attn_backend="sdpa",
165+
rope_backend="complex",
166+
),
167+
),
168+
rope=RoPE.Config(
169+
dim=3072 // 24,
170+
max_seq_len=131072,
171+
theta=500000,
172+
backend="complex",
173+
scaling="llama",
174+
),
175+
),
116176
"8B": Llama3Model.Config(
117177
dim=4096,
118178
n_layers=32,

0 commit comments

Comments (0)