support lora moe

mori360 · mori360 · commit 559ebe17a2d6 · 2026-03-16T20:26:31.000-07:00
ghstack-source-id: 726558b Pull Request resolved: #2569
diff --git a/tests/unit_tests/test_model_converter.py b/tests/unit_tests/test_model_converter.py
@@ -205,6 +205,85 @@ def test_lora_key_remap_roundtrip():
         assert torch.equal(rt_sd[k], tt_sd[k])
 
 
+def test_lora_moe_freeze_and_trainability():
+    """LoRA on MoE model: router frozen, expert LoRA adapters trainable, base weights frozen."""
+    from torchtitan.models.common.moe.moe import GroupedExperts, TokenChoiceTopKRouter
+
+    # Build a minimal MoE-like model: a router + grouped experts + a dense linear
+    num_experts = 4
+    dim = 64
+    hidden_dim = 128
+
+    class SimpleMoEModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.router = TokenChoiceTopKRouter(
+                dim=dim,
+                num_experts=num_experts,
+                num_expert_groups=None,
+                num_limited_groups=None,
+                top_k=2,
+                score_func="softmax",
+                route_norm=False,
+                route_scale=1.0,
+                gate_bias=False,
+            )
+            self.experts = GroupedExperts(
+                dim=dim,
+                hidden_dim=hidden_dim,
+                num_experts=num_experts,
+                use_grouped_mm=False,
+            )
+            self.output = nn.Linear(dim, dim)
+
+        def forward(self, x):
+            # Just test that LoRA params exist — no need for full MoE forward
+            return self.output(x)
+
+    model = SimpleMoEModel()
+    converter = LoRAConverter(LoRAConverter.Config(rank=4, alpha=8.0))
+    converter.convert(model)
+
+    # Router gate should be frozen (LoRA skips router gates)
+    assert not hasattr(model.router.gate, "lora_a"), "Router gate should not have LoRA"
+    for param in model.router.parameters():
+        assert not param.requires_grad, "Router params should be frozen"
+
+    # Dense linear should have LoRA adapters
+    assert hasattr(model.output, "lora_a")
+    assert hasattr(model.output, "lora_b")
+
+    # GroupedExperts should have expert LoRA adapters
+    assert hasattr(model.experts, "lora_a_w1")
+    assert hasattr(model.experts, "lora_b_w1")
+    assert hasattr(model.experts, "lora_a_w2")
+    assert hasattr(model.experts, "lora_b_w2")
+    assert hasattr(model.experts, "lora_a_w3")
+    assert hasattr(model.experts, "lora_b_w3")
+
+    # Check trainability: LoRA params trainable, base params frozen
+    lora_param_names = []
+    base_param_names = []
+    for name, param in model.named_parameters():
+        if "lora_a" in name or "lora_b" in name:
+            lora_param_names.append(name)
+            assert param.requires_grad, f"LoRA param '{name}' should be trainable"
+        else:
+            base_param_names.append(name)
+            assert not param.requires_grad, f"Base param '{name}' should be frozen"
+
+    assert len(lora_param_names) > 0, "No LoRA params found"
+    assert len(base_param_names) > 0, "No base params found"
+
+    # Verify expert LoRA shapes: (num_experts, *, rank) or (num_experts, rank, *)
+    assert model.experts.lora_a_w1.shape == (num_experts, dim, 4)
+    assert model.experts.lora_b_w1.shape == (num_experts, 4, hidden_dim)
+    assert model.experts.lora_a_w2.shape == (num_experts, hidden_dim, 4)
+    assert model.experts.lora_b_w2.shape == (num_experts, 4, dim)
+    assert model.experts.lora_a_w3.shape == (num_experts, dim, 4)
+    assert model.experts.lora_b_w3.shape == (num_experts, 4, hidden_dim)
+
+
 def test_qat_preserves_weight_dtype():
     """QAT converter should not change weight dtype (fake quantization happens in forward)."""
     pytest.importorskip("torchao")
diff --git a/torchtitan/components/lora.py b/torchtitan/components/lora.py
@@ -14,11 +14,15 @@
 
 from torchtitan.config import Configurable
 from torchtitan.models.common.linear import Linear
+from torchtitan.models.common.moe.moe import GroupedExperts, TokenChoiceTopKRouter
 from torchtitan.tools.logging import logger
 
 # Cache for dynamically created LoRA classes
 _lora_class_cache: dict[type, type] = {}
 
+# Cache for dynamically created expert LoRA classes
+_expert_lora_class_cache: dict[type, type] = {}
+
 
 def apply_lora(linear: nn.Linear, rank: int, alpha: float) -> nn.Linear:
     parent_cls = type(linear)
@@ -79,8 +83,164 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
     return _lora_class_cache[parent_cls].from_linear(linear, rank, alpha)
 
 
+def _compute_expert_lora_delta(
+    lora_a: torch.Tensor,
+    lora_b: torch.Tensor,
+    scaling: float,
+    target_weight: nn.Parameter,
+) -> torch.Tensor:
+    """Compute the LoRA weight delta for expert weights.
+
+    Args:
+        lora_a: (E, in, r) — projects input dim to rank.
+        lora_b: (E, r, out) — projects rank to output dim.
+        scaling: alpha / rank.
+        target_weight: The base weight parameter to match DTensor placements.
+
+    Returns:
+        delta matching target_weight's shape and placements.
+        Math: delta = scaling * B^T @ A^T  →  shape (E, out, in).
+    """
+    from torch.distributed.tensor import distribute_tensor, DTensor
+
+    delta = scaling * torch.bmm(lora_b.transpose(-2, -1), lora_a.transpose(-2, -1))
+    # When the base weight is a DTensor (TP/EP sharded), distribute the delta
+    # to match its placements so the in-place add_/sub_ operates on matching shapes.
+    if isinstance(target_weight, DTensor) and not isinstance(delta, DTensor):
+        delta = distribute_tensor(
+            delta, target_weight.device_mesh, target_weight.placements
+        )
+    return delta
+
+
+def apply_expert_lora(
+    experts: GroupedExperts, rank: int, alpha: float
+) -> GroupedExperts:
+    """Apply LoRA adapters to a GroupedExperts module via class swapping.
+
+    LoRA parameters are registered as direct parameters on the module. EP partition
+    functions that use ``named_parameters(recurse=False)`` with ``Shard(0)`` will
+    correctly shard them on the expert dimension. TP/ETP partition functions only
+    touch w1/w2/w3 by name and leave LoRA parameters unsharded.
+
+    Forward uses merge-per-forward: LoRA deltas are merged into base weights before
+    calling the base forward, then unmerged after. This reuses the base
+    GroupedExperts.forward without duplicating its DTensor/EP/padding logic.
+    """
+    parent_cls = type(experts)
+    assert issubclass(
+        parent_cls, GroupedExperts
+    ), f"parent_cls must be a subclass of GroupedExperts, got {parent_cls}"
+
+    if parent_cls not in _expert_lora_class_cache:
+
+        class LoRAGroupedExperts(parent_cls):  # type: ignore[valid-type, misc]
+            def __init__(self, *args: Any, **kwargs: Any) -> None:
+                raise RuntimeError(
+                    "LoRAGroupedExperts should not be instantiated directly."
+                )
+
+            @classmethod
+            def from_experts(
+                cls, experts: GroupedExperts, rank: int, alpha: float
+            ) -> "LoRAGroupedExperts":
+                experts.__class__ = cls
+                experts._init_expert_lora(rank, alpha)  # type: ignore[attr-defined]
+                return experts  # type: ignore[return-value]
+
+            def _init_expert_lora(self, rank: int, alpha: float) -> None:
+                self._lora_scaling = alpha / rank
+                num_experts = self.num_experts
+                # w1: (E, hidden_dim, dim) -> A1: (E, dim, r), B1: (E, r, hidden_dim)
+                dim_w1_in = self.w1.shape[2]  # dim
+                dim_w1_out = self.w1.shape[1]  # hidden_dim
+                # w2: (E, dim, hidden_dim) -> A2: (E, hidden_dim, r), B2: (E, r, dim)
+                dim_w2_in = self.w2.shape[2]  # hidden_dim
+                dim_w2_out = self.w2.shape[1]  # dim
+                # w3: (E, hidden_dim, dim) -> A3: (E, dim, r), B3: (E, r, hidden_dim)
+                dim_w3_in = self.w3.shape[2]  # dim
+                dim_w3_out = self.w3.shape[1]  # hidden_dim
+
+                device = self.w1.device
+                dtype = self.w1.dtype
+
+                self.lora_a_w1 = nn.Parameter(
+                    torch.empty(
+                        num_experts, dim_w1_in, rank, device=device, dtype=dtype
+                    )
+                )
+                self.lora_b_w1 = nn.Parameter(
+                    torch.empty(
+                        num_experts, rank, dim_w1_out, device=device, dtype=dtype
+                    )
+                )
+                self.lora_a_w2 = nn.Parameter(
+                    torch.empty(
+                        num_experts, dim_w2_in, rank, device=device, dtype=dtype
+                    )
+                )
+                self.lora_b_w2 = nn.Parameter(
+                    torch.empty(
+                        num_experts, rank, dim_w2_out, device=device, dtype=dtype
+                    )
+                )
+                self.lora_a_w3 = nn.Parameter(
+                    torch.empty(
+                        num_experts, dim_w3_in, rank, device=device, dtype=dtype
+                    )
+                )
+                self.lora_b_w3 = nn.Parameter(
+                    torch.empty(
+                        num_experts, rank, dim_w3_out, device=device, dtype=dtype
+                    )
+                )
+
+            def init_weights(self, init_std: float) -> None:
+                super().init_weights(init_std)
+                for name in ("lora_a_w1", "lora_a_w2", "lora_a_w3"):
+                    nn.init.kaiming_uniform_(getattr(self, name), a=math.sqrt(5))
+                for name in ("lora_b_w1", "lora_b_w2", "lora_b_w3"):
+                    nn.init.zeros_(getattr(self, name))
+
+            def forward(
+                self,
+                x: torch.Tensor,
+                num_tokens_per_expert: torch.Tensor,
+            ) -> torch.Tensor:
+                # Merge LoRA deltas into base weights, run base forward, unmerge.
+                # This reuses all base GroupedExperts logic (DTensor, EP, padding).
+                deltas = {}
+                for w_name, a_name, b_name in (
+                    ("w1", "lora_a_w1", "lora_b_w1"),
+                    ("w2", "lora_a_w2", "lora_b_w2"),
+                    ("w3", "lora_a_w3", "lora_b_w3"),
+                ):
+                    lora_a = getattr(self, a_name)
+                    lora_b = getattr(self, b_name)
+                    w = getattr(self, w_name)
+                    delta = _compute_expert_lora_delta(
+                        lora_a, lora_b, self._lora_scaling, w
+                    )
+                    w.data.add_(delta)
+                    deltas[w_name] = delta
+
+                try:
+                    return super().forward(x, num_tokens_per_expert)
+                finally:
+                    # Unmerge: subtract deltas to restore original weights
+                    for w_name, delta in deltas.items():
+                        getattr(self, w_name).data.sub_(delta)
+
+        LoRAGroupedExperts.__name__ = f"LoRA{parent_cls.__name__}"
+        LoRAGroupedExperts.__qualname__ = f"LoRA{parent_cls.__name__}"
+        _expert_lora_class_cache[parent_cls] = LoRAGroupedExperts
+
+    # pyrefly: ignore [missing-attribute]
+    return _expert_lora_class_cache[parent_cls].from_experts(experts, rank, alpha)
+
+
 class LoRAConverter(Configurable):
-    """Apply LoRA adapters to all Linear layers in a model."""
+    """Apply LoRA adapters to all Linear layers and GroupedExperts in a model."""
 
     @dataclass(kw_only=True, slots=True)
     class Config(Configurable.Config):
@@ -125,9 +285,18 @@ def convert(self, model: nn.Module) -> None:
         }
 
     def _replace_linears_with_lora(self, module: nn.Module) -> None:
+        # Collect router gate linears so we can skip them — routing scores
+        # must stay frozen to preserve expert load balancing.
+        router_gate_ids: set[int] = set()
+        for child in module.modules():
+            if isinstance(child, TokenChoiceTopKRouter):
+                router_gate_ids.add(id(child.gate))
+
         for _, child in list(module.named_modules()):
-            if isinstance(child, nn.Linear):
+            if isinstance(child, nn.Linear) and id(child) not in router_gate_ids:
                 apply_lora(child, self.rank, self.alpha)
+            elif isinstance(child, GroupedExperts):
+                apply_expert_lora(child, self.rank, self.alpha)
 
     def post_optimizer_hook(self, model: nn.Module | list[nn.Module]) -> None:
         pass
diff --git a/torchtitan/models/deepseek_v3/config_registry.py b/torchtitan/models/deepseek_v3/config_registry.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from torchtitan.components.checkpoint import CheckpointManager
+from torchtitan.components.lora import LoRAConverter
 from torchtitan.components.lr_scheduler import LRSchedulersContainer
 from torchtitan.components.metrics import MetricsProcessor
 from torchtitan.components.optimizer import OptimizersContainer
@@ -58,6 +59,19 @@ def deepseek_v3_debugmodel() -> Trainer.Config:
     )
 
 
+def deepseek_v3_debugmodel_lora() -> Trainer.Config:
+    config = deepseek_v3_debugmodel()
+    config.model_converters = ModelConvertersContainer.Config(
+        converters=[
+            LoRAConverter.Config(
+                rank=8,
+                alpha=16.0,
+            ),
+        ],
+    )
+    return config
+
+
 def deepseek_v3_debugmodel_flex_attn() -> Trainer.Config:
     config = deepseek_v3_debugmodel()
     config.model_spec = model_registry("debugmodel_flex_attn")