
Commit 6c38eae

fix: handle zero active experts for 1 ep rank in GroupedExperts (#935)
Signed-off-by: Hemil Desai <[email protected]>
1 parent: c05cb81

File tree

3 files changed: +474 -0 lines changed


nemo_automodel/components/moe/layers.py

Lines changed: 23 additions & 0 deletions
@@ -303,6 +303,29 @@ def get_local_proj(proj, expert_id):
 
             y.scatter_add_(dim=0, index=idx_b, src=expert_out.to(x.dtype))
 
+        # Handle the edge case where no tokens are routed to any local experts.
+        # This can occur during expert parallelism when all tokens on a particular
+        # rank happen to select experts hosted on other ranks. We perform a dummy
+        # computation through the local expert weights to ensure:
+        # 1. Gradient flow through local expert parameters during backpropagation
+        # 2. Proper participation in collective operations (reduce-scatter)
+        # The computation is a no-op since we multiply by zero (using zeros_like input).
+        if active_local_experts == 0:
+            gate_and_up_proj = get_local_proj(self.gate_and_up_projs, experts_start_idx)
+            down_proj = get_local_proj(self.down_projs, experts_start_idx)
+            gate_up_proj_bias = get_local_proj(self.gate_up_proj_bias, experts_start_idx) if self.expert_bias else None
+            down_proj_bias = get_local_proj(self.down_proj_bias, experts_start_idx) if self.expert_bias else None
+
+            expert_out = (
+                self.expert_activation(
+                    torch.zeros_like(x[0]).unsqueeze(0),
+                    gate_and_up_proj=gate_and_up_proj,
+                    down_proj=down_proj,
+                )
+                * weights[0, 0, None]
+            )
+            y[0] += expert_out[0]
+
         if ep_size > 1:
             y = DTensor.from_local(y, device_mesh=ep_mesh, placements=[Partial()])
             y = y.redistribute(placements=[Shard(0)]).to_local()
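
The comment block in the hunk above describes the pattern the fix relies on: when a rank's tokens all route to experts hosted elsewhere, a dummy forward through the local weights with an all-zeros input adds nothing to the output but keeps those parameters in the autograd graph, so gradient collectives such as the reduce-scatter still see them. The snippet below is a minimal standalone sketch of that idea; the simplified bias-free swiglu helper and the tensor shapes are illustrative assumptions, not the GroupedExperts API.

import torch
import torch.nn.functional as F

# Standalone sketch (illustrative shapes, simplified bias-free swiglu): an
# all-zeros input produces an exactly-zero output, so accumulating it changes
# nothing numerically, yet both weight tensors end up with populated .grad
# buffers after backward.
dim, inter_dim = 8, 16
gate_and_up_proj = torch.randn(dim, 2 * inter_dim, requires_grad=True)
down_proj = torch.randn(inter_dim, dim, requires_grad=True)

def swiglu(x, gate_and_up_proj, down_proj):
    gate, up = torch.chunk(x @ gate_and_up_proj, 2, -1)
    return (F.silu(gate) * up) @ down_proj

routing_weight = torch.rand(1)  # stands in for weights[0, 0, None]
dummy_out = swiglu(torch.zeros(1, dim), gate_and_up_proj, down_proj) * routing_weight

assert torch.all(dummy_out == 0)          # the added contribution is a numerical no-op
dummy_out.sum().backward()
print(gate_and_up_proj.grad is not None)  # True: weights stayed in the autograd graph
print(down_proj.grad is not None)         # True

In the actual diff the same effect comes from feeding torch.zeros_like(x[0]).unsqueeze(0) through the rank's first local expert weights and adding the result into y[0].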
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

import torch
import torch.nn.functional as F

from nemo_automodel.components.moe.layers import GroupedExperts, MoEConfig

# Track whether expert_activation was called
activation_called = [False]


def tracking_swiglu(x, *, gate_and_up_proj, down_proj, gate_up_proj_bias=None, down_proj_bias=None):
    """Tracking version of swiglu that sets activation_called[0] = True."""
    global activation_called
    activation_called[0] = True
    gate_and_up_out = x @ gate_and_up_proj
    if gate_up_proj_bias is not None:
        gate_and_up_out = gate_and_up_out + gate_up_proj_bias
    gate_out, up_out = torch.chunk(gate_and_up_out, 2, -1)
    inter = F.silu(gate_out) * up_out
    inter = inter @ down_proj
    if down_proj_bias is not None:
        inter = inter + down_proj_bias
    return inter


def main(device_str: str = "cuda:0") -> int:
    """
    Run the zero active experts gradient test.

    Args:
        device_str: Device to run on ("cuda:0" or "cpu")

    Returns:
        0 if test passed, 1 if test failed
    """
    # Use global activation_called to track across function boundaries
    global activation_called
    activation_called[0] = False  # Reset at start

    moe_config = MoEConfig(
        n_routed_experts=8,
        n_shared_experts=2,
        n_activated_experts=2,
        n_expert_groups=1,
        n_limited_groups=1,
        train_gate=True,
        gate_bias_update_factor=0.1,
        aux_loss_coeff=0.01,
        score_func="softmax",
        route_scale=1.0,
        dim=128,
        inter_dim=256,
        moe_inter_dim=256,
        norm_topk_prob=False,
        router_bias=False,
        expert_bias=False,
        expert_activation="swiglu",
        activation_alpha=1.702,
        activation_limit=7.0,
        dtype=torch.float32,
    )

    device = torch.device(device_str)
    experts = GroupedExperts(moe_config)
    experts.expert_activation = tracking_swiglu
    experts = experts.to(device)

    with torch.no_grad():
        experts.gate_and_up_projs.normal_(0, 0.02)
        experts.down_projs.normal_(0, 0.02)

    num_tokens = 8
    x = torch.randn(num_tokens, moe_config.dim, dtype=torch.float32, device=device)
    token_mask = torch.ones(num_tokens, dtype=torch.bool, device=device)
    weights = torch.rand(num_tokens, moe_config.n_activated_experts, dtype=torch.float32, device=device)

    # Set indices to non-existent expert (simulates all tokens routed elsewhere)
    indices = torch.full(
        (num_tokens, moe_config.n_activated_experts),
        fill_value=moe_config.n_routed_experts + 100,
        dtype=torch.long,
        device=device,
    )

    output = experts.forward(x, token_mask, weights, indices)

    if activation_called[0]:
        print("SUCCESS: expert_activation was called even when no tokens select any expert")
        return 0
    else:
        print("FAIL: expert_activation was NOT called - the zero active experts fix is missing or broken")
        return 1


if __name__ == "__main__":
    device = sys.argv[1] if len(sys.argv) > 1 else "cuda:0"
    sys.exit(main(device))
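
The script above only asserts that expert_activation ran; per its __main__ guard, the device is passed as the first command-line argument (for example "cpu" when no GPU is available), defaulting to "cuda:0". A possible extension, sketched below under the assumption that the returned output is a differentiable tensor and that gate_and_up_projs and down_projs are the module's parameters, would also verify that the dummy path populates their gradients; this check is not part of the commit.

# Hypothetical extension to main(), assuming `output` is differentiable and the
# projection tensors are parameters: verify the dummy computation keeps the
# expert weights in the autograd graph even though no token selected them.
output.sum().backward()
assert experts.gate_and_up_projs.grad is not None, "gate/up projections missing from the graph"
assert experts.down_projs.grad is not None, "down projections missing from the graph"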
