
Commit 45ee0a4

yuzhongw-nvidia authored and Phlip79 committed
[main] feat(moe): Support moe shared expert gate for Qwen3-Next (2/4) (NVIDIA#2751)
Co-authored-by: Philip Petrakian <pgpetrak@gmail.com>
1 parent 11de188 · commit 45ee0a4

File tree

5 files changed (+18, -4 lines)


megatron/core/models/gpt/moe_module_specs.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def get_moe_module_spec_for_backend(
     experts = ModuleSpec(module=expert_module, submodules=expert_submodule)
 
     # shared experts spec
-    shared_experts = ModuleSpec(module=SharedExpertMLP, params={"gate": False}, submodules=mlp)
+    shared_experts = ModuleSpec(module=SharedExpertMLP, submodules=mlp)
 
     # MoE module spec
     moe_module_spec = ModuleSpec(
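
The removed params={"gate": False} pinned the gate off at the spec level; with it gone, whether the shared expert builds a gate is decided by the caller when the module is built (see the MoE layer change below). A minimal sketch of that general pattern, using simplified stand-ins for ModuleSpec and build_module (the real megatron.core helpers handle submodules and more; the names here are illustrative, not the actual API):

    from dataclasses import dataclass, field

    @dataclass
    class SpecSketch:
        # Simplified stand-in for megatron.core's ModuleSpec.
        module: type
        params: dict = field(default_factory=dict)

    def build_module_sketch(spec, **kwargs):
        # Spec-level params and call-site kwargs are merged into the constructor;
        # since "gate" is no longer pinned in params, the call site controls it.
        return spec.module(**spec.params, **kwargs)

    class DemoSharedExpert:
        def __init__(self, gate=False):
            self.gate = gate

    spec = SpecSketch(module=DemoSharedExpert)
    expert = build_module_sketch(spec, gate=True)  # gate chosen at build time
    assert expert.gate is True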

megatron/core/transformer/moe/moe_layer.py

Lines changed: 4 additions & 1 deletion
@@ -189,7 +189,10 @@ def __init__(
         # Initialize shared experts
         if self.use_shared_expert:
             self.shared_experts = build_module(
-                self.submodules.shared_experts, config=self.config, pg_collection=pg_collection
+                self.submodules.shared_experts,
+                config=self.config,
+                pg_collection=pg_collection,
+                gate=self.config.moe_shared_expert_gate,
             )
             if self.shared_expert_overlap:
                 self.token_dispatcher.set_shared_experts(self.shared_experts)
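
The new gate kwarg forwards config.moe_shared_expert_gate into SharedExpertMLP at build time. The diff does not show how SharedExpertMLP uses it; the sketch below is an assumption of a typical Qwen-style shared-expert gate, consistent with the +12 parameter delta in the test further down (one (1, hidden_size) weight producing a per-token sigmoid scale). The class and attribute names are hypothetical, not the actual SharedExpertMLP implementation:

    import torch

    class GatedSharedExpertSketch(torch.nn.Module):
        """Hypothetical gated shared expert: MLP output scaled by a sigmoid gate."""

        def __init__(self, hidden_size: int, mlp: torch.nn.Module, gate: bool = False):
            super().__init__()
            self.mlp = mlp
            self.use_gate = gate
            if gate:
                # One gate logit per token from a (1, hidden_size) weight,
                # i.e. hidden_size extra parameters.
                self.gate_weight = torch.nn.Parameter(torch.empty(1, hidden_size))
                torch.nn.init.normal_(self.gate_weight, mean=0.0, std=0.02)

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            output = self.mlp(hidden_states)
            if self.use_gate:
                gate_score = torch.sigmoid(
                    torch.nn.functional.linear(hidden_states, self.gate_weight)
                )
                output = output * gate_score  # broadcast (..., 1) over the hidden dim
            return output

    # Quick shape check with toy sizes.
    mlp = torch.nn.Sequential(torch.nn.Linear(12, 32), torch.nn.GELU(), torch.nn.Linear(32, 12))
    layer = GatedSharedExpertSketch(hidden_size=12, mlp=mlp, gate=True)
    assert layer(torch.randn(4, 12)).shape == (4, 12)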

megatron/core/transformer/transformer_config.py

Lines changed: 4 additions & 0 deletions
@@ -460,6 +460,10 @@ class TransformerConfig(ModelParallelConfig):
     different orders to the hidden_states, causing minor numerical differences
     in the hidden_states gradient."""
 
+    moe_shared_expert_gate: bool = False
+    """Enable gate for shared expert. Only effective when
+    moe-shared-expert-intermediate-size is set."""
+
     moe_shared_expert_overlap: bool = False
     """Enable overlapping between shared expert computations and dispatcher communications.
     Without this, the shared experts execute before the router."""
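
A hedged usage sketch of the new config field. The MoE-related values mirror the unit test below where visible; the remaining fields are illustrative placeholders, and the flag is only meaningful together with moe_shared_expert_intermediate_size:

    from megatron.core.transformer.transformer_config import TransformerConfig

    config = TransformerConfig(
        num_layers=1,                             # placeholder values for illustration
        hidden_size=12,
        num_attention_heads=4,
        num_moe_experts=4,
        moe_router_load_balancing_type="sinkhorn",
        moe_router_topk=1,
        add_bias_linear=False,
        moe_shared_expert_intermediate_size=32,   # required for the gate to take effect
        moe_shared_expert_gate=True,              # the new field added in this commit
    )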

megatron/training/arguments.py

Lines changed: 2 additions & 0 deletions
@@ -3153,6 +3153,8 @@ def _add_moe_args(parser):
                             'This makes the gradients from the router and the shared experts added in '
                             'different orders to the hidden_states, causing minor numerical differences '
                             'in the hidden_states gradient.')
+    group.add_argument('--moe-shared-expert-gate', action='store_true',
+                       help='Enable gate for shared expert. Only effective when moe-shared-expert-intermediate-size is set.')
     group.add_argument('--moe-shared-expert-overlap', action='store_true',
                        help='Enable overlapping between shared expert computations and dispatcher communications. '
                             'Without this, the shared experts execute before the router. '
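
For illustration, a small argparse sketch showing how the new flag parses from the command line; only the two MoE flags are taken from this diff, the surrounding scaffolding (parser setup, option types, example values) is an assumption, and the real training script wires the parsed namespace into TransformerConfig:

    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group('moe')
    group.add_argument('--moe-shared-expert-intermediate-size', type=int, default=None)
    group.add_argument('--moe-shared-expert-gate', action='store_true')

    args = parser.parse_args(
        ['--moe-shared-expert-intermediate-size', '512', '--moe-shared-expert-gate']
    )
    assert args.moe_shared_expert_gate is True        # store_true flag, default False
    assert args.moe_shared_expert_intermediate_size == 512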

tests/unit_tests/transformer/moe/test_shared_experts.py

Lines changed: 7 additions & 2 deletions
@@ -20,7 +20,8 @@ def teardown_method(self, method):
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     @pytest.mark.internal
-    def test_gpu_forward(self):
+    @pytest.mark.parametrize("shared_expert_gate", [False, True])
+    def test_gpu_forward(self, shared_expert_gate):
         Utils.initialize_model_parallel(1, 1)
         model_parallel_cuda_manual_seed(123)
         print("done intializing")
@@ -38,6 +39,7 @@ def test_gpu_forward(self):
             moe_router_load_balancing_type="sinkhorn",
             moe_router_topk=1,
             add_bias_linear=False,
+            moe_shared_expert_gate=shared_expert_gate,
         )
         transformer_layer_spec = get_gpt_layer_local_spec(
             num_experts=num_moe_experts, moe_grouped_gemm=False
@@ -49,7 +51,10 @@ def test_gpu_forward(self):
         assert isinstance(self.moe_layer, MoELayer)
 
         num_weights = sum([p.numel() for p in self.moe_layer.parameters()])
-        assert num_weights == 3480 + 1152
+        if shared_expert_gate:
+            assert num_weights == 3480 + 1152 + 12  # 12 is the weight of the gate
+        else:
+            assert num_weights == 3480 + 1152
         assert self.moe_layer.shared_experts is not None
         assert self.moe_layer.shared_experts.stream is None
         assert self.moe_layer.token_dispatcher.shared_experts is None
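
The +12 in the gated branch is consistent with a single (1, hidden_size) gate weight and hidden_size=12 in this test (an assumption; hidden_size is set outside the visible hunk):

    hidden_size = 12                       # assumed test value, not shown in this hunk
    gate_params = 1 * hidden_size          # one (1, hidden_size) gate weight
    assert 3480 + 1152 + gate_params == 4644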
