
Commit 95f6c25

fix moe token forward
Signed-off-by: Jennifer Chen <[email protected]>
1 parent 0b25bcb commit 95f6c25

File tree: 2 files changed, +49 -8 lines changed

modelopt/torch/quantization/plugins/huggingface.py (16 additions, 7 deletions)
@@ -345,7 +345,16 @@ def backward(ctx, grad_output):
 _transposed_quantize = _TransposedQuantization.apply
 
 
-class _QuantMoeSparseMoe(QuantModule):
+class _QuantSparseMoe(QuantModule):
+    """Module to support special handling of token dispatching during calibration.
+
+    During calibration, we forward all tokens to all experts so that all experts see sufficient tokens to calibrate.
+    However, even in calibration mode, the actual top_k routing is used to calculate the actual outputs this instance
+    returns.
+
+    If calibration is not enabled, this module behaves as a normal MoELayer.
+    """
+
     def _setup(self):
         pass
 
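The forward that implements this docstring is not part of this hunk. Below is a sketch of the pattern it describes, assuming the wrapped block exposes top_k and num_experts attributes (the _QuantDbrxFFN overrides in the next hunk map DBRX's differently named fields onto exactly these) and using the same _if_calib check as the Megatron patch later in this commit. This is an illustration, not the actual implementation:

class _SketchSparseMoe(QuantModule):  # hypothetical name, illustration only
    def forward(self, hidden_states):
        # If any child quantizer is calibrating, run one pass with every token
        # routed to every expert so all experts collect statistics, then restore
        # the real top_k before computing the output that is actually returned.
        if any(getattr(m, "_if_calib", False) for m in self.modules()):
            original_top_k = self.top_k
            self.top_k = self.num_experts
            super().forward(hidden_states)
            self.top_k = original_top_k
        return super().forward(hidden_states)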

@@ -480,7 +489,7 @@ def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor:
         return self.w2_linear[expert_idx](x1)
 
 
-class _QuantDbrxFFN(_QuantMoeSparseMoe):
+class _QuantDbrxFFN(_QuantSparseMoe):
     @property
     def num_experts(self):
         return self.router.moe_num_experts
@@ -498,7 +507,7 @@ def top_k(self, value):
     from transformers.models.llama4.modeling_llama4 import Llama4TextExperts, Llama4TextMoe
 
     if Llama4TextMoe not in QuantModuleRegistry:
-        QuantModuleRegistry.register({Llama4TextMoe: "hf.Llama4TextMoe"})(_QuantMoeSparseMoe)
+        QuantModuleRegistry.register({Llama4TextMoe: "hf.Llama4TextMoe"})(_QuantSparseMoe)
 
     if Llama4TextExperts not in QuantModuleRegistry:
         QuantModuleRegistry.register({Llama4TextExperts: "hf.Llama4TextExperts"})(
@@ -526,7 +535,7 @@ def top_k(self, value):
 
     if MixtralSparseMoeBlock not in QuantModuleRegistry:
         QuantModuleRegistry.register({MixtralSparseMoeBlock: "hf.MixtralSparseMoeBlock"})(
-            _QuantMoeSparseMoe
+            _QuantSparseMoe
         )
 except ImportError:
     pass
@@ -544,7 +553,7 @@ def top_k(self, value):
 
     if Qwen3MoeSparseMoeBlock not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen3MoeSparseMoeBlock: "hf.Qwen3MoeSparseMoeBlock"})(
-            _QuantMoeSparseMoe
+            _QuantSparseMoe
         )
 except ImportError:
     pass
@@ -554,7 +563,7 @@ def top_k(self, value):
 
     if Qwen2MoeSparseMoeBlock not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen2MoeSparseMoeBlock: "hf.Qwen2MoeSparseMoeBlock"})(
-            _QuantMoeSparseMoe
+            _QuantSparseMoe
         )
 except ImportError:
     pass
@@ -564,7 +573,7 @@ def top_k(self, value):
 
     if Qwen3NextSparseMoeBlock not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen3NextSparseMoeBlock: "hf.Qwen3NextSparseMoeBlock"})(
-            _QuantMoeSparseMoe
+            _QuantSparseMoe
        )
 except ImportError:
     pass

modelopt/torch/quantization/plugins/megatron.py (33 additions, 1 deletion)
@@ -23,6 +23,7 @@
 import megatron.core.tensor_parallel.layers as megatron_parallel
 import megatron.core.transformer.mlp as megatron_mlp
 import megatron.core.transformer.moe.experts as megatron_moe
+import megatron.core.transformer.moe.moe_layer as megatron_moe_layer
 import torch
 from megatron.core.parallel_state import get_data_parallel_group
 from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
@@ -36,7 +37,7 @@
 )
 from modelopt.torch.utils.distributed import ParallelState
 
-from ..nn import QuantModuleRegistry, TensorQuantizer
+from ..nn import QuantModule, QuantModuleRegistry, TensorQuantizer
 from ..nn.modules.quant_linear import RealQuantLinear
 from ..qtensor import QTensorWrapper
 from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear
@@ -247,6 +248,14 @@ def _setup(self):
             data_parallel_group,
             mcore_parallel.get_tensor_model_parallel_group(),
         )
+
+        if getattr(self, "gradient_accumulation_fusion", False):
+            warnings.warn(
+                "gradient_accumulation_fusion is not supported with ModelOpt quantization. "
+                "Setting gradient_accumulation_fusion to False."
+            )
+            self.gradient_accumulation_fusion = False
+
         super()._setup()
 
     def _process_quantizer_amax(self, k, v, quantizer_state_dict):
@@ -580,3 +589,26 @@ def _setup(self):
         # initialize parallel state for submodules linear_fc1 and linear_fc2
         self.linear_fc1.parallel_state = self.parallel_state
         self.linear_fc2.parallel_state = self.parallel_state
+
+
+@QuantModuleRegistry.register({megatron_moe_layer.MoELayer: "megatron_moe_MoELayer"})
+class _QuantMoELayer(QuantModule):
+    """Module to support special handling of token dispatching during calibration.
+
+    During calibration, we forward all tokens to all experts so that all experts see sufficient tokens to calibrate.
+    However, even in calibration mode, the actual top_k routing is used to calculate the actual outputs this instance
+    returns.
+
+    If calibration is not enabled, this module behaves as a normal MoELayer.
+    """
+
+    def _setup(self):
+        pass
+
+    def forward(self, hidden_states):
+        if any(getattr(m, "_if_calib", False) for m in self.experts.modules()):
+            original_top_k = self.router.topk
+            self.router.topk = self.router.num_experts
+            super().forward(hidden_states)
+            self.router.topk = original_top_k
+        return super().forward(hidden_states)
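With MoELayer registered, the calibration path is exercised through ModelOpt's usual quantize flow. A minimal sketch, assuming the mtq.quantize(model, config, forward_loop) API; the model, calib_dataloader, and config choice are placeholders, not part of this commit:

import modelopt.torch.quantization as mtq

# Placeholders: substitute your own Megatron model and calibration data.
model = ...
calib_dataloader = ...

def forward_loop(m):
    # While quantizers are calibrating (_if_calib is set), _QuantMoELayer.forward
    # temporarily routes every token to every expert so all experts collect amax.
    for batch in calib_dataloader:
        m(batch)

# Sketch only: mtq.quantize swaps registered modules for their quant counterparts,
# runs forward_loop to collect calibration statistics, and returns the model.
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)

Note the design choice in the patched forward: the first super().forward runs with topk raised to num_experts purely so every expert's quantizers see tokens, and its output is discarded; the value actually returned comes from the second pass with the original top-k routing, so calibration does not change what downstream layers observe.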
