9 changes: 5 additions & 4 deletions examples/nemo_run/qat/README.md
@@ -56,15 +56,16 @@ The resulting exported checkpoint also is much smaller in memory at 6.4GB compar

You can run the example either locally or on a [Slurm cluster](ADVANCED.md).

To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container.
To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount them into your Docker container.

- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
- `git clone https://github.com/NVIDIA-NeMo/NeMo.git`
- `git clone https://github.com/NVIDIA/Megatron-LM.git`

Example docker command:

```bash
docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt -v /home/user/Megatron-LM:/opt/megatron-lm --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash
```

You will also need to set your Hugging Face token with `export HF_TOKEN=<your-token>`. You may also need to give the Docker container write access to the `examples/nemo_run` folder (e.g. `chmod 777 nemo_run`) so that logs can be written.
@@ -92,7 +93,7 @@ In order to train using QAD, launch the example with `python qat/nemo_qat_flow.p
To perform QAD training, run:

```bash
python qat/nemo_qat_flow.py --distill --log-dir /my/log/dir --experiment qad_experiment
python qat/nemo_qat_flow.py --distill --log-dir /my/log/dir --experiment qad_experiment --tensor_parallelism 4
```

## Supported models
31 changes: 25 additions & 6 deletions modelopt/torch/quantization/model_calib.py
@@ -80,21 +80,26 @@ def max_calibrate(model: nn.Module, forward_loop: ForwardLoop | None = None, dis
if not distributed_sync:
return

def sync_quantizer_amax_across_dp(quantizer, parallel_state):
def sync_quantizer_amax_across_dp_ep(quantizer, parallel_state):
"""Synchronize the amax across all ranks in the data parallel and context parallel groups."""
if isinstance(quantizer, SequentialQuantizer):
for _q in quantizer:
sync_quantizer_amax_across_dp(_q, parallel_state)
sync_quantizer_amax_across_dp_ep(_q, parallel_state)
return
if getattr(quantizer, "_amax", None) is not None:
quantizer.sync_amax_across_distributed_group(parallel_state.data_parallel_group)
quantizer.sync_amax_across_distributed_group(parallel_state.expert_model_parallel_group)
if parallel_state.expert_tensor_parallel_group is not None:
quantizer.sync_amax_across_distributed_group(
parallel_state.expert_tensor_parallel_group
)
[Reviewer comment from a Contributor on lines +92 to +95]: tensor parallel sync here is not handled correctly across various cases. See the comments before `sync_quantizer_amax_across_tp` for more details.

# TODO: create sync_bias_across_distributed_group

for name, module in model.named_modules():
if isinstance(module, QuantModule):
for child in module.children():
if isinstance(child, (TensorQuantizer, SequentialQuantizer)):
sync_quantizer_amax_across_dp(child, module.parallel_state)

sync_quantizer_amax_across_dp_ep(child, module.parallel_state)
# TP sync:
# Objective: the quantization parameters when TP = 8 then changed to TP=4 then back to TP=8 should be the same
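For orientation, syncing `amax` across a process group (whether DP/EP as above or TP as below) is conceptually an elementwise max-reduction: each rank contributes the largest magnitude it observed during calibration, and afterwards every rank in the group holds the same value, so all replicas quantize with identical scales. A minimal sketch of that idea, assuming an initialized `torch.distributed` backend (the helper name and signature are illustrative, not the modelopt API):

```python
import torch
import torch.distributed as dist


def sync_amax_across_group(amax: torch.Tensor, group: dist.ProcessGroup | None) -> torch.Tensor:
    """Elementwise max-reduce an amax buffer so every rank in `group` ends up identical."""
    if group is None or not dist.is_initialized():
        return amax  # nothing to sync in single-process or uninitialized runs
    dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=group)
    return amax
```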

@@ -116,6 +121,7 @@ def sync_quantizer_amax_across_tp(
):
if isinstance(quantizer, SequentialQuantizer):
for _q in quantizer:
"Syncing amax across TP for sequential quantizer"
sync_quantizer_amax_across_tp(
_q, linear_name, quantizer_type, axes_for_sync, parallel_state
)
@@ -598,19 +604,32 @@ def forward(self, input, *args, **kwargs):
# This will also perform distributed amax sync for input_quantizers
max_calibrate(model, lambda model: None)

def sync_act_scale_across_dp(module, data_parallel_group):
"""Sync activation scale across Data Parallel (DP)."""
if data_parallel_group.is_initialized():
dist.all_reduce(
module.awq_lite.act_scale, op=dist.ReduceOp.AVG, group=data_parallel_group.group
)

for name, module in model.named_modules():
if (
is_quantized_linear(module)
and hasattr(module, "awq_lite")
and module.awq_lite.num_cache_steps > 0
):
# Hack: MoEs forward all tokens through all experts if _if_calib is True
module._if_calib = True
module.awq_lite.act_scale = module.awq_lite.act_scale / module.awq_lite.num_cache_steps

if torch.any(torch.isnan(module.awq_lite.act_scale)) or torch.any(
torch.isnan(module.awq_lite.weight_scale)
):
module.awq_lite.is_enabled = False
# Hack: MoEs forward all tokens through all experts if _if_calib is True
module._if_calib = True
else:
sync_act_scale_across_dp(
module,
module.parallel_state.data_parallel_group,
)

AWQLiteHelper.cache_mode = False
print_rank_0("awq_lite: Searching parameters...")
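To make the AWQ-lite bookkeeping above easier to follow: the cached activation scale is first averaged over the number of calibration steps; if the result contains NaNs (the diff also checks the weight scale) the layer's AWQ-lite search is disabled, otherwise the scale is additionally averaged across data-parallel ranks. A hedged sketch of that flow using plain `torch.distributed` (the helper and its return convention are illustrative):

```python
import torch
import torch.distributed as dist


def finalize_act_scale(
    act_scale: torch.Tensor, num_cache_steps: int, dp_group: dist.ProcessGroup | None
) -> tuple[torch.Tensor, bool]:
    """Average cached activation scales over steps, then across DP ranks; return (scale, enabled)."""
    act_scale = act_scale / num_cache_steps
    if torch.isnan(act_scale).any():
        return act_scale, False  # disable AWQ-lite for this layer, as in the diff
    if dp_group is not None and dist.is_initialized():
        dist.all_reduce(act_scale, op=dist.ReduceOp.AVG, group=dp_group)
    return act_scale, True
```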
124 changes: 122 additions & 2 deletions modelopt/torch/quantization/plugins/megatron.py
@@ -15,13 +15,17 @@

"""Support quantization for megatron linear layers."""

import logging
import warnings
from typing import Any

import megatron.core.parallel_state as mcore_parallel
import megatron.core.tensor_parallel.layers as megatron_parallel
import megatron.core.transformer.mlp as megatron_mlp
import torch
import transformer_engine.pytorch.module.grouped_linear as te_grouped_linear
from megatron.core.extensions import transformer_engine as megatron_te
from megatron.core.parallel_state import get_data_parallel_group
from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
from megatron.core.transformer import MegatronModule
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
@@ -34,10 +38,12 @@
from modelopt.torch.utils.distributed import ParallelState

from ..nn import QuantModuleRegistry, TensorQuantizer
from ..nn.modules.quant_linear import RealQuantLinear
from ..nn.modules.quant_linear import RealQuantLinear, _QuantLinear
from ..qtensor import QTensorWrapper
from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear

logger = logging.getLogger(__name__)

__all__ = []


@@ -217,9 +223,23 @@ class _MegatronParallelLinear(_ParallelLinear):
]

def _setup(self):
data_parallel_group = None
try:
data_parallel_group = get_data_parallel_group(with_context_parallel=True)
except AssertionError:
logger.warning("Context parallel group is not initialized, using data parallel group")
data_parallel_group = get_data_parallel_group()

try:
expert_tensor_parallel_group = mcore_parallel.get_expert_tensor_parallel_group()
except AssertionError:
expert_tensor_parallel_group = None

self.parallel_state = ParallelState(
getattr(mcore_parallel, "get_expert_data_parallel_group", "get_data_parallel_group")(),
data_parallel_group,
mcore_parallel.get_tensor_model_parallel_group(),
mcore_parallel.get_expert_model_parallel_group(),
expert_tensor_parallel_group,
)
super()._setup()

@@ -462,3 +482,103 @@ class _RealQuantMegatronRowParallelLinear(

def forward(self, input, *args, **kwargs):
return _MegatronRowParallelLinear.forward(self, input, *args, **kwargs)


# Register the public te.pytorch.GroupedLinear class
@QuantModuleRegistry.register({te_grouped_linear.GroupedLinear: "te_GroupedLinear_public"})
class _QuantTEGroupedLinear(_MegatronParallelLinear):
def _setup(self):
data_parallel_group = None
try:
data_parallel_group = get_data_parallel_group(with_context_parallel=True)
except AssertionError:
data_parallel_group = get_data_parallel_group()

try:
expert_tensor_parallel_group = mcore_parallel.get_expert_tensor_parallel_group()
except AssertionError:
expert_tensor_parallel_group = None
self.parallel_state = ParallelState(
data_parallel_group,
mcore_parallel.get_tensor_model_parallel_group(),
mcore_parallel.get_expert_model_parallel_group(),
expert_tensor_parallel_group,
)
self.input_quantizer = TensorQuantizer(_QuantLinear.default_quant_desc_input)
self.weight_quantizer = TensorQuantizer(_QuantLinear.default_quant_desc_weight)
self.output_quantizer = TensorQuantizer(_QuantLinear.default_quant_desc_output)
self.output_quantizer.disable()

# Memorize the original weight.dtype for modelopt_post_restore given that
# the dtype can change later.
self.original_weight_dtype = None if self.weight0 is None else self.weight0.dtype

@property
def functionals_to_replace(self):
original_forward = te_grouped_linear._GroupedLinear.forward

def te_grouped_quantized_linear_fn(ctx, inp, m_splits, *args):
num_gemms = len(m_splits)
weights_and_biases = args[-2 * num_gemms :]
weights, biases = weights_and_biases[:num_gemms], weights_and_biases[num_gemms:]
quantized_inputs = self.input_quantizer(inp)
quantized_weights = [self.weight_quantizer(weight) for weight in weights]

output = original_forward(
ctx,
quantized_inputs,
m_splits,
*args[: -2 * num_gemms],
*quantized_weights,
*biases,
)
return self.output_quantizer(output)

return [
(
te_grouped_linear._GroupedLinear,
"forward",
te_grouped_quantized_linear_fn,
),
]

def modelopt_post_restore(self, prefix: str = ""):
# GroupedMLP stores the weights as weight0, weight1, etc. To run post_restore in order to
# initialize the quantizer states, self.weight is used to extract shape, dtype etc. Assigning
# self.weight0 to self.weight to run the quantizer states initialization.
self.weight = self.weight0
super().modelopt_post_restore(prefix=prefix)
# Revert the weight to None after post_restore to avoid the weight being None during forward pass.
self.weight = None

def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
# _sharded_state_dict_grouped adds _extra_state{gemm_idx} for gemm_idx:[1, num_gemms] in
# sharded_state_dict which is same as _extra_state. The _extra_state{gemm_idx} is used for
# TE Fp8 checkpoint, we need to remove the _extra_state{gemm_idx} for gemm_idx:[1, num_gemms]
# for modelopt checkpoint restore
filtered_state_dict = {
k: v
for k, v in state_dict.items()
if not any(k.endswith(f"_extra_state{num}") for num in range(1, self.num_gemms))
}
return super()._load_from_state_dict(filtered_state_dict, prefix, *args, **kwargs)

def _process_quantizer_amax(self, k, v, quantizer_state_dict):
if v.ndim == 4:
quantizer_state_dict[k] = v.squeeze(1).squeeze(-1)
else:
quantizer_state_dict[k] = v.view(-1, 1) if v.numel() > 1 else v.view(-1)
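To illustrate the amax reshaping above with concrete shapes (plain `torch`, no quantizer state involved): 4-D checkpointed values have their singleton dims 1 and -1 squeezed, multi-element values become a column vector, and single-element values stay 1-D.

```python
import torch

v4 = torch.zeros(8, 1, 64, 1)             # e.g. grouped/blocked amax stored with singleton dims
print(v4.squeeze(1).squeeze(-1).shape)     # torch.Size([8, 64])

v = torch.zeros(128)                       # per-channel amax
print(v.view(-1, 1).shape)                 # torch.Size([128, 1])

s = torch.zeros(1)                         # per-tensor amax
print(s.view(-1).shape)                    # torch.Size([1])
```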


@QuantModuleRegistry.register(
{megatron_te.TEColumnParallelGroupedLinear: "megatron_TEColumnParallelGroupedLinear"}
)
class _QuantTEGroupedColumnParallelLinear(_QuantTEGroupedLinear, _MegatronColumnParallelLinear):
_is_column_parallel = True


@QuantModuleRegistry.register(
{megatron_te.TERowParallelGroupedLinear: "megatron_TERowParallelGroupedLinear"}
)
class _QuantTEGroupedRowParallelLinear(_QuantTEGroupedLinear, _MegatronRowParallelLinear):
_is_row_parallel = True
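The grouped-linear forward patch above depends on how TE packs per-GEMM parameters: with `num_gemms = len(m_splits)`, the final `2 * num_gemms` positional arguments are the weights followed by the biases, and everything before them is passed through untouched. A toy, dependency-free sketch of that slicing (names are illustrative):

```python
def split_grouped_args(m_splits, *args):
    """Mimic how the patched forward separates weights and biases from the tail of *args."""
    num_gemms = len(m_splits)
    weights_and_biases = args[-2 * num_gemms :]
    weights, biases = weights_and_biases[:num_gemms], weights_and_biases[num_gemms:]
    passthrough = args[: -2 * num_gemms]
    return passthrough, weights, biases


# Two GEMMs: two passthrough args, then w0, w1, b0, b1.
passthrough, weights, biases = split_grouped_args([16, 32], "ctx_arg", True, "w0", "w1", "b0", "b1")
assert passthrough == ("ctx_arg", True)
assert weights == ("w0", "w1") and biases == ("b0", "b1")
```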
17 changes: 16 additions & 1 deletion modelopt/torch/utils/distributed.py
@@ -241,13 +241,28 @@ def __init__(
self,
data_parallel_group: torch.distributed.ProcessGroup | int | None = None,
tensor_parallel_group: torch.distributed.ProcessGroup | int | None = -1,
expert_model_parallel_group: torch.distributed.ProcessGroup | int | None = -1,
expert_tensor_parallel_group: torch.distributed.ProcessGroup | int | None = None,
):
"""Initialize the parallel state."""
self.data_parallel_group = DistributedProcessGroup(data_parallel_group)
self.tensor_parallel_group = DistributedProcessGroup(tensor_parallel_group)
self.expert_model_parallel_group = DistributedProcessGroup(expert_model_parallel_group)
self.expert_tensor_parallel_group = None
if expert_tensor_parallel_group is not None:
self.expert_tensor_parallel_group = DistributedProcessGroup(
expert_tensor_parallel_group
)

def __repr__(self) -> str:
return f"data_parallel_group: {self.data_parallel_group}, tensor_parallel_group: {self.tensor_parallel_group}"
parallel_groups = (
f"data_parallel_group: {self.data_parallel_group}, "
f"tensor_parallel_group: {self.tensor_parallel_group}, "
f"expert_model_parallel_group: {self.expert_model_parallel_group}"
)
if self.expert_tensor_parallel_group:
parallel_groups += f"expert_tensor_parallel_group: {self.expert_tensor_parallel_group}"
return parallel_groups
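A hedged usage sketch of the extended `ParallelState` from this diff: the two new arguments slot in after the tensor-parallel group, and the expert tensor-parallel group is optional (the Megatron getters shown in the comments assume an initialized Megatron parallel state):

```python
from modelopt.torch.utils.distributed import ParallelState

# Defaults: no data-parallel group, sentinel (-1) tensor/expert-model-parallel groups,
# and no expert-tensor-parallel group; repr() now also lists the expert groups.
state = ParallelState()
print(state)

# In a Megatron setting the groups would typically come from megatron.core.parallel_state,
# e.g. (illustrative, requires initialized model parallelism):
# from megatron.core import parallel_state as mcore_parallel
# state = ParallelState(
#     mcore_parallel.get_data_parallel_group(with_context_parallel=True),
#     mcore_parallel.get_tensor_model_parallel_group(),
#     mcore_parallel.get_expert_model_parallel_group(),
#     mcore_parallel.get_expert_tensor_parallel_group(),
# )
```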


def get_group(ranks: list[int]):