 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
+import re
 from warnings import warn
 
 import torch
@@ -57,7 +58,6 @@
     save_sharded_modelopt_state,
 )
 from modelopt.torch.utils import to_empty_if_meta_device
-from modelopt.torch.utils.distributed import DistributedProcessGroup
 
 try:
     from megatron.core.extensions.transformer_engine import TENorm
@@ -143,7 +143,7 @@ def get_mcore_gpt_model(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     expert_model_parallel_size: int = 1,
-    expert_tensor_parallel_size: int = 1,
+    expert_tensor_parallel_size: int | None = None,
     initialize_megatron: bool = False,
     *,
     num_layers: int = 2,
@@ -497,61 +497,6 @@ def convert_maybe_fp8(v):
     )
 
 
-def compare_model_outputs(grouped_model, non_grouped_model, forward_fn, tolerance=1e-6):
-    """Compare outputs of grouped and non-grouped models."""
-    # Set both models to eval mode
-    grouped_model.eval()
-    non_grouped_model.eval()
-
-    with torch.no_grad():
-        # Get outputs from both models
-        grouped_output = forward_fn(grouped_model)
-        non_grouped_output = forward_fn(non_grouped_model)
-
-        # Compare outputs
-        if isinstance(grouped_output, tuple):
-            grouped_output = grouped_output[0]
-        if isinstance(non_grouped_output, tuple):
-            non_grouped_output = non_grouped_output[0]
-
-        output_close = torch.allclose(
-            grouped_output, non_grouped_output, atol=tolerance, rtol=tolerance
-        )
-        return output_close
-
-
-def sync_amax(model):
-    amax_dict = {
-        "linear_fc1.input_quantizer": {},
-        "linear_fc1.weight_quantizer": {},
-        "linear_fc2.input_quantizer": {},
-        "linear_fc2.weight_quantizer": {},
-    }
-    for name, module in model.named_modules():
-        if not isinstance(module, mtq.nn.TensorQuantizer):
-            continue
-        if not hasattr(module, "_amax"):
-            continue
-        if "local_experts" not in name:
-            continue
-        expert_name, local_expert_name = name.split("local_experts")
-        for key in amax_dict:
-            if key in local_expert_name:
-                amax_dict[key][expert_name] = max(amax_dict[key].get(expert_name, 0), module.amax)
-
-    for name, module in model.named_modules():
-        if not isinstance(module, mtq.nn.TensorQuantizer):
-            continue
-        if not hasattr(module, "_amax"):
-            continue
-        if "local_experts" not in name:
-            continue
-        expert_name, local_expert_name = name.split("local_experts")
-        for key in amax_dict:
-            if key in local_expert_name:
-                module.amax = amax_dict[key][expert_name]
-
-
 def copy_weights_from_grouped_to_non_grouped(grouped_model, non_grouped_model):
     """Copy weights from grouped MoE model to non-grouped MoE model."""
     grouped_state = grouped_model.state_dict()
@@ -625,8 +570,6 @@ def compare_amax_sync_across_expert_parallel(model):
         # Create quantizer type key by normalizing the name
         if "local_experts" in name:
             # Non-grouped MoE: replace expert index with wildcard
-            import re
-
             quantizer_type = re.sub(r"local_experts\.\d+", "local_experts.*", name)
         else:
             # Grouped MoE: use the name as-is since experts are grouped
@@ -641,50 +584,7 @@ def compare_amax_sync_across_expert_parallel(model):
         if len(rank_values) > 1:  # Only check if we have multiple ranks
             values = list(rank_values.values())
             max_diff = max(values) - min(values)
-
             if max_diff > 1e-6:  # Allow for small floating point differences
-                return False
+                return False, quantizer_type, rank_values
 
-    return True
-
-
-def disable_distributed_parallel_sync(model, expert_parallel_type: str = "tensor"):
-    """Disable distributed parallel synchronization groups."""
-    module_parallel_groups = {}
-
-    for name, module in model.named_modules():
-        if isinstance(module, mtq.nn.QuantModule):
-            # Store original groups
-            module_parallel_groups[name] = {
-                "data_parallel_group": module.parallel_state.data_parallel_group,
-                "expert_tensor_parallel_group": module.parallel_state.expert_tensor_parallel_group,
-                "expert_model_parallel_group": module.parallel_state.expert_model_parallel_group,
-            }
-
-            # Disable groups
-            module.parallel_state.data_parallel_group = DistributedProcessGroup(-1)
-
-            if expert_parallel_type in ["tensor", "both"]:
-                module.parallel_state.expert_tensor_parallel_group = DistributedProcessGroup(-1)
-            if expert_parallel_type in ["model", "both"]:
-                module.parallel_state.expert_model_parallel_group = DistributedProcessGroup(-1)
-
-    return module_parallel_groups
-
-
-def enable_distributed_parallel_sync(
-    model, module_parallel_groups, expert_parallel_type: str = "tensor"
-):
-    """Re-enable distributed parallel synchronization groups."""
-    for name, module in model.named_modules():
-        if isinstance(module, mtq.nn.QuantModule) and name in module_parallel_groups:
-            groups = module_parallel_groups[name]
-
-            if expert_parallel_type in ["tensor", "both"]:
-                module.parallel_state.expert_tensor_parallel_group = groups[
-                    "expert_tensor_parallel_group"
-                ]
-            if expert_parallel_type in ["model", "both"]:
-                module.parallel_state.expert_model_parallel_group = groups[
-                    "expert_model_parallel_group"
-                ]
+    return True, None, None
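
With this change, compare_amax_sync_across_expert_parallel reports which quantizer is out of sync instead of returning a bare bool. A minimal usage sketch of the new return value; the wrapper name _assert_expert_amax_synced and its assertion message are assumptions, not part of this diff:

def _assert_expert_amax_synced(model):
    # Hypothetical test helper; only the (synced, quantizer_type, rank_values)
    # tuple shape comes from the diff above.
    synced, quantizer_type, rank_values = compare_amax_sync_across_expert_parallel(model)
    assert synced, f"amax mismatch for {quantizer_type} across expert-parallel ranks: {rank_values}"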