Commit 5f7158f

fix import and export moe router dtype
Signed-off-by: jenchen13 <[email protected]>
1 parent 8393149 commit 5f7158f

2 files changed (+12 -10 lines)

modelopt/torch/export/plugins/megatron_importer.py

Lines changed: 6 additions & 2 deletions
@@ -77,11 +77,14 @@ def __init__(
         dequantize: bool = True,
         trust_remote_code: bool = True,
         verbose: bool = False,
+        moe_router_dtype: torch.dtype | None = None,
     ):
         """Create a GPTModel importer instance."""
         self._hf_config = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, trust_remote_code=trust_remote_code
         )
+        self.moe_router_dtype = moe_router_dtype
+
         pretrained_model_path = Path(pretrained_model_name_or_path)
         if not pretrained_model_path.is_dir():
             if workspace_dir is None:
@@ -118,7 +121,7 @@ def _custom_mapping_to_lambda(mapping):
             func = method_map[mapping.func_name]
             prefix = mapping.target_name_or_prefix
             func_kwargs = mapping.func_kwargs
-            return lambda m, *args: func(m, prefix.format(*args), **func_kwargs)
+            return lambda m, *args, **kwargs: func(m, prefix.format(*args), **{**func_kwargs, **kwargs})
 
         for arch, mappings in all_mcore_hf_import_mapping.items():
             all_rules[arch] = {
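
Note on the lambda change above: the rebuilt rule now merges per-call keyword arguments over the mapping's static func_kwargs, which is what lets the router rule below pass dtype= at call time. A minimal standalone sketch of the pattern (the names here are illustrative, not the module's API):

import torch

def make_rule(func, prefix, func_kwargs):
    # Per-call **kwargs are merged over the statically configured func_kwargs,
    # so the call-time keys win.
    return lambda m, *args, **kwargs: func(m, prefix.format(*args), **{**func_kwargs, **kwargs})

def remap(module, name, dtype=None):
    print(f"remap {name} with dtype={dtype}")

rule = make_rule(remap, "model.layers.{}.mlp.gate", {"dtype": None})
rule(None, 0)                       # remap model.layers.0.mlp.gate with dtype=None
rule(None, 3, dtype=torch.float32)  # call-time dtype overrides the static default
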
@@ -140,6 +143,7 @@ def _name_remapping(
         prefix,
         mapping={},
         parallel_config: ParallelConfig | None = None,
+        dtype: torch.dtype | None = None,
     ):
         if isinstance(module, torch.Tensor):
             tensor = self._get_safetensor(prefix, parallel_config=parallel_config)
@@ -523,7 +527,7 @@ def _import_state_dict(self):
                 if not isinstance(layer.mlp, IdentityOp):
                     if "MoE" in str(type(layer.mlp)):
                         layer_pbar.set_description("Importing MoE")
-                        self.rules["router"](layer.mlp.router, layer_id)
+                        self.rules["router"](layer.mlp.router, layer_id, dtype=self.moe_router_dtype)
                         if (
                             hasattr(layer.mlp, "shared_experts")
                             and layer.mlp.shared_experts is not None
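
The hunks above thread self.moe_router_dtype into the "router" rule and add a dtype parameter to _name_remapping; the code that consumes the dtype is outside this diff. A hedged sketch of how such a parameter is typically applied when installing an imported tensor (the helper name and logic below are assumptions, not modelopt's actual implementation):

import torch

def install_imported_weight(param: torch.nn.Parameter,
                            tensor: torch.Tensor,
                            dtype: torch.dtype | None = None) -> None:
    # Assumed behavior: cast to the requested dtype (e.g. torch.float32 for the
    # MoE router) before installing; otherwise keep the checkpoint's dtype.
    if dtype is not None:
        tensor = tensor.to(dtype)
    param.data = tensor

# Keep the router weights in fp32 while the rest of the model stays bf16.
router_weight = torch.nn.Parameter(torch.empty(8, 16, dtype=torch.float32))
imported = torch.randn(8, 16, dtype=torch.bfloat16)
install_imported_weight(router_weight, imported, dtype=torch.float32)
print(router_weight.dtype)  # torch.float32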

modelopt/torch/export/unified_export_megatron.py

Lines changed: 6 additions & 8 deletions
@@ -35,7 +35,6 @@
 
 from modelopt import __version__
 from modelopt.torch.utils import import_plugin
-from megatron.core import ModelParallelConfig
 
 from .model_config import (
     KV_CACHE_FP8,
@@ -187,7 +186,7 @@ def __init__(
         export_extra_modules: bool = False,
         dtype=torch.bfloat16,
         trust_remote_code: bool = True,
-        config: ModelParallelConfig | None = None,
+        moe_router_dtype: torch.dtype | None = None,
     ):
         """Create a GPTModel exporter instance."""
         if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)):
@@ -198,9 +197,7 @@ def __init__(
         self._hf_config = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, trust_remote_code=trust_remote_code
        )
-        if config.moe_router_dtype:
-            if config.moe_router_dtype == "fp32":
-                self.moe_router_dtype = torch.float32
+        self.moe_router_dtype = moe_router_dtype
         # If multimodal, extra the text_config
         self._hf_text_config = getattr(self._hf_config, "text_config", self._hf_config)
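
Where the exporter previously read a string field from a Megatron ModelParallelConfig and only handled "fp32", it now takes a torch.dtype directly. A caller that still has the Megatron-style string can translate it first; a hypothetical helper for that conversion (the mapping is an assumption; the removed branch handled only "fp32"):

import torch

# Hypothetical translation from Megatron-style moe_router_dtype strings to the
# torch.dtype now expected by GPTModelExporter / GPTModelImporter.
_ROUTER_DTYPE_MAP = {
    "fp32": torch.float32,  # the only value the removed code handled
    "fp64": torch.float64,  # assumption: extend if other strings are used
}

def resolve_moe_router_dtype(name: str | None) -> torch.dtype | None:
    return _ROUTER_DTYPE_MAP[name] if name else None

print(resolve_moe_router_dtype("fp32"))  # torch.float32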

@@ -1147,7 +1144,7 @@ def export_mcore_gpt_to_hf(
     export_extra_modules: bool = False,
     dtype: torch.dtype = torch.float16,
     export_dir: Path | str = tempfile.gettempdir(),
-    config: ModelParallelConfig = None,
+    moe_router_dtype: torch.dtype | None = None,
 ):
     """Export Megatron Core GPTModel to unified checkpoint and save to export_dir.
 
@@ -1163,7 +1160,7 @@ def export_mcore_gpt_to_hf(
         export_dir: The target export path.
     """
     exporter = GPTModelExporter(
-        model, pretrained_model_name_or_path, export_extra_modules=export_extra_modules, dtype=dtype, config=config
+        model, pretrained_model_name_or_path, export_extra_modules=export_extra_modules, dtype=dtype, moe_router_dtype=moe_router_dtype
     )
     exporter.save_pretrained(export_dir, pretrained_model_name_or_path)
 
@@ -1173,6 +1170,7 @@ def import_mcore_gpt_from_hf(
     pretrained_model_path: str,
     workspace_dir: str | None = None,
     dtype: torch.dtype = torch.float16,
+    moe_router_dtype: torch.dtype | None = None,
 ):
     """Import GPTModel state_dict from supported HuggingFace pretrained model path.
 
@@ -1183,6 +1181,6 @@ def import_mcore_gpt_from_hf(
         dtype: The weights data type to import.
     """
     importer = GPTModelImporter(
-        model, pretrained_model_path, workspace_dir=workspace_dir, dtype=dtype,
+        model, pretrained_model_path, workspace_dir=workspace_dir, dtype=dtype, moe_router_dtype=moe_router_dtype
     )
     importer._import_state_dict()
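
With both entry points now accepting moe_router_dtype directly, keeping the MoE router in fp32 during an HF import/export round trip looks roughly like the sketch below. The model object, checkpoint path, and any leading positional arguments not visible in these hunks are placeholders or assumptions; the keyword names match the signatures above.

import torch
from modelopt.torch.export.unified_export_megatron import (
    export_mcore_gpt_to_hf,
    import_mcore_gpt_from_hf,
)

model = build_mcore_gpt_model()    # placeholder: an existing Megatron Core GPTModel
hf_path = "path/to/hf_checkpoint"  # placeholder HF model name or path

# Import HF weights, loading the MoE router weights in fp32.
import_mcore_gpt_from_hf(
    model,
    hf_path,
    dtype=torch.bfloat16,
    moe_router_dtype=torch.float32,
)

# Export back to a unified HF-style checkpoint, preserving the fp32 router.
export_mcore_gpt_to_hf(
    model,
    hf_path,
    dtype=torch.bfloat16,
    export_dir="exported_model",
    moe_router_dtype=torch.float32,
)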
