@@ -109,7 +109,7 @@ def get_kv_cache_scaling_factor(kv_module: nn.Module) -> torch.Tensor:
109109
110110def get_quantized_state (
111111 module : torch .nn .Module ,
112- dtype : torch .dtype = torch .float16 ,
112+ dtype : torch .dtype = torch .bfloat16 ,
113113) -> tuple [dict [str , torch .Tensor ], str , int ]:
114114 """Return a state_dict, quantization format, and block_size of the module.
115115
@@ -197,7 +197,12 @@ def __init__(
197197 self ._hf_config = transformers .AutoConfig .from_pretrained (
198198 pretrained_model_name_or_path , trust_remote_code = trust_remote_code
199199 )
200- self .moe_router_dtype = moe_router_dtype
200+ self .moe_router_dtype = None
201+ if moe_router_dtype == "fp32" :
202+ self .moe_router_dtype = torch .float32
203+ elif moe_router_dtype == "fp64" :
204+ self .moe_router_dtype = torch .float64
205+
201206 # If multimodal, extract the text_config
202207 self ._hf_text_config = getattr (self ._hf_config , "text_config" , self ._hf_config )
203208
@@ -1142,7 +1147,7 @@ def export_mcore_gpt_to_hf(
11421147 model : torch .nn .Module ,
11431148 pretrained_model_name_or_path : str | os .PathLike | None = None ,
11441149 export_extra_modules : bool = False ,
1145- dtype : torch .dtype = torch .float16 ,
1150+ dtype : torch .dtype = torch .bfloat16 ,
11461151 export_dir : Path | str = tempfile .gettempdir (),
11471152 moe_router_dtype : torch .dtype | None = None ,
11481153):
@@ -1169,7 +1174,7 @@ def import_mcore_gpt_from_hf(
11691174 model : torch .nn .Module ,
11701175 pretrained_model_path : str ,
11711176 workspace_dir : str | None = None ,
1172- dtype : torch .dtype = torch .float16 ,
1177+ dtype : torch .dtype = torch .bfloat16 ,
11731178 moe_router_dtype : torch .dtype | None = None ,
11741179):
11751180 """Import GPTModel state_dict from supported HuggingFace pretrained model path.
0 commit comments