+ import copy
from functools import partial

import pytest
import torch
+ import torch.nn.init as init
from _test_utils.import_helper import skip_if_no_megatron
from _test_utils.torch_dist.dist_utils import get_device_counts, spawn_multiprocess_job
from _test_utils.torch_dist.plugins.megatron_common import (

    "*": {
        "rank": 32,
        "scale": 1,
-       "lora_a_init": "kaiming_init",
-       "lora_b_init": "zero_init",
        "enable": True,
    },
    "*output_layer*": {"enable": False},

    "*": {
        "rank": 128,
        "scale": 1,
-       "lora_a_init": "kaiming_init",
-       "lora_b_init": "zero_init",
        "enable": True,
    },
    "*output_layer*": {"enable": False},

    "*": {
        "rank": 32,
        "scale": 1,
-       "lora_a_init": "kaiming_init",
-       "lora_b_init": "kaiming_init",
+       "lora_a_init": init.kaiming_uniform_,
+       "lora_b_init": init.kaiming_uniform_,
        "enable": True,
    },
    "*output_layer*": {"enable": False},

    "*": {
        "rank": 128,
        "scale": 1,
-       "lora_a_init": "kaiming_init",
-       "lora_b_init": "kaiming_init",
+       "lora_a_init": init.kaiming_uniform_,
+       "lora_b_init": init.kaiming_uniform_,
        "enable": True,
    },
    "*output_layer*": {"enable": False},

    "*": {
        "rank": 8,
        "scale": 1,
-       "lora_a_init": "kaiming_init",
-       "lora_b_init": "kaiming_init",
+       "lora_a_init": init.kaiming_uniform_,
+       "lora_b_init": init.kaiming_uniform_,
        "enable": True,
    },
    "*output_layer*": {"enable": False},

    "*self_attention*": {
        "rank": 16,
        "scale": 1,
-       "lora_a_init": "kaiming_init",
-       "lora_b_init": "zero_init",
        "enable": True,
    },
    "*output_layer*": {"enable": False},

@@ -449,14 +445,15 @@ def test_adapter_gradient_flow_freeze_base_model(device_count, lora_config, tmp_

def _test_adapter_gradient_flow_freeze_lora_model(lora_config, tmp_path, rank, size):
    hidden_size = 512
-   lora_config["freeze_lora_weights"] = True
-   lora_config["freeze_base_model"] = False
+   local_cfg = copy.deepcopy(lora_config)
+   local_cfg["freeze_lora_weights"] = True
+   local_cfg["freeze_base_model"] = False

    initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
    model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
    prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()

-   mtpf.update_model(model, lora_config)
+   mtpf.update_model(model, local_cfg)
    model.train()

    # Use a simple forward pass instead for grad check
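
The deepcopy above matters because lora_config is a shared parametrized dict: mutating it inside one spawned test would leak the freeze_* flags into every later case that reuses the same config. A minimal standalone sketch of that pitfall (not taken from the test file; SHARED_CFG and run_case are made-up names):

import copy

SHARED_CFG = {"rank": 8, "freeze_base_model": True}  # stands in for a module-level config

def run_case(cfg):
    local_cfg = copy.deepcopy(cfg)  # isolate this case from the shared dict
    local_cfg["freeze_base_model"] = False
    return local_cfg

assert run_case(SHARED_CFG)["freeze_base_model"] is False
assert SHARED_CFG["freeze_base_model"] is True  # the shared config is untouched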

@@ -569,7 +566,7 @@ def forward_func(mod):
        assert hasattr(module.weight_quantizer, "amax")
        assert getattr(module.input_quantizer, "amax") is not None
        assert getattr(module.weight_quantizer, "amax") is not None
-       # Check if the lora have teh quantizer, they should not have them.
+       # Check that the LoRA adapters do not have quantizers attached.
        for adapter_name in module._lora_adapters:
            lora_a = module._lora_adapters[adapter_name]["lora_a"]
            lora_b = module._lora_adapters[adapter_name]["lora_b"]
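
This hunk (and the three identical ones below) only reword the comment; the assertions that follow in the file are not shown in the diff. A hedged guess at the shape of that check, reusing the quantizer attribute names from the asserts above (the exact assertions are an assumption, not copied from the file):

# Assumed continuation: the adapter branches should carry no quantizers.
assert not hasattr(lora_a, "input_quantizer") and not hasattr(lora_a, "weight_quantizer")
assert not hasattr(lora_b, "input_quantizer") and not hasattr(lora_b, "weight_quantizer")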

@@ -621,7 +618,7 @@ def forward_func(mod):
        assert hasattr(module.weight_quantizer, "amax")
        assert getattr(module.input_quantizer, "amax") is not None
        assert getattr(module.weight_quantizer, "amax") is not None
-       # Check if the lora have teh quantizer, they should not have them.
+       # Check that the LoRA adapters do not have quantizers attached.
        for adapter_name in module._lora_adapters:
            lora_a = module._lora_adapters[adapter_name]["lora_a"]
            lora_b = module._lora_adapters[adapter_name]["lora_b"]

@@ -701,7 +698,7 @@ def forward_func(mod):
        assert hasattr(module.weight_quantizer, "amax")
        assert getattr(module.input_quantizer, "amax") is not None
        assert getattr(module.weight_quantizer, "amax") is not None
-       # Check if the lora have teh quantizer, they should not have them.
+       # Check that the LoRA adapters do not have quantizers attached.
        for adapter_name in module._lora_adapters:
            lora_a = module._lora_adapters[adapter_name]["lora_a"]
            lora_b = module._lora_adapters[adapter_name]["lora_b"]

@@ -765,7 +762,7 @@ def forward_func(mod):
        assert hasattr(module.weight_quantizer, "amax")
        assert getattr(module.input_quantizer, "amax") is not None
        assert getattr(module.weight_quantizer, "amax") is not None
-       # Check if the lora have teh quantizer, they should not have them.
+       # Check that the LoRA adapters do not have quantizers attached.
        for adapter_name in module._lora_adapters:
            lora_a = module._lora_adapters[adapter_name]["lora_a"]
            lora_b = module._lora_adapters[adapter_name]["lora_b"]

@@ -784,7 +781,7 @@ def forward_func(mod):
        DEFAULT_LORA_CFG_RANDOM_INIT_TEST,
    ],
)
-def test_mcore_lora_quantize_save_restore(device_count, lora_config, tmp_path):
+def test_mcore_lora_then_quantize_save_restore(device_count, lora_config, tmp_path):
    spawn_multiprocess_job(
        size=device_count,
        job=partial(_test_mcore_lora_then_quantize_save_restore, lora_config, str(tmp_path)),