 """Megatron-Core specific PEFT/LoRA plugins."""
 
-import math
-from collections.abc import Callable
-
 import torch
 import torch.nn as nn
-import torch.nn.init as init
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
@@ -35,7 +31,7 @@
 
 from ...config import PEFTAttributeConfig
 from ...custom import CUSTOM_MODEL_PLUGINS
-from ..layer import LoRAModule, LoRAModuleRegistry
+from ..layer import LoRAModule, LoRAModuleRegistry, get_init_methods
 
 DEFAULT_LORA_RANK = 64
 DEFAULT_SCALE = 1.0
@@ -73,18 +69,6 @@ class _MegatronParallelLoRABase(LoRAModule):
     LoRA implementations, reducing code duplication.
     """
 
-    def _get_init_methods(self, lora_a_init, lora_b_init) -> tuple[Callable, Callable]:
-        """Get initialization methods for LoRA A and B matrices.
-
-        Returns:
-            Tuple of (lora_a_init, lora_b_init) initialization functions
-        """
-        if lora_a_init is None:
-            lora_a_init = lambda weight: init.kaiming_uniform_(weight, a=math.sqrt(5))  # noqa: E731  # LoRA A: Kaiming uniform
-        if lora_b_init is None:
-            lora_b_init = lambda weight: init.zeros_(weight)  # noqa: E731  # LoRA B: zeros
-        return lora_a_init, lora_b_init
-
     def _register_adapter_with_device(
         self,
         adapter_name: str,
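
For context, the defaults that the removed helper encoded follow the standard LoRA initialization convention: Kaiming-uniform for LoRA A and zeros for LoRA B, so the low-rank product starts at zero and a freshly added adapter leaves the wrapped layer's output unchanged. The sketch below only illustrates that convention (toy sizes, plain `nn.Linear`, no tensor parallelism) and is not part of this diff.

```python
# Illustration only -- toy sizes, plain nn.Linear, no tensor parallelism.
import math

import torch
import torch.nn as nn
import torch.nn.init as init

rank, d_in, d_out = 8, 32, 64
lora_a = nn.Linear(d_in, rank, bias=False)
lora_b = nn.Linear(rank, d_out, bias=False)

with torch.no_grad():
    init.kaiming_uniform_(lora_a.weight, a=math.sqrt(5))  # LoRA A: Kaiming uniform
    init.zeros_(lora_b.weight)                            # LoRA B: zeros

x = torch.randn(4, d_in)
# B(A(x)) == 0 at initialization, so the adapter starts as an exact no-op.
assert torch.equal(lora_b(lora_a(x)), torch.zeros(4, d_out))
```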
@@ -146,21 +130,23 @@ def update_layer_lora(
             adapter_name: Name for the new adapter
             rank: Rank of the LoRA decomposition
         """
+        lora_a_init = get_init_methods(attr_config.lora_a_init)
+        lora_b_init = get_init_methods(attr_config.lora_b_init)
         lora_a = nn.Linear(
             in_features=self.input_size,
             out_features=attr_config.rank,
             bias=False,
         )
         with torch.no_grad():
-            attr_config.lora_b_init(lora_a.weight)  # type: ignore[misc]
+            lora_a_init(lora_a.weight)
 
         lora_b = ColumnParallelLinear(
             attr_config.rank,
             self.output_size,
             config=self.config,
             bias=False,
             gather_output=False,
-            init_method=attr_config.lora_a_init,
+            init_method=lora_b_init,
         )
 
         self._register_adapter_with_device(
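
The forward path that consumes these registered adapters lives in `LoRAModule` (in `..layer`) and is not shown in this diff. A hedged, framework-agnostic sketch of the usual composition follows; the real implementation also has to deal with Megatron's `(output, bias)` return tuples.

```python
# Hypothetical sketch of the usual LoRA composition; the actual LoRAModule.forward
# in ..layer is not part of this diff and may differ in detail.
def apply_lora(base_output, x, lora_a, lora_b, scale, enable=True):
    """Add the low-rank update scale * B(A(x)) on top of the frozen base output."""
    if not enable:
        return base_output
    return base_output + scale * lora_b(lora_a(x))
```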
@@ -218,14 +204,16 @@ def update_layer_lora(
             adapter_name: Name for the new adapter
             rank: Rank of the LoRA decomposition
         """
+        lora_a_init = get_init_methods(attr_config.lora_a_init)
+        lora_b_init = get_init_methods(attr_config.lora_b_init)
         lora_a = RowParallelLinear(
             self.input_size,
             attr_config.rank,
             config=self.config,
             input_is_parallel=True,
             skip_bias_add=True,
             bias=False,
-            init_method=attr_config.lora_a_init,
+            init_method=lora_a_init,
         )
 
         lora_b = nn.Linear(
@@ -234,7 +222,7 @@ def update_layer_lora(
             bias=False,
         )
         with torch.no_grad():
-            attr_config.lora_b_init(lora_b.weight)  # type: ignore[misc]
+            lora_b_init(lora_b.weight)
 
         self._register_adapter_with_device(
             adapter_name, lora_a, lora_b, attr_config.rank, attr_config.scale, attr_config.enable
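
One design point worth noting: each wrapper shards only one side of the adapter, mirroring the wrapped layer. In the column-parallel case A is a plain `nn.Linear` and B is a `ColumnParallelLinear` split along its output dimension; in the row-parallel case A is a `RowParallelLinear` split along its input dimension and B is a plain `nn.Linear`. The sketch below spells out the per-rank weight shapes this implies; the function names are illustrative and not part of the diff.

```python
# Illustrative only: per-rank weight shapes implied by the two wrappers above.
# nn.Linear stores weights as (out_features, in_features); tp = tensor-parallel size.
def column_parallel_lora_shapes(in_features, out_features, rank, tp):
    return {
        "lora_a": (rank, in_features),          # replicated on every rank
        "lora_b": (out_features // tp, rank),   # output dim sharded, like the base layer
    }


def row_parallel_lora_shapes(in_features, out_features, rank, tp):
    return {
        "lora_a": (rank, in_features // tp),    # input dim sharded, like the base layer
        "lora_b": (out_features, rank),         # replicated on every rank
    }
```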