fixing CI and other test cases

NikhilNayak-debug · NikhilNayak-debug · commit 89c3113924b3 · 2025-10-20T20:14:17.000Z
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -145,8 +145,6 @@
       title: Model merge
     - local: package_reference/helpers
       title: Helpers
-    - local: package_reference/osf_utils
-      title: OSF utilities
     - local: package_reference/hotswap
       title: Hotswapping adapters
     - local: package_reference/functional
diff --git a/src/peft/tuners/osf/config.py b/src/peft/tuners/osf/config.py
@@ -29,8 +29,11 @@ class OSFConfig(PeftConfig):
         default=None,
         metadata={
             "help": (
-                "Preserved SVD rank (frozen). Trainable rank equals min(weight.shape) - effective_rank. "
-                "If None, defaults to 50% of the smaller weight dimension."
+                'Preserved SVD rank ("high" subspace). The top-`effective_rank` singular directions are frozen '
+                "and retained across tasks; the remaining dimensions form the trainable low-rank subspace. "
+                "Trainable rank equals min(weight.shape) - effective_rank. If None, defaults to 50% of the smaller "
+                "weight dimension per target module. Floats in (0, 1] are interpreted as a fraction of the smaller "
+                "matrix dimension per target."
             )
         },
     )
@@ -48,5 +51,30 @@ class OSFConfig(PeftConfig):
         },
     )
 
+    # Additional optional fields for compatibility with generic test harnesses
+    init_weights: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If provided, toggles custom weight initialization behavior for certain methods. OSF ignores this "
+                "flag but accepts it for config compatibility."
+            )
+        },
+    )
+    modules_to_save: Optional[list[str]] = field(
+        default=None,
+        metadata={"help": "Optional list of module names to save separately (ignored by OSF but accepted)."},
+    )
+    target_svd_config: Optional[dict[str, int]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Optional per-parameter SVD target rank mapping (e.g., {'lin0.weight': 8}). OSF currently ignores "
+                "this field but accepts it for forward compatibility."
+            )
+        },
+    )
+
     def __post_init__(self):
+        super().__post_init__()
         self.peft_type = PeftType.OSF
diff --git a/src/peft/tuners/osf/layer.py b/src/peft/tuners/osf/layer.py
@@ -123,30 +123,23 @@ def _attach_hooks(self, adapter_name: str):
             return
 
         svd_module = self.osf_svd_params[adapter_name]
-        svd_dict = {
-            "U_high": self._osf_U_high[adapter_name],
-            "S_high": self._osf_S_high[adapter_name],
-            "V_high": self._osf_V_high[adapter_name],
-            "U_low": svd_module["U_low"],
-            "S_low": svd_module["S_low"],
-            "V_low": svd_module["V_low"],
-        }
 
-        def hook(grad, name: str):
+        def hook(grad, name: str, adapter: str, layer: OSFLayer):
             # Project gradient to be orthogonal to high-rank subspace for U_low/V_low
+            # Access buffers dynamically to ensure they're on the correct device
             if name == "U_low":
-                U_high = svd_dict["U_high"]
+                U_high = layer._osf_U_high[adapter]
                 proj = U_high @ (U_high.transpose(0, 1) @ grad)
                 return grad - proj
             elif name == "V_low":
-                V_high = svd_dict["V_high"]
+                V_high = layer._osf_V_high[adapter]
                 proj = (grad @ V_high.transpose(0, 1)) @ V_high
                 return grad - proj
             return grad
 
         # Store hook handles for later cleanup
-        handle_u = svd_module["U_low"].register_hook(partial(hook, name="U_low"))
-        handle_v = svd_module["V_low"].register_hook(partial(hook, name="V_low"))
+        handle_u = svd_module["U_low"].register_hook(partial(hook, name="U_low", adapter=adapter_name, layer=self))
+        handle_v = svd_module["V_low"].register_hook(partial(hook, name="V_low", adapter=adapter_name, layer=self))
 
         self.hook_handles.extend([handle_u, handle_v])
 
@@ -249,8 +242,6 @@ def __init__(
 
     def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
         if self.disable_adapters:
-            if self.merged:
-                self.unmerge()
             result = self.base_layer(x, *args, **kwargs)
         elif self.merged:
             result = self.base_layer(x, *args, **kwargs)
@@ -263,8 +254,6 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
             active_adapter = self.active_adapters[0] if self.active_adapters else None
             if active_adapter and active_adapter in self.osf_svd_params:
                 weight = self._reconstruct_weight(active_adapter)
-                if weight.dtype != x.dtype:
-                    weight = weight.to(x.dtype)
                 result = F.linear(x, weight, bias)
             else:
                 result = self.base_layer(x, *args, **kwargs)
diff --git a/src/peft/tuners/osf/model.py b/src/peft/tuners/osf/model.py
@@ -2,6 +2,7 @@
 
 import re
 
+import torch
 import torch.nn as nn
 
 from peft.tuners.tuners_utils import BaseTuner
@@ -17,8 +18,22 @@ class OSFModel(BaseTuner):
     tuner_layer_cls = OSFLayer
     target_module_mapping = TRANSFORMERS_MODELS_TO_OSF_TARGET_MODULES_MAPPING
 
-    def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False):
-        super().__init__(model, config, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage)
+    def __init__(
+        self,
+        model,
+        config,
+        adapter_name,
+        low_cpu_mem_usage: bool = False,
+        state_dict: dict[str, torch.Tensor] | None = None,
+    ):
+        # Pass state_dict through for compatibility with BaseTuner
+        super().__init__(
+            model,
+            config,
+            adapter_name,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            state_dict=state_dict,
+        )
 
     def __getattr__(self, name: str):
         """Forward missing attributes to the wrapped base model.
@@ -33,6 +48,18 @@ def __getattr__(self, name: str):
                 raise
             return getattr(self.model, name)
 
+    def _prepare_adapter_config(self, peft_config, model_config):
+        # If target_modules is unspecified, try mapping; else fall back to all linear layers for custom models
+        if getattr(peft_config, "target_modules", None) is None:
+            model_type = model_config.get("model_type")
+            if model_type in self.target_module_mapping:
+                peft_config.target_modules = set(self.target_module_mapping[model_type])
+            else:
+                from peft.utils.constants import INCLUDE_LINEAR_LAYERS_SHORTHAND
+
+                peft_config.target_modules = INCLUDE_LINEAR_LAYERS_SHORTHAND
+        return peft_config
+
     def _create_and_replace(
         self,
         osf_config,
@@ -87,7 +114,8 @@ def _resolve_rank(value, min_dim: int) -> int:
 
     def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
         for n, p in model.named_parameters():
-            if self.prefix not in n and "svd_params" not in n and not n.endswith(("_U_low", "_S_low", "_V_low")):
+            # Only OSF adapter parameters (in osf_svd_params) should be trainable
+            if "osf_svd_params" not in n:
                 p.requires_grad = False
 
     def _cast_adapter_dtype(self, adapter_name: str, autocast_adapter_dtype: bool = True) -> None:
diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
@@ -1146,6 +1146,7 @@
     MissConfig: "miss_",
     TrainableTokensConfig: "trainable_tokens_",
     WaveFTConfig: "waveft_",
+    OSFConfig: "osf_",
 }
 
 
@@ -1829,9 +1830,7 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs):
         # check that none of this raises an error
         model(**X)
 
-        if model_id in ["Conv2dGroups", "Conv2dGroups2"]:
-            # this model does not support merging
-            return
+        _skip_if_merging_not_supported(model_id, config_cls)
 
         model.merge_adapter(safe_merge=False)
         model(**X)
@@ -1871,9 +1870,7 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs):
         # check that none of this raises an error
         model(**X)
 
-        if model_id in ["Conv2dGroups", "Conv2dGroups2"]:
-            # this model does not support merging
-            return
+        _skip_if_merging_not_supported(model_id, config_cls)
 
         model.merge_adapter(safe_merge=False)
         model(**X)
@@ -1912,9 +1909,7 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf
         # check that none of this raises an error
         model(**X)
 
-        if model_id in ["Conv2dGroups", "Conv2dGroups2"]:
-            # this model does not support merging
-            return
+        _skip_if_merging_not_supported(model_id, config_cls)
 
         model.merge_adapter(safe_merge=False)
         model(**X)
@@ -1953,9 +1948,7 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con
         # check that none of this raises an error
         model(**X)
 
-        if model_id in ["Conv2dGroups", "Conv2dGroups2"]:
-            # this model does not support merging
-            return
+        _skip_if_merging_not_supported(model_id, config_cls)
 
         model.merge_adapter(safe_merge=False)
         model(**X)
@@ -2032,7 +2025,7 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c
             lr = 0.1  # otherwise we get nan
         elif "mha" in model_id.lower():
             lr = 1e-3  # we get exploding gradients with MHA when learning rate is too high
-        elif issubclass(config_cls, VBLoRAConfig) or issubclass(config_cls, RandLoraConfig):
+        elif issubclass(config_cls, (VBLoRAConfig, RandLoraConfig, OSFConfig)):
             lr = 0.01  # otherwise we get nan
         optimizer = torch.optim.SGD(model.parameters(), lr=lr)
 
@@ -2083,7 +2076,11 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
             torch.nn.init.zeros_(model.vblora_vector_bank["default"])
         model.eval()
         outputs_before = model(**X)
-        assert torch.allclose(outputs_base, outputs_before)
+        # OSF uses SVD reconstruction which introduces small numerical differences
+        if issubclass(config_cls, OSFConfig):
+            assert torch.allclose(outputs_base, outputs_before, rtol=1e-4, atol=1e-4)
+        else:
+            assert torch.allclose(outputs_base, outputs_before)
 
         if issubclass(config_cls, VBLoRAConfig):
             # initialize `vblora_vector_bank` so it can be trained
@@ -2121,7 +2118,11 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
         else:
             rtol, atol = 1e-5, 1e-8
         assert not torch.allclose(outputs_before, outputs_after, rtol=rtol, atol=atol)
-        assert torch.allclose(outputs_before, outputs_disabled)
+        # OSF uses SVD reconstruction which introduces small numerical differences
+        if issubclass(config_cls, OSFConfig):
+            assert torch.allclose(outputs_before, outputs_disabled, rtol=1e-4, atol=1e-4)
+        else:
+            assert torch.allclose(outputs_before, outputs_disabled)
         assert torch.allclose(outputs_after, outputs_enabled_after_disable)
 
     @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py
@@ -293,12 +293,13 @@ def _skip_if_not_conv1d_supported(model_id, config_cls):
         BoneConfig,
         HRAConfig,
         OFTConfig,
+        OSFConfig,
         RoadConfig,
         ShiraConfig,
         C3AConfig,
         MissConfig,
     ]:
-        pytest.skip("Skipping BOFT/HRA/OFT/Bone/Road/SHiRA/C3A/MiSS for GPT2LMHeadModel")
+        pytest.skip("Skipping BOFT/HRA/OFT/Bone/Road/SHiRA/C3A/MiSS/OSF for GPT2LMHeadModel")
 
 
 def _skip_adalora_oft_hra_bone_for_gpt2(model_id, config_cls):