
Commit 48e0c5d

githubnemo, saeid93, and BenjaminBossan authored
Fix huggingface#2422: Modules to save with multiple adapters (huggingface#2430)
Using multiple adapters with different `modules_to_save` values leads to a scenario where it is implicitly assumed that each `ModulesToSaveWrapper` has a module for every loaded adapter. Since the adapters have different `modules_to_save` values, this is not the case, and retrieving the state dict fails with a key lookup error. In addition, after disabling a `ModulesToSaveWrapper`, setting the adapter as active does not re-enable it.

Co-authored-by: Saeid Ghafouri <[email protected]>
Co-authored-by: Benjamin Bossan <[email protected]>
1 parent b2b34fd commit 48e0c5d
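For context, a minimal sketch of the scenario described in the commit message (not part of the commit itself; it reuses the tiny test model and layer names from the test added in tests/test_other.py below, and the adapter save paths are illustrative):

import copy
import tempfile

from transformers import AutoModelForCausalLM

from peft import LoraConfig, PeftModel, get_peft_model

base = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-random-LlamaForCausalLM")
tmp = tempfile.mkdtemp()

# Two adapters whose modules_to_save entries point at different layers.
cfg_a = LoraConfig(target_modules="all-linear", modules_to_save=["0.post_attention_layernorm"])
cfg_b = LoraConfig(target_modules="all-linear", modules_to_save=["1.post_attention_layernorm"])

get_peft_model(copy.deepcopy(base), cfg_a).save_pretrained(f"{tmp}/adapter_a")
get_peft_model(copy.deepcopy(base), cfg_b).save_pretrained(f"{tmp}/adapter_b")

model = PeftModel.from_pretrained(base, f"{tmp}/adapter_a", adapter_name="adapter_a")
# Before this commit, combining adapters like this could fail with a key lookup error, because each
# ModulesToSaveWrapper was implicitly assumed to hold a saved module for every loaded adapter.
model.load_adapter(f"{tmp}/adapter_b", adapter_name="adapter_b")
# The commit also makes set_adapter re-enable a wrapper that was previously disabled.
model.set_adapter("adapter_b")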

File tree

4 files changed: +100 −3 lines changed


src/peft/tuners/tuners_utils.py

Lines changed: 7 additions & 0 deletions
@@ -473,6 +473,13 @@ def inject_adapter(
             if not key:
                 continue
             # Check for modules_to_save in case
+            #
+            # Note that this is redundant with PeftModel.set_additional_trainable_models but might be necessary
+            # when calling inject_adapter without a PEFT model. This is outdated as it only focuses on
+            # ModulesToSaveWrapper and ignores other potentially configured AuxiliaryTrainingWrapper instances.
+            #
+            # TODO: determine if there's a good reason for this and refactor to support AuxiliaryTrainingWrapper,
+            # or remove if superfluous.
             if _check_for_modules_to_save and any(
                 key.endswith(module_to_save) for module_to_save in peft_config.modules_to_save
             ):

src/peft/utils/other.py

Lines changed: 10 additions & 0 deletions
@@ -499,6 +499,10 @@ def update(self, adapter_name, **kwargs):
             add_hook_to_module(self.modules_to_save[adapter_name], new_hook)

         self.original_module.requires_grad_(False)
+
+        # note that there currently cannot be more than one active adapter for the same layer with modules to save
+        # since there would be no clear way to decide which adapter's weights are the correct ones. therefore we
+        # assume that there is only one active adapter. this precondition is enforced by _set_adapter.
         if adapter_name == self.active_adapter:
             self.modules_to_save[adapter_name].requires_grad_(True)

@@ -550,6 +554,10 @@ def adapter_state_dict_load_map(self, adapter_name):
         return {k: f"modules_to_save.{adapter_name}.{k}" for k in self.adapter_state_dict(adapter_name)}

     def adapter_state_dict(self, adapter_name):
+        if adapter_name not in self._adapters:
+            # In case of multiple adapters, each bringing their own modules to save, each
+            # ModulesToSaveWrapper will be queried but not every wrapper is obliged to serve the same adapters.
+            return {}
         return self.modules_to_save[adapter_name].state_dict()

     def unload_and_optionally_merge_module(
@@ -732,6 +740,7 @@ def _set_trainable(
     found_modules = set()
     # disable removal of duplicates to support targeting tied weights
     key_list = [key for key, _ in model.named_modules(remove_duplicate=False)]
+
     for key in key_list:
         target_module_found = any(key.endswith(target_key) for target_key in module_names)
         if target_module_found:
@@ -776,6 +785,7 @@ def check_adapter_name(adapter_name):
            # if the adapter is found in this module, set it as the active adapter, else disable the adapters of this
            # module
            if adapter_name in module._adapters:
+               module.enable_adapters(True)
                module.set_adapter(adapter_name)
            else:
                module.enable_adapters(False)

tests/test_custom_models.py

Lines changed: 4 additions & 3 deletions
@@ -1236,6 +1236,7 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
         outputs_base = model(**X)
         if issubclass(config_cls, (FourierFTConfig, TrainableTokensConfig)):
             config_kwargs = config_kwargs.copy()
+            # override the default value and make PEFT operation a no-op
             config_kwargs["init_weights"] = True
         config = config_cls(
             base_model_name_or_path=model_id,
@@ -1255,9 +1256,9 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
         model.train()
         # EmbConv1D is slow to learn for some reason
         lr = 0.01 if model_id != "EmbConv1D" else 1.0
-        if isinstance(config_cls, LNTuningConfig):
-            # LayerNorm tuning is slow to learn
-            lr = 1.0
+        if isinstance(config, TrainableTokensConfig):
+            # TrainableTokens is only changing a small subset, so we need a higher lr to see the difference
+            lr = 2.0
         optimizer = torch.optim.SGD(model.parameters(), lr=lr)

         # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry

tests/test_other.py

Lines changed: 79 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy

 import pytest
 import torch
@@ -107,6 +108,84 @@ def test_get_peft_model_revision_warning(tmp_path):
     _ = get_peft_model(base_model, lora_config, revision=overwrite_revision)


+def test_load_multiple_adapters_different_modules_to_save(tmp_path):
+    # This tests the error described in #2422 where loading multiple adapters with different modules_to_save
+    # attributes fails (due to a regression from #2376).
+
+    model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-random-LlamaForCausalLM")
+
+    def peft_config(**kwargs):
+        return LoraConfig(target_modules="all-linear", **kwargs)
+
+    original_model = copy.deepcopy(model)
+
+    peft_config_0 = peft_config(modules_to_save=["0.post_attention_layernorm"])
+    peft_config_1 = peft_config(modules_to_save=["0.post_attention_layernorm"])
+    peft_config_2 = peft_config(modules_to_save=["1.post_attention_layernorm"])
+
+    # Save adapter 0, nothing fancy, should be equal to base model weights
+    peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_0)
+    peft_model.save_pretrained(tmp_path / "adapter_0")
+
+    # Save adapter 1, modules to save weights are modified randomly, should be unique to adapter 1
+    peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_1)
+    peft_model.model.model.layers[0].post_attention_layernorm.weight.data = torch.rand_like(
+        peft_model.model.model.layers[0].post_attention_layernorm.weight.data
+    )
+    adapter_1_saved = peft_model.model.model.layers[0].post_attention_layernorm.weight.data.clone()
+    peft_model.save_pretrained(tmp_path / "adapter_1")
+
+    # Save adapter 2, modules to save weights are modified randomly, should be unique to adapter 2
+    peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_2)
+    peft_model.model.model.layers[1].post_attention_layernorm.weight.data = torch.rand_like(
+        peft_model.model.model.layers[1].post_attention_layernorm.weight.data
+    )
+    adapter_2_saved = peft_model.model.model.layers[1].post_attention_layernorm.weight.data.clone()
+    peft_model.save_pretrained(tmp_path / "adapter_2")
+
+    del peft_model
+
+    combined_model = PeftModel.from_pretrained(original_model, tmp_path / "adapter_0", adapter_name="adapter_0")
+    combined_model.load_adapter(tmp_path / "adapter_1", adapter_name="adapter_1")
+    combined_model.load_adapter(tmp_path / "adapter_2", adapter_name="adapter_2")
+
+    # For adapter 0 we expect every modules to save layer mentioned in this test to be equal to the original model
+    # since we didn't modify it for adapter 0 and only adapter 0 is active.
+    combined_model.set_adapter("adapter_0")
+    assert torch.allclose(
+        combined_model.model.model.layers[0].post_attention_layernorm.weight,
+        original_model.model.layers[0].post_attention_layernorm.weight,
+    )
+    assert torch.allclose(
+        combined_model.model.model.layers[1].post_attention_layernorm.weight,
+        original_model.model.layers[1].post_attention_layernorm.weight,
+    )
+
+    # For adapter 1 we expect that the modified module to save 0.post_attention_layernorm is modified, the other
+    # module to save layers mentioned above should be untouched.
+    combined_model.set_adapter("adapter_1")
+    assert torch.allclose(
+        combined_model.model.model.layers[0].post_attention_layernorm.weight,
+        adapter_1_saved,
+    )
+    assert torch.allclose(
+        combined_model.model.model.layers[1].post_attention_layernorm.weight,
+        original_model.model.layers[1].post_attention_layernorm.weight,
+    )
+
+    # For adapter 2 we expect its module to save layer (1.post_attention_layernorm) to be modified but the other
+    # module to save weights should be kept original.
+    combined_model.set_adapter("adapter_2")
+    assert torch.allclose(
+        combined_model.model.model.layers[0].post_attention_layernorm.weight,
+        original_model.model.layers[0].post_attention_layernorm.weight,
+    )
+    assert torch.allclose(
+        combined_model.model.model.layers[1].post_attention_layernorm.weight,
+        adapter_2_saved,
+    )
+
+
 class TestModulesToSaveAttributeAccess:
     """Test attribute access on the ModulesToSaveWrapper class.