 import os
 from functools import partial
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Literal, Optional, Union

 import safetensors
 import torch
@@ -144,8 +144,7 @@ def _optionally_disable_offloading(cls, _pipeline):
     def load_lora_adapter(
         self, pretrained_model_name_or_path_or_dict, prefix="transformer", hotswap: bool = False, **kwargs
     ):
-        r"""
-        Loads a LoRA adapter into the underlying model.
+        r"""Loads a LoRA adapter into the underlying model.

         Parameters:
             pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
@@ -194,21 +193,21 @@ def load_lora_adapter(
                 However, the main advantage of hotswapping is that when the model is compiled with torch.compile,
                 loading the new adapter does not require recompilation of the model.

-                If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need
-                to call an additional method before loading the adapter:
+                If the model is compiled, or if the new adapter and the old adapter have different ranks and/or LoRA
+                alphas (i.e. scaling), you need to call an additional method before loading the adapter:

                 ```py
-                from peft.utils.hotswap import prepare_model_for_compiled_hotswap
-
-                model = ...  # load diffusers model with first LoRA adapter
+                pipeline = ...  # load diffusers pipeline
                 max_rank = ...  # the highest rank among all LoRAs that you want to load
-                prepare_model_for_compiled_hotswap(model, target_rank=max_rank)  # call *before* compiling
-                model = torch.compile(model)
-                model.load_lora_adapter(..., hotswap=True)  # now hotswap the 2nd adapter
+                # call *before* compiling and loading the LoRA adapter
+                pipeline.enable_lora_hotswap(target_rank=max_rank)
+                pipeline.load_lora_weights(file_name)
+                # optionally compile the model now
                 ```

                 There are some limitations to this technique, which are documented here:
                 https://huggingface.co/docs/peft/main/en/package_reference/hotswap
+
         """
         from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
         from peft.tuners.tuners_utils import BaseTunerLayer
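For reference, here is a minimal end-to-end sketch of the hotswapping flow the updated docstring describes. The repository ids, the `transformer` attribute, and the adapter name are placeholders/assumptions, not values taken from this diff:

```py
# A sketch of the hotswap workflow, assuming a transformer-based pipeline;
# "base/model", "user/lora-1", and "user/lora-2" are placeholder repo ids.
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained("base/model", torch_dtype=torch.float16).to("cuda")

# Must be called before the first adapter is loaded; target_rank is the highest
# rank among all LoRAs that will ever be hotswapped in.
pipeline.enable_lora_hotswap(target_rank=64)

# Load the first adapter, then (optionally) compile.
pipeline.load_lora_weights("user/lora-1", adapter_name="default_0")
pipeline.transformer = torch.compile(pipeline.transformer)

# Swap the second adapter into the same slot, without recompilation, via the
# model-level loader documented above.
pipeline.transformer.load_lora_adapter(
    "user/lora-2", prefix="transformer", adapter_name="default_0", hotswap=True
)
```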
@@ -837,16 +836,35 @@ def delete_adapters(self, adapter_names: Union[List[str], str]):
             if hasattr(self, "peft_config"):
                 self.peft_config.pop(adapter_name, None)

-    def enable_lora_hotswap(self, target_rank: int) -> None:
+    def enable_lora_hotswap(
+        self, target_rank: int = 128, check_compiled: Literal["error", "warn", "ignore"] = "error"
+    ) -> None:
         """Enables the possibility to hotswap LoRA adapters.

         Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of
         the loaded adapters differ.

         Args:
-            target_rank (`int`):
+            target_rank (`int`, *optional*, defaults to `128`):
                 The highest rank among all the adapters that will be loaded.
+
+            check_compiled (`str`, *optional*, defaults to `"error"`):
+                How to handle the case when the model is already compiled, which should generally be avoided. The
+                options are:
+                  - "error" (default): raise an error
+                  - "warn": issue a warning
+                  - "ignore": do nothing
         """
         if getattr(self, "peft_config", {}):
-            raise RuntimeError("Call `enable_lora_hotswap` before loading the first adapter.")
-        self._prepare_lora_hotswap_kwargs = {"target_rank": target_rank}
+            if check_compiled == "error":
+                raise RuntimeError("Call `enable_lora_hotswap` before loading the first adapter.")
+            elif check_compiled == "warn":
+                logger.warning(
+                    "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
+                )
+            elif check_compiled != "ignore":
+                raise ValueError(
+                    f"check_compiled should be one of 'error', 'warn', or 'ignore', got '{check_compiled}' instead."
+                )
+
+        self._prepare_lora_hotswap_kwargs = {"target_rank": target_rank, "check_compiled": check_compiled}
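A brief usage sketch of the new `check_compiled` argument, based on the branch added above; `model` stands in for any diffusers model that exposes this mixin method (e.g. a pipeline's transformer or UNet):

```py
model = ...  # any diffusers model exposing enable_lora_hotswap

# Normal use: call before any adapter is loaded; the settings are simply
# recorded in `_prepare_lora_hotswap_kwargs` for later use when loading LoRAs.
model.enable_lora_hotswap(target_rank=128)

# If `peft_config` is already populated (an adapter was loaded first), the
# behaviour depends on check_compiled:
#   "error"  -> RuntimeError (the default)
#   "warn"   -> a warning is logged and the settings are still recorded
#   "ignore" -> the settings are recorded silently
model.enable_lora_hotswap(target_rank=128, check_compiled="warn")
```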