Commit 1b834ec

Add enable_lora_hotswap method

1 parent e40390d · commit 1b834ec

5 files changed: +134 additions, -25 deletions

src/diffusers/loaders/lora_base.py

Lines changed: 14 additions & 0 deletions
@@ -898,3 +898,17 @@ def lora_scale(self) -> float:
         # property function that returns the lora scale which can be set at run time by the pipeline.
         # if _lora_scale has not been set, return 1
         return self._lora_scale if hasattr(self, "_lora_scale") else 1.0
+
+    def enable_lora_hotswap(self, **kwargs) -> None:
+        """Enables the possibility to hotswap LoRA adapters.
+
+        Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of
+        the loaded adapters differ.
+
+        Args:
+            target_rank (`int`):
+                The highest rank among all the adapters that will be loaded.
+        """
+        for component in self.components.values():
+            if hasattr(component, "enable_lora_hotswap"):
+                component.enable_lora_hotswap(**kwargs)
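For orientation, here is a minimal sketch of how this new pipeline-level entry point fits into the hotswap flow exercised by the pipeline test further down in this commit. The LoRA file paths, target rank, prompt, and device are placeholders; adjust them to your setup.

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to("cuda")

# Must be called before the first adapter is loaded; the pipeline simply forwards
# the kwargs to every component that implements enable_lora_hotswap (e.g. the UNet).
pipe.enable_lora_hotswap(target_rank=64)  # highest rank among all adapters that will be loaded

pipe.load_lora_weights("path/to/adapter0.safetensors")  # placeholder path; the first adapter gets the name "default_0"
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")
image0 = pipe("a prompt").images[0]

# Swap the second adapter into the already compiled UNet without triggering recompilation.
pipe.load_lora_weights("path/to/adapter1.safetensors", hotswap=True, adapter_name="default_0")
image1 = pipe("a prompt").images[0]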

src/diffusers/loaders/peft.py

Lines changed: 35 additions & 2 deletions
@@ -121,6 +121,8 @@ class PeftAdapterMixin:
     """
 
     _hf_peft_config_loaded = False
+    # kwargs for prepare_model_for_compiled_hotswap, if required
+    _prepare_lora_hotswap_kwargs: Optional[dict] = None
 
     @classmethod
     # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading
@@ -325,9 +327,13 @@ def load_lora_adapter(
             if is_peft_version(">=", "0.13.1"):
                 peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
 
-            if hotswap:
+            if hotswap or (self._prepare_lora_hotswap_kwargs is not None):
                 if is_peft_version(">", "0.14.0"):
-                    from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict
+                    from peft.utils.hotswap import (
+                        check_hotswap_configs_compatible,
+                        hotswap_adapter_from_state_dict,
+                        prepare_model_for_compiled_hotswap,
+                    )
                 else:
                     msg = (
                         "Hotswapping requires PEFT > v0.14. Please upgrade PEFT to a higher version or install it "
@@ -366,6 +372,19 @@ def map_state_dict_for_hotswap(sd):
                 else:
                     inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
                     incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
+
+                    if self._prepare_lora_hotswap_kwargs is not None:
+                        # For hotswapping of compiled models or adapters with different ranks.
+                        # If the user called enable_lora_hotswap, we need to ensure it is called:
+                        # - after the first adapter was loaded
+                        # - before the model is compiled and the 2nd adapter is being hotswapped in
+                        # Therefore, it needs to be called here
+                        prepare_model_for_compiled_hotswap(
+                            self, config=lora_config, **self._prepare_lora_hotswap_kwargs
+                        )
+                        # We only want to call prepare_model_for_compiled_hotswap once
+                        self._prepare_lora_hotswap_kwargs = None
+
             except Exception as e:
                 # In case `inject_adapter_in_model()` was unsuccessful even before injecting the `peft_config`.
                 if hasattr(self, "peft_config"):
@@ -816,3 +835,17 @@ def delete_adapters(self, adapter_names: Union[List[str], str]):
             # Pop also the corresponding adapter from the config
             if hasattr(self, "peft_config"):
                 self.peft_config.pop(adapter_name, None)
+
+    def enable_lora_hotswap(self, target_rank: int) -> None:
+        """Enables the possibility to hotswap LoRA adapters.
+
+        Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of
+        the loaded adapters differ.
+
+        Args:
+            target_rank (`int`):
+                The highest rank among all the adapters that will be loaded.
+        """
+        if getattr(self, "peft_config", {}):
+            raise RuntimeError("Call `enable_lora_hotswap` before loading the first adapter.")
+        self._prepare_lora_hotswap_kwargs = {"target_rank": target_rank}
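The same flow at the model level, as exercised by the updated check_model_hotswap test further down: enable_lora_hotswap must be called between constructing the model and the first load_lora_adapter call, and compilation happens before the second adapter is hotswapped in. A minimal sketch; the model checkpoint, paths, and rank are illustrative, and the adapter files are assumed to be in the format written by save_lora_adapter.

import torch
from diffusers import UNet2DConditionModel

# Illustrative model; any diffusers model using PeftAdapterMixin behaves the same way.
unet = UNet2DConditionModel.from_pretrained("hf-internal-testing/tiny-sd-pipe", subfolder="unet")

# target_rank is the largest rank among all adapters that will be loaded; the first
# load_lora_adapter call will then run prepare_model_for_compiled_hotswap with it.
unet.enable_lora_hotswap(target_rank=16)

unet.load_lora_adapter("adapter0/pytorch_lora_weights.safetensors", adapter_name="adapter0")  # placeholder path
unet = torch.compile(unet, mode="reduce-overhead")

# Replace the weights of "adapter0" in place inside the compiled model.
unet.load_lora_adapter("adapter1/pytorch_lora_weights.safetensors", adapter_name="adapter0", hotswap=True)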

src/diffusers/loaders/unet.py

Lines changed: 36 additions & 3 deletions
@@ -15,7 +15,7 @@
 from collections import defaultdict
 from contextlib import nullcontext
 from pathlib import Path
-from typing import Callable, Dict, Union
+from typing import Callable, Dict, Optional, Union
 
 import safetensors
 import torch
@@ -62,6 +62,8 @@ class UNet2DConditionLoadersMixin:
 
     text_encoder_name = TEXT_ENCODER_NAME
     unet_name = UNET_NAME
+    # kwargs for prepare_model_for_compiled_hotswap, if required
+    _prepare_lora_hotswap_kwargs: Optional[dict] = None
 
     @validate_hf_hub_args
     def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
@@ -377,9 +379,13 @@ def _process_lora(
             if is_peft_version(">=", "0.13.1"):
                 peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
 
-            if hotswap:
+            if hotswap or (self._prepare_lora_hotswap_kwargs is not None):
                 if is_peft_version(">", "0.14.0"):
-                    from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict
+                    from peft.utils.hotswap import (
+                        check_hotswap_configs_compatible,
+                        hotswap_adapter_from_state_dict,
+                        prepare_model_for_compiled_hotswap,
+                    )
                 else:
                     msg = (
                         "Hotswapping requires PEFT > v0.14. Please upgrade PEFT to a higher version or install it "
@@ -417,6 +423,19 @@ def map_state_dict_for_hotswap(sd):
                 else:
                     inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
                     incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
+
+                    if self._prepare_lora_hotswap_kwargs is not None:
+                        # For hotswapping of compiled models or adapters with different ranks.
+                        # If the user called enable_lora_hotswap, we need to ensure it is called:
+                        # - after the first adapter was loaded
+                        # - before the model is compiled and the 2nd adapter is being hotswapped in
+                        # Therefore, it needs to be called here
+                        prepare_model_for_compiled_hotswap(
+                            self, config=lora_config, **self._prepare_lora_hotswap_kwargs
+                        )
+                        # We only want to call prepare_model_for_compiled_hotswap once
+                        self._prepare_lora_hotswap_kwargs = None
+
             except Exception as e:
                 # TODO: add test in line with:
                 # https://github.com/huggingface/diffusers/pull/10188/files#diff-b544edcc938e163009735ef4fa963abd0a41615c175552160c9e0f94ceb7f552
@@ -1002,3 +1021,17 @@ def _load_ip_adapter_loras(self, state_dicts):
                 }
             )
         return lora_dicts
+
+    def enable_lora_hotswap(self, target_rank: int) -> None:
+        """Enables the possibility to hotswap LoRA adapters.
+
+        Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of
+        the loaded adapters differ.
+
+        Args:
+            target_rank (`int`):
+                The highest rank among all the adapters that will be loaded.
+        """
+        if getattr(self, "peft_config", {}):
+            raise RuntimeError("Call `enable_lora_hotswap` before loading the first adapter.")
+        self._prepare_lora_hotswap_kwargs = {"target_rank": target_rank}
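Both mixins add the same guard at the end: once a peft_config exists, enable_lora_hotswap refuses to run, because prepare_model_for_compiled_hotswap must be applied on the very first adapter load. A small sketch of that failure mode, mirroring the new tests below; the model checkpoint and LoRA config are illustrative.

from diffusers import UNet2DConditionModel
from peft import LoraConfig

unet = UNet2DConditionModel.from_pretrained("hf-internal-testing/tiny-sd-pipe", subfolder="unet")
lora_config = LoraConfig(r=8, lora_alpha=8, target_modules=["to_q"])

unet.add_adapter(lora_config)  # an adapter is already present, so peft_config is set

try:
    unet.enable_lora_hotswap(target_rank=32)  # too late
except RuntimeError as err:
    print(err)  # Call `enable_lora_hotswap` before loading the first adapter.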

tests/models/test_modeling_common.py

Lines changed: 17 additions & 10 deletions
@@ -1638,10 +1638,8 @@ def check_model_hotswap(self, do_compile, rank0, rank1, target_modules):
         Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would
         fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is
         fine.
-
         """
-        from peft.utils.hotswap import prepare_model_for_compiled_hotswap
-
+        # create 2 adapters with different ranks and alphas
         dummy_input = self.get_dummy_input()
         alpha0, alpha1 = rank0, rank1
         max_rank = max([rank0, rank1])
@@ -1665,29 +1663,29 @@ def check_model_hotswap(self, do_compile, rank0, rank1, target_modules):
         assert not (output1_before == 0).all()
 
         with tempfile.TemporaryDirectory() as tmp_dirname:
+            # save the adapter checkpoints
             unet.save_lora_adapter(os.path.join(tmp_dirname, "0"), safe_serialization=True, adapter_name="adapter0")
             unet.save_lora_adapter(os.path.join(tmp_dirname, "1"), safe_serialization=True, adapter_name="adapter1")
             del unet
 
+            # load the first adapter
             unet = self.get_small_unet()
+            if do_compile or (rank0 != rank1):
+                # no need to prepare if the model is not compiled or if the ranks are identical
+                unet.enable_lora_hotswap(target_rank=max_rank)
+
             file_name0 = os.path.join(os.path.join(tmp_dirname, "0"), "pytorch_lora_weights.safetensors")
             file_name1 = os.path.join(os.path.join(tmp_dirname, "1"), "pytorch_lora_weights.safetensors")
             unet.load_lora_adapter(file_name0, safe_serialization=True, adapter_name="adapter0")
 
-            if do_compile or (rank0 != rank1):
-                # no need to prepare if the model is not compiled or if the ranks are identical
-                prepare_model_for_compiled_hotswap(
-                    unet,
-                    config={"adapter0": lora_config0, "adapter1": lora_config1},
-                    target_rank=max_rank,
-                )
             if do_compile:
                 unet = torch.compile(unet, mode="reduce-overhead")
 
             with torch.inference_mode():
                 output0_after = unet(**dummy_input)["sample"]
             assert torch.allclose(output0_before, output0_after, atol=tol, rtol=tol)
 
+            # hotswap the 2nd adapter
            unet.load_lora_adapter(file_name1, adapter_name="adapter0", hotswap=True)
 
             # we need to call forward to potentially trigger recompilation
@@ -1727,3 +1725,12 @@ def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1):
         target_modules = ["to_q", "conv"]
         with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules=target_modules)
+
+    def test_enable_lora_hotswap_called_too_late_raises(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
+        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
+        with self.assertRaisesRegex(RuntimeError, msg):
+            unet.enable_lora_hotswap(target_rank=32)
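A side note on the error_on_recompile guard these tests rely on: inside that patch context, any torch.compile recompilation raises instead of happening silently, which is how the tests prove that hotswapping after enable_lora_hotswap leaves the compiled graph intact. A minimal, self-contained illustration; the function and tensor shapes are invented for this sketch.

import torch

@torch.compile
def matmul(x, w):
    return x @ w

x, w = torch.randn(4, 8), torch.randn(8, 8)
matmul(x, w)  # first call compiles the graph

with torch._dynamo.config.patch(error_on_recompile=True):
    matmul(torch.randn(4, 8), torch.randn(8, 8))  # same shapes/dtypes: cached graph is reused, no error
    # Calling with different shapes here would force a recompile and raise instead.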

tests/pipelines/test_pipelines.py

Lines changed: 32 additions & 10 deletions
@@ -17,6 +17,7 @@
 import json
 import os
 import random
+import re
 import shutil
 import sys
 import tempfile
@@ -2239,12 +2240,23 @@ def get_dummy_input(self):
         return pipeline_inputs
 
     def check_pipeline_hotswap(self, do_compile, rank0, rank1, target_modules):
-        # Similar to check_hotswap but more realistic: check a whole pipeline to be closer to how users would use it
-        from peft.utils.hotswap import prepare_model_for_compiled_hotswap
-
+        """
+        Check that hotswapping works on a pipeline.
+
+        Steps:
+        - create 2 LoRA adapters and save them
+        - load the first adapter
+        - hotswap the second adapter
+        - check that the outputs are correct
+        - optionally compile the model
+
+        Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would
+        fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is
+        fine.
+        """
+        # create 2 adapters with different ranks and alphas
         dummy_input = self.get_dummy_input()
         pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
-
         alpha0, alpha1 = rank0, rank1
         max_rank = max([rank0, rank1])
         lora_config0 = self.get_unet_lora_config(rank0, alpha0, target_modules)
@@ -2266,6 +2278,7 @@ def check_pipeline_hotswap(self, do_compile, rank0, rank1, target_modules):
         assert not (output1_before == 0).all()
 
         with tempfile.TemporaryDirectory() as tmp_dirname:
+            # save the adapter checkpoints
             lora0_state_dicts = self.get_lora_state_dicts({"unet": pipeline.unet}, adapter_name="adapter0")
             StableDiffusionPipeline.save_lora_weights(
                 save_directory=os.path.join(tmp_dirname, "adapter0"), safe_serialization=True, **lora0_state_dicts
@@ -2276,17 +2289,16 @@ def check_pipeline_hotswap(self, do_compile, rank0, rank1, target_modules):
             )
             del pipeline
 
+            # load the first adapter
             pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+            if do_compile or (rank0 != rank1):
+                # no need to prepare if the model is not compiled or if the ranks are identical
+                pipeline.enable_lora_hotswap(target_rank=max_rank)
+
             file_name0 = os.path.join(tmp_dirname, "adapter0", "pytorch_lora_weights.safetensors")
             file_name1 = os.path.join(tmp_dirname, "adapter1", "pytorch_lora_weights.safetensors")
 
             pipeline.load_lora_weights(file_name0)
-            if do_compile or (rank0 != rank1):
-                prepare_model_for_compiled_hotswap(
-                    pipeline.unet,
-                    config={"adapter0": lora_config0, "adapter1": lora_config1},
-                    target_rank=max_rank,
-                )
             if do_compile:
                 pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead")
 
@@ -2295,6 +2307,7 @@ def check_pipeline_hotswap(self, do_compile, rank0, rank1, target_modules):
             # sanity check: still same result
             assert np.allclose(output0_before, output0_after, atol=tol, rtol=tol)
 
+            # hotswap the 2nd adapter
             pipeline.load_lora_weights(file_name1, hotswap=True, adapter_name="default_0")
             output1_after = pipeline(**dummy_input, generator=torch.manual_seed(0))[0]
 
@@ -2327,3 +2340,12 @@ def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1):
         target_modules = ["to_q", "conv"]
         with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules=target_modules)
+
+    def test_enable_lora_hotswap_called_too_late_raises(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        pipeline.unet.add_adapter(lora_config)
+        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
+        with self.assertRaisesRegex(RuntimeError, msg):
+            pipeline.enable_lora_hotswap(target_rank=32)
