[LoRA] Implement hot-swapping of LoRA #9453
src/diffusers/loaders/peft.py
@@ -138,7 +138,9 @@ def _optionally_disable_offloading(cls, _pipeline):
         """
         return _func_optionally_disable_offloading(_pipeline=_pipeline)

-    def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="transformer", **kwargs):
+    def load_lora_adapter(
+        self, pretrained_model_name_or_path_or_dict, prefix="transformer", hotswap: bool = False, **kwargs
+    ):
         r"""
         Loads a LoRA adapter into the underlying model.

@@ -182,6 +184,28 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans
             low_cpu_mem_usage (`bool`, *optional*):
                 Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
                 weights.
+            hotswap : (`bool`, *optional*)
+                Defaults to `False`. Whether to substitute an existing adapter with the newly loaded adapter in-place.
+                This means that, instead of loading an additional adapter, this will take the existing adapter weights
+                and replace them with the weights of the new adapter. This can be faster and more memory efficient.
+                However, the main advantage of hotswapping is that when the model is compiled with torch.compile,
+                loading the new adapter does not require recompilation of the model.
+
+                If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need
+                to call an additional method before loading the adapter:
+
+                ```py
+                from peft.utils.hotswap import prepare_model_for_compiled_hotswap
+
+                model = ...  # load diffusers model with first LoRA adapter
+                max_rank = ...  # the highest rank among all LoRAs that you want to load
+                prepare_model_for_compiled_hotswap(model, target_rank=max_rank)  # call *before* compiling
+                model = torch.compile(model)
+                model.load_lora_adapter(..., hotswap=True)  # now hotswap the 2nd adapter
+                ```
+
+                There are some limitations to this technique, which are documented here:
+                https://huggingface.co/docs/peft/main/en/package_reference/hotswap
         """
         from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
         from peft.tuners.tuners_utils import BaseTunerLayer

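To make the docstring example above more concrete, here is a hedged end-to-end sketch (not part of the PR itself): the checkpoint and LoRA repository ids are placeholders, `FluxTransformer2DModel` stands in for any diffusers model that exposes `load_lora_adapter`, and `target_rank=64` is an arbitrary upper bound on the adapter ranks.

```py
import torch
from diffusers import FluxTransformer2DModel
from peft.utils.hotswap import prepare_model_for_compiled_hotswap

# Load the base model and the first LoRA adapter (placeholder repo ids).
model = FluxTransformer2DModel.from_pretrained(
    "some/flux-checkpoint", subfolder="transformer", torch_dtype=torch.bfloat16
)
model.load_lora_adapter("user/lora-one", adapter_name="default_0")

# Pad all LoRA layers to the largest rank we plan to load, *before* compiling,
# so that swapping in a different adapter later does not change tensor shapes.
prepare_model_for_compiled_hotswap(model, target_rank=64)
model = torch.compile(model)

# Replace the weights of "default_0" in place; no recompilation is triggered.
model.load_lora_adapter("user/lora-two", adapter_name="default_0", hotswap=True)
```
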
@@ -235,10 +259,15 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans
             state_dict = {k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in model_keys}

         if len(state_dict) > 0:
-            if adapter_name in getattr(self, "peft_config", {}):
+            if adapter_name in getattr(self, "peft_config", {}) and not hotswap:
                 raise ValueError(
                     f"Adapter name {adapter_name} already in use in the model - please select a new adapter name."
                 )
+            elif adapter_name not in getattr(self, "peft_config", {}) and hotswap:
+                raise ValueError(
+                    f"Trying to hotswap LoRA adapter '{adapter_name}' but there is no existing adapter by that name. "
+                    "Please choose an existing adapter name or set `hotswap=False` to prevent hotswapping."
+                )

             # check with first key if is not in peft format
             first_key = next(iter(state_dict.keys()))

@@ -296,11 +325,47 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans
             if is_peft_version(">=", "0.13.1"):
                 peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage

+            if hotswap:
+                try:
+                    from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict
+                except ImportError as exc:
+                    msg = (
+                        "Hotswapping requires PEFT > v0.14. Please upgrade PEFT to a higher version or install it "
+                        "from source."
+                    )
+                    raise ImportError(msg) from exc

Review thread on the `try`/`except ImportError` block above, referencing the existing helper `def is_peft_version(operation: str, version: str):`

Comment: I did this on purpose, as it allows to test the feature by installing PEFT from main. Otherwise, we'd have to wait for the next PEFT release. Normally, I'd also avoid `try: import ...` for the side effect, but at this point, PEFT is already imported, so that's not a factor. If you still want me to change this, LMK.

Reply: Can't we do `is_peft_version(">", "0.14.0")`?

Reply: Ah yes, that should work, I'll fix.
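For illustration, a minimal sketch of what the suggested version gate could look like, assuming diffusers' existing `is_peft_version` helper; the `_import_peft_hotswap` wrapper is a hypothetical name used only to keep the snippet self-contained, not code from the PR.

```py
from diffusers.utils import is_peft_version


def _import_peft_hotswap():
    # Gate on the PEFT version instead of catching ImportError; versions newer than
    # 0.14.0 (including installs from source) are expected to ship peft.utils.hotswap.
    if not is_peft_version(">", "0.14.0"):
        raise ImportError(
            "Hotswapping requires PEFT > v0.14. Please upgrade PEFT to a higher version or install it from source."
        )
    from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict

    return check_hotswap_configs_compatible, hotswap_adapter_from_state_dict
```
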
src/diffusers/loaders/unet.py
@@ -281,7 +281,14 @@ def _process_custom_diffusion(self, state_dict):
         return attn_processors

     def _process_lora(
-        self, state_dict, unet_identifier_key, network_alphas, adapter_name, _pipeline, low_cpu_mem_usage
+        self,
+        state_dict,
+        unet_identifier_key,
+        network_alphas,
+        adapter_name,
+        _pipeline,
+        low_cpu_mem_usage,
+        hotswap: bool = False,
     ):
         # This method does the following things:
         # 1. Filters the `state_dict` with keys matching `unet_identifier_key` when using the non-legacy

@@ -294,6 +301,7 @@ def _process_lora(
             raise ValueError("PEFT backend is required for this method.")

         from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
         from peft.tuners.tuners_utils import BaseTunerLayer

         keys = list(state_dict.keys())

@@ -313,10 +321,15 @@ def _process_lora(
             state_dict_to_be_used = unet_state_dict if len(unet_state_dict) > 0 else state_dict

         if len(state_dict_to_be_used) > 0:
-            if adapter_name in getattr(self, "peft_config", {}):
+            if adapter_name in getattr(self, "peft_config", {}) and not hotswap:
                 raise ValueError(
                     f"Adapter name {adapter_name} already in use in the Unet - please select a new adapter name."
                 )
+            elif adapter_name not in getattr(self, "peft_config", {}) and hotswap:
+                raise ValueError(
+                    f"Trying to hotswap LoRA adapter '{adapter_name}' but there is no existing adapter by that name. "
+                    "Please choose an existing adapter name or set `hotswap=False` to prevent hotswapping."
+                )

             state_dict = convert_unet_state_dict_to_peft(state_dict_to_be_used)

@@ -364,8 +377,59 @@ def _process_lora(
             if is_peft_version(">=", "0.13.1"):
                 peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage

-            inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
-            incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
+            if hotswap:
+                try:
+                    from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict
+                except ImportError as exc:
+                    msg = (
+                        "Hotswapping requires PEFT > v0.14. Please upgrade PEFT to a higher version or install it "
+                        "from source."
+                    )
+                    raise ImportError(msg) from exc

Review comment on the same `try`/`except ImportError` block in this file, again referencing `def is_peft_version(operation: str, version: str)`: Same comment as above.
Review comment (on the error handling): Can we raise the exception properly instead of logging an error?

Review comment: Also, are we testing if this error is raised?

Reply: Note that I just copied the pattern from here:
diffusers/src/diffusers/loaders/peft.py, lines 301 to 316 in 97abdd2:
    try:
        inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
        incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
    except Exception as e:
        # In case `inject_adapter_in_model()` was unsuccessful even before injecting the `peft_config`.
        if hasattr(self, "peft_config"):
            for module in self.modules():
                if isinstance(module, BaseTunerLayer):
                    active_adapters = module.active_adapters
                    for active_adapter in active_adapters:
                        if adapter_name in active_adapter:
                            module.delete_adapter(adapter_name)
            self.peft_config.pop(adapter_name)
        logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}")
        raise
So this is just for consistency.
Reply: Good point. Let me provide some reasoning as to why I added the error (referenced) that way: PEFT already raises a nice error when the code reaches that part in PEFT, so it didn't make sense to craft another error message on top of it; instead, we decided to just propagate it to the users coming via diffusers. I think that is okay to do here.

Regardless, @BenjaminBossan, are we testing for the error that should be raised here in case hotswap fails? Or no need?
Reply: I think we don't need a test that is specific to hotswap failing, as it doesn't really matter why loading the adapter fails. If a test is added, it should probably be something similar to the test that was added when the change was introduced in peft.py: https://github.com/huggingface/diffusers/pull/10188/files#diff-b544edcc938e163009735ef4fa963abd0a41615c175552160c9e0f94ceb7f552. Not sure if it's possible, but maybe that test can be adjusted to trigger this code path?
Reply: Yeah, I can look into it in a future PR. Possible to add a note?
Reply: Added a TODO comment.
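For reference, a rough pytest-style sketch of a test along these lines, covering the new validation errors rather than the failure/cleanup path itself; `model` and `lora_state_dict` are hypothetical fixtures (a LoRA-capable diffusers model and a matching LoRA state dict), not existing fixtures in the diffusers test suite.

```py
import pytest


def test_hotswap_requires_existing_adapter(model, lora_state_dict):
    # hotswap=True with an adapter name that was never loaded has nothing to swap out,
    # so load_lora_adapter should raise the new ValueError.
    with pytest.raises(ValueError, match="no existing adapter by that name"):
        model.load_lora_adapter(lora_state_dict, adapter_name="default_0", hotswap=True)


def test_duplicate_adapter_name_without_hotswap(model, lora_state_dict):
    # Re-using an adapter name without hotswap=True keeps the old behaviour of refusing
    # to overwrite the existing adapter.
    model.load_lora_adapter(lora_state_dict, adapter_name="default_0")
    with pytest.raises(ValueError, match="already in use"):
        model.load_lora_adapter(lora_state_dict, adapter_name="default_0")
```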