
Commit 1f7939d

Merge branch 'main' into main
2 parents 4845478 + f064b3b commit 1f7939d

File tree

15 files changed: +554 lines, -238 lines

src/diffusers/hooks/group_offloading.py

Lines changed: 140 additions & 153 deletions
Large diffs are not rendered by default.

src/diffusers/loaders/lora_base.py

Lines changed: 30 additions & 17 deletions

@@ -25,6 +25,7 @@
 from huggingface_hub import model_info
 from huggingface_hub.constants import HF_HUB_OFFLINE
 
+from ..hooks.group_offloading import _is_group_offload_enabled, _maybe_remove_and_reapply_group_offloading
 from ..models.modeling_utils import ModelMixin, load_state_dict
 from ..utils import (
     USE_PEFT_BACKEND,
@@ -391,7 +392,9 @@ def _load_lora_into_text_encoder(
             adapter_name = get_adapter_name(text_encoder)
 
         # <Unsafe code
-        is_model_cpu_offload, is_sequential_cpu_offload = _func_optionally_disable_offloading(_pipeline)
+        is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload = _func_optionally_disable_offloading(
+            _pipeline
+        )
         # inject LoRA layers and load the state dict
         # in transformers we automatically check whether the adapter name is already in use or not
         text_encoder.load_adapter(
@@ -410,6 +413,10 @@ def _load_lora_into_text_encoder(
             _pipeline.enable_model_cpu_offload()
         elif is_sequential_cpu_offload:
             _pipeline.enable_sequential_cpu_offload()
+        elif is_group_offload:
+            for component in _pipeline.components.values():
+                if isinstance(component, torch.nn.Module):
+                    _maybe_remove_and_reapply_group_offloading(component)
         # Unsafe code />
 
     if prefix is not None and not state_dict:
@@ -433,30 +440,36 @@ def _func_optionally_disable_offloading(_pipeline):
 
     Returns:
         tuple:
-            A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
+            A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` or `is_group_offload` is True.
     """
     is_model_cpu_offload = False
     is_sequential_cpu_offload = False
+    is_group_offload = False
 
     if _pipeline is not None and _pipeline.hf_device_map is None:
         for _, component in _pipeline.components.items():
-            if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
-                if not is_model_cpu_offload:
-                    is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
-                if not is_sequential_cpu_offload:
-                    is_sequential_cpu_offload = (
-                        isinstance(component._hf_hook, AlignDevicesHook)
-                        or hasattr(component._hf_hook, "hooks")
-                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
-                    )
+            if not isinstance(component, nn.Module):
+                continue
+            is_group_offload = is_group_offload or _is_group_offload_enabled(component)
+            if not hasattr(component, "_hf_hook"):
+                continue
+            is_model_cpu_offload = is_model_cpu_offload or isinstance(component._hf_hook, CpuOffload)
+            is_sequential_cpu_offload = is_sequential_cpu_offload or (
+                isinstance(component._hf_hook, AlignDevicesHook)
+                or hasattr(component._hf_hook, "hooks")
+                and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+            )
 
-                logger.info(
-                    "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
-                )
-                if is_sequential_cpu_offload or is_model_cpu_offload:
-                    remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+        if is_sequential_cpu_offload or is_model_cpu_offload:
+            logger.info(
+                "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
+            )
+            for _, component in _pipeline.components.items():
+                if not isinstance(component, nn.Module) or not hasattr(component, "_hf_hook"):
+                    continue
+                remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
 
-    return (is_model_cpu_offload, is_sequential_cpu_offload)
+    return (is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload)
 
 
 class LoraBaseMixin:
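
The third returned flag changes the helper's arity, which is why every call site in this commit moves to three-way unpacking. A minimal, self-contained sketch of what would break otherwise (the stub below stands in for the real helper and is not diffusers code):

# Hypothetical stand-in for the updated _func_optionally_disable_offloading.
def optionally_disable_offloading_stub():
    return (False, False, True)  # (model_cpu_offload, sequential_cpu_offload, group_offload)

try:
    # Old two-way unpacking, as used before this commit, now fails.
    is_model_cpu_offload, is_sequential_cpu_offload = optionally_disable_offloading_stub()
except ValueError as err:
    print(f"stale call site: {err}")  # too many values to unpack (expected 2)

# Updated call sites unpack all three flags and branch on the new one.
is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload = optionally_disable_offloading_stub()
print(is_group_offload)  # True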

src/diffusers/loaders/peft.py

Lines changed: 22 additions & 4 deletions

@@ -22,6 +22,7 @@
 import safetensors
 import torch
 
+from ..hooks.group_offloading import _maybe_remove_and_reapply_group_offloading
 from ..utils import (
     MIN_PEFT_VERSION,
     USE_PEFT_BACKEND,
@@ -243,20 +244,29 @@ def load_lora_adapter(
                     k.removeprefix(f"{prefix}."): v for k, v in network_alphas.items() if k in alpha_keys
                 }
 
-            # create LoraConfig
-            lora_config = _create_lora_config(state_dict, network_alphas, metadata, rank)
-
             # adapter_name
             if adapter_name is None:
                 adapter_name = get_adapter_name(self)
 
+            # create LoraConfig
+            lora_config = _create_lora_config(
+                state_dict,
+                network_alphas,
+                metadata,
+                rank,
+                model_state_dict=self.state_dict(),
+                adapter_name=adapter_name,
+            )
+
             # <Unsafe code
             # We can be sure that the following works as it just sets attention processors, lora layers and puts all in the same dtype
             # Now we remove any existing hooks to `_pipeline`.
 
             # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
             # otherwise loading LoRA weights will lead to an error.
-            is_model_cpu_offload, is_sequential_cpu_offload = self._optionally_disable_offloading(_pipeline)
+            is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload = self._optionally_disable_offloading(
+                _pipeline
+            )
             peft_kwargs = {}
             if is_peft_version(">=", "0.13.1"):
                 peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
@@ -347,6 +357,10 @@ def map_state_dict_for_hotswap(sd):
                 _pipeline.enable_model_cpu_offload()
             elif is_sequential_cpu_offload:
                 _pipeline.enable_sequential_cpu_offload()
+            elif is_group_offload:
+                for component in _pipeline.components.values():
+                    if isinstance(component, torch.nn.Module):
+                        _maybe_remove_and_reapply_group_offloading(component)
             # Unsafe code />
 
         if prefix is not None and not state_dict:
@@ -686,6 +700,10 @@ def unload_lora(self):
         recurse_remove_peft_layers(self)
         if hasattr(self, "peft_config"):
             del self.peft_config
+        if hasattr(self, "_hf_peft_config_loaded"):
+            self._hf_peft_config_loaded = None
+
+        _maybe_remove_and_reapply_group_offloading(self)
 
     def disable_lora(self):
         """

src/diffusers/loaders/unet.py

Lines changed: 15 additions & 4 deletions

@@ -22,6 +22,7 @@
 import torch.nn.functional as F
 from huggingface_hub.utils import validate_hf_hub_args
 
+from ..hooks.group_offloading import _maybe_remove_and_reapply_group_offloading
 from ..models.embeddings import (
     ImageProjection,
     IPAdapterFaceIDImageProjection,
@@ -203,6 +204,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         is_lora = all(("lora" in k or k.endswith(".alpha")) for k in state_dict.keys())
         is_model_cpu_offload = False
         is_sequential_cpu_offload = False
+        is_group_offload = False
 
         if is_lora:
             deprecation_message = "Using the `load_attn_procs()` method has been deprecated and will be removed in a future version. Please use `load_lora_adapter()`."
@@ -211,7 +213,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         if is_custom_diffusion:
             attn_processors = self._process_custom_diffusion(state_dict=state_dict)
         elif is_lora:
-            is_model_cpu_offload, is_sequential_cpu_offload = self._process_lora(
+            is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload = self._process_lora(
                 state_dict=state_dict,
                 unet_identifier_key=self.unet_name,
                 network_alphas=network_alphas,
@@ -230,7 +232,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
 
         # For LoRA, the UNet is already offloaded at this stage as it is handled inside `_process_lora`.
         if is_custom_diffusion and _pipeline is not None:
-            is_model_cpu_offload, is_sequential_cpu_offload = self._optionally_disable_offloading(_pipeline=_pipeline)
+            is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload = self._optionally_disable_offloading(
+                _pipeline=_pipeline
+            )
 
             # only custom diffusion needs to set attn processors
             self.set_attn_processor(attn_processors)
@@ -241,6 +245,10 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
             _pipeline.enable_model_cpu_offload()
         elif is_sequential_cpu_offload:
             _pipeline.enable_sequential_cpu_offload()
+        elif is_group_offload:
+            for component in _pipeline.components.values():
+                if isinstance(component, torch.nn.Module):
+                    _maybe_remove_and_reapply_group_offloading(component)
         # Unsafe code />
 
     def _process_custom_diffusion(self, state_dict):
@@ -307,6 +315,7 @@ def _process_lora(
 
         is_model_cpu_offload = False
         is_sequential_cpu_offload = False
+        is_group_offload = False
         state_dict_to_be_used = unet_state_dict if len(unet_state_dict) > 0 else state_dict
 
         if len(state_dict_to_be_used) > 0:
@@ -356,7 +365,9 @@ def _process_lora(
 
             # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
             # otherwise loading LoRA weights will lead to an error
-            is_model_cpu_offload, is_sequential_cpu_offload = self._optionally_disable_offloading(_pipeline)
+            is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload = self._optionally_disable_offloading(
+                _pipeline
+            )
             peft_kwargs = {}
             if is_peft_version(">=", "0.13.1"):
                 peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
@@ -389,7 +400,7 @@ def _process_lora(
             if warn_msg:
                 logger.warning(warn_msg)
 
-        return is_model_cpu_offload, is_sequential_cpu_offload
+        return is_model_cpu_offload, is_sequential_cpu_offload, is_group_offload
 
     @classmethod
     # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading

src/diffusers/loaders/unet_loader_utils.py

Lines changed: 5 additions & 2 deletions

@@ -14,6 +14,8 @@
 import copy
 from typing import TYPE_CHECKING, Dict, List, Union
 
+from torch import nn
+
 from ..utils import logging
 
 
@@ -52,7 +54,7 @@ def _maybe_expand_lora_scales(
             weight_for_adapter,
             blocks_with_transformer,
             transformer_per_block,
-            unet.state_dict(),
+            model=unet,
             default_scale=default_scale,
         )
         for weight_for_adapter in weight_scales
@@ -65,7 +67,7 @@ def _maybe_expand_lora_scales_for_one_adapter(
     scales: Union[float, Dict],
    blocks_with_transformer: Dict[str, int],
    transformer_per_block: Dict[str, int],
-    state_dict: None,
+    model: nn.Module,
     default_scale: float = 1.0,
 ):
     """
@@ -154,6 +156,7 @@ def _maybe_expand_lora_scales_for_one_adapter(
 
         del scales[updown]
 
+    state_dict = model.state_dict()
     for layer in scales.keys():
         if not any(_translate_into_actual_layer_name(layer) in module for module in state_dict.keys()):
             raise ValueError(

src/diffusers/schedulers/scheduling_scm.py

Lines changed: 0 additions & 1 deletion

@@ -168,7 +168,6 @@ def set_timesteps(
         else:
             # max_timesteps=arctan(80/0.5)=1.56454 is the default from sCM paper, we choose a different value here
             self.timesteps = torch.linspace(max_timesteps, 0, num_inference_steps + 1, device=device).float()
-            print(f"Set timesteps: {self.timesteps}")
 
         self._step_index = None
         self._begin_index = None

src/diffusers/utils/peft_utils.py

Lines changed: 49 additions & 8 deletions

@@ -150,7 +150,9 @@ def unscale_lora_layers(model, weight: Optional[float] = None):
                     module.set_scale(adapter_name, 1.0)
 
 
-def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True):
+def get_peft_kwargs(
+    rank_dict, network_alpha_dict, peft_state_dict, is_unet=True, model_state_dict=None, adapter_name=None
+):
     rank_pattern = {}
     alpha_pattern = {}
     r = lora_alpha = list(rank_dict.values())[0]
@@ -180,7 +182,6 @@ def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True
     else:
         lora_alpha = set(network_alpha_dict.values()).pop()
 
-    # layer names without the Diffusers specific
     target_modules = list({name.split(".lora")[0] for name in peft_state_dict.keys()})
     use_dora = any("lora_magnitude_vector" in k for k in peft_state_dict)
     # for now we know that the "bias" keys are only associated with `lora_B`.
@@ -195,6 +196,21 @@ def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True
         "use_dora": use_dora,
         "lora_bias": lora_bias,
     }
+
+    # Example: try load FusionX LoRA into Wan VACE
+    exclude_modules = _derive_exclude_modules(model_state_dict, peft_state_dict, adapter_name)
+    if exclude_modules:
+        if not is_peft_version(">=", "0.14.0"):
+            msg = """
+            It seems like there are certain modules that need to be excluded when initializing `LoraConfig`. Your current `peft`
+            version doesn't support passing an `exclude_modules` to `LoraConfig`. Please update it by running `pip install -U
+            peft`. For most cases, this can be completely ignored. But if it seems unexpected, please file an issue -
+            https://github.com/huggingface/diffusers/issues/new
+            """
+            logger.debug(msg)
+        else:
+            lora_config_kwargs.update({"exclude_modules": exclude_modules})
+
     return lora_config_kwargs
 
 
@@ -294,19 +310,20 @@ def check_peft_version(min_version: str) -> None:
 
 
 def _create_lora_config(
-    state_dict,
-    network_alphas,
-    metadata,
-    rank_pattern_dict,
-    is_unet: bool = True,
+    state_dict, network_alphas, metadata, rank_pattern_dict, is_unet=True, model_state_dict=None, adapter_name=None
 ):
     from peft import LoraConfig
 
     if metadata is not None:
         lora_config_kwargs = metadata
     else:
         lora_config_kwargs = get_peft_kwargs(
-            rank_pattern_dict, network_alpha_dict=network_alphas, peft_state_dict=state_dict, is_unet=is_unet
+            rank_pattern_dict,
+            network_alpha_dict=network_alphas,
+            peft_state_dict=state_dict,
+            is_unet=is_unet,
+            model_state_dict=model_state_dict,
+            adapter_name=adapter_name,
         )
 
     _maybe_raise_error_for_ambiguous_keys(lora_config_kwargs)
@@ -371,3 +388,27 @@ def _maybe_warn_for_unhandled_keys(incompatible_keys, adapter_name):
 
     if warn_msg:
         logger.warning(warn_msg)
+
+
+def _derive_exclude_modules(model_state_dict, peft_state_dict, adapter_name=None):
+    """
+    Derives the modules to exclude while initializing `LoraConfig` through `exclude_modules`. It works by comparing
+    the `model_state_dict` and `peft_state_dict` and adds a module from `model_state_dict` to the exclusion set if it
+    doesn't exist in `peft_state_dict`.
+    """
+    if model_state_dict is None:
+        return
+    all_modules = set()
+    string_to_replace = f"{adapter_name}." if adapter_name else ""
+
+    for name in model_state_dict.keys():
+        if string_to_replace:
+            name = name.replace(string_to_replace, "")
+        if "." in name:
+            module_name = name.rsplit(".", 1)[0]
+            all_modules.add(module_name)
+
+    target_modules_set = {name.split(".lora")[0] for name in peft_state_dict.keys()}
+    exclude_modules = list(all_modules - target_modules_set)
+
+    return exclude_modules
return exclude_modules

tests/lora/test_lora_layers_cogvideox.py

Lines changed: 9 additions & 0 deletions

@@ -16,6 +16,7 @@
 import unittest
 
 import torch
+from parameterized import parameterized
 from transformers import AutoTokenizer, T5EncoderModel
 
 from diffusers import (
@@ -28,6 +29,7 @@
 from diffusers.utils.testing_utils import (
     floats_tensor,
     require_peft_backend,
+    require_torch_accelerator,
 )
 
 
@@ -127,6 +129,13 @@ def test_simple_inference_with_text_denoiser_lora_unfused(self):
     def test_lora_scale_kwargs_match_fusion(self):
         super().test_lora_scale_kwargs_match_fusion(expected_atol=9e-3, expected_rtol=9e-3)
 
+    @parameterized.expand([("block_level", True), ("leaf_level", False)])
+    @require_torch_accelerator
+    def test_group_offloading_inference_denoiser(self, offload_type, use_stream):
+        # TODO: We don't run the (leaf_level, True) test here that is enabled for other models.
+        # The reason for this can be found here: https://github.com/huggingface/diffusers/pull/11804#issuecomment-3013325338
+        super()._test_group_offloading_inference_denoiser(offload_type, use_stream)
+
     @unittest.skip("Not supported in CogVideoX.")
     def test_simple_inference_with_text_denoiser_block_scale(self):
         pass
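
For readers unfamiliar with the decorator: `parameterized.expand` generates one test method per tuple, so the class above gains a block_level variant and a leaf_level variant of the group-offloading test. A minimal, self-contained illustration, assuming only that the `parameterized` package is installed (it is unrelated to diffusers internals):

import unittest

from parameterized import parameterized


class ExpandDemo(unittest.TestCase):
    # Two test methods are generated, one per (offload_type, use_stream) tuple,
    # mirroring how the CogVideoX LoRA test above is parameterized.
    @parameterized.expand([("block_level", True), ("leaf_level", False)])
    def test_offload_combination(self, offload_type, use_stream):
        self.assertIn(offload_type, {"block_level", "leaf_level"})
        self.assertIsInstance(use_stream, bool)


if __name__ == "__main__":
    unittest.main()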
