From 4fc12e2846b2014eb23b55312e591a13e02e7307 Mon Sep 17 00:00:00 2001
From: bconstantine <bryan.constantine7@gmail.com>
Date: Fri, 28 Nov 2025 16:02:53 +0800
Subject: [PATCH 01/21] created test for pinning first and last block on device

---
 tests/hooks/test_group_offloading.py | 84 ++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 96cbecfbf530..1a8e6dddc46d 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -362,3 +362,87 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
             self.assertLess(
                 cumulated_absmax, 1e-5, f"Output differences for {name} exceeded threshold: {cumulated_absmax:.5f}"
             )
+
+    def test_block_level_pin_first_last_groups_stay_on_device(self):
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        def first_param_device(mod):
+            p = next(mod.parameters(), None)  # recurse=True by default
+            self.assertIsNotNone(p, f"No parameters found for module {mod}")
+            return p.device
+
+        def assert_all_modules_device(mods, expected_type: str, msg: str = ""):
+            bad = []
+            for i, m in enumerate(mods):
+                dev_type = first_param_device(m).type
+                if dev_type != expected_type:
+                    bad.append((i, m.__class__.__name__, dev_type))
+            self.assertFalse(
+                bad,
+                (msg + "\n" if msg else "")
+                + f"Expected all modules on {expected_type}, but found mismatches: {bad}",
+            )
+
+        def get_param_modules_from_exec_order(model):
+            root_registry = HookRegistry.check_if_exists_or_initialize(model)
+
+            lazy_hook = root_registry.get_hook("lazy_prefetch_group_offloading")
+            self.assertIsNotNone(lazy_hook, "lazy_prefetch_group_offloading hook was not registered")
+
+            with torch.no_grad():
+                #record execution order with first forward
+                model(self.input)
+
+            mods = [m for _, m in lazy_hook.execution_order]
+            param_mods = [m for m in mods if next(m.parameters(), None) is not None]
+            self.assertGreaterEqual(
+                len(param_mods), 2, f"Expected >=2 param-bearing modules in execution_order, got {len(param_mods)}"
+            )
+
+            first = param_mods[0]
+            last = param_mods[-1]
+            middle = param_mods[1:-1]  # <- ALL middle layers
+            return first, middle, last
+
+        accel_type = torch.device(torch_device).type
+
+        # -------------------------
+        # No pin: everything on CPU
+        # -------------------------
+        model_no_pin = self.get_model()
+        model_no_pin.enable_group_offload(
+            torch_device,
+            offload_type="block_level",
+            num_blocks_per_group=1,
+            use_stream=True,
+        )
+        model_no_pin.eval()
+        first, middle, last = get_param_modules_from_exec_order(model_no_pin)
+
+        self.assertEqual(first_param_device(first).type, "cpu")
+        self.assertEqual(first_param_device(last).type, "cpu")
+        assert_all_modules_device(middle, "cpu", msg="No-pin: expected ALL middle layers on CPU")
+
+        model_pin = self.get_model()
+        model_pin.enable_group_offload(
+            torch_device,
+            offload_type="block_level",
+            num_blocks_per_group=1,
+            use_stream=True,
+            pin_first_last=True,
+        )
+        model_pin.eval()
+        first, middle, last = get_param_modules_from_exec_order(model_pin)
+
+        self.assertEqual(first_param_device(first).type, accel_type)
+        self.assertEqual(first_param_device(last).type, accel_type)
+        assert_all_modules_device(middle, "cpu", msg="Pin: expected ALL middle layers on CPU")
+
+        # Should still hold after another invocation
+        with torch.no_grad():
+            model_pin(self.input)
+
+        self.assertEqual(first_param_device(first).type, accel_type)
+        self.assertEqual(first_param_device(last).type, accel_type)
+        assert_all_modules_device(middle, "cpu", msg="Pin (2nd forward): expected ALL middle layers on CPU")

From 93e6d311c788b8d6dc7ee1688bede2fee7fd03d5 Mon Sep 17 00:00:00 2001
From: bconstantine <bryan.constantine7@gmail.com>
Date: Fri, 28 Nov 2025 16:09:43 +0800
Subject: [PATCH 02/21] fix comments in tests for cleaner code

---
 tests/hooks/test_group_offloading.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 1a8e6dddc46d..00b8f2df98e5 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -368,7 +368,7 @@ def test_block_level_pin_first_last_groups_stay_on_device(self):
             return
 
         def first_param_device(mod):
-            p = next(mod.parameters(), None)  # recurse=True by default
+            p = next(mod.parameters(), None)
             self.assertIsNotNone(p, f"No parameters found for module {mod}")
             return p.device
 
@@ -390,8 +390,8 @@ def get_param_modules_from_exec_order(model):
             lazy_hook = root_registry.get_hook("lazy_prefetch_group_offloading")
             self.assertIsNotNone(lazy_hook, "lazy_prefetch_group_offloading hook was not registered")
 
+            #record execution order with first forward
             with torch.no_grad():
-                #record execution order with first forward
                 model(self.input)
 
             mods = [m for _, m in lazy_hook.execution_order]
@@ -402,14 +402,11 @@ def get_param_modules_from_exec_order(model):
 
             first = param_mods[0]
             last = param_mods[-1]
-            middle = param_mods[1:-1]  # <- ALL middle layers
-            return first, middle, last
+            middle_layers = param_mods[1:-1]
+            return first, middle_layers, last
 
         accel_type = torch.device(torch_device).type
 
-        # -------------------------
-        # No pin: everything on CPU
-        # -------------------------
         model_no_pin = self.get_model()
         model_no_pin.enable_group_offload(
             torch_device,

From 3455019349695db0abce88cc67068181b227c14d Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 03/21] Support explicit block modules in group offloading

---
 src/diffusers/hooks/group_offloading.py       | 171 +++++++++++++-----
 .../models/autoencoders/autoencoder_kl.py     |   1 +
 .../models/autoencoders/autoencoder_kl_wan.py |   1 +
 src/diffusers/models/modeling_utils.py        |   2 +
 4 files changed, 131 insertions(+), 44 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 38f291f5203c..f9189443ee0f 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -59,6 +59,7 @@ class GroupOffloadingConfig:
     num_blocks_per_group: Optional[int] = None
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
+    block_modules: Optional[List[str]] = None
 
 
 class ModuleGroup:
@@ -77,7 +78,7 @@ def __init__(
         low_cpu_mem_usage: bool = False,
         onload_self: bool = True,
         offload_to_disk_path: Optional[str] = None,
-        group_id: Optional[int] = None,
+        group_id: Optional[Union[int, str]] = None,
     ) -> None:
         self.modules = modules
         self.offload_device = offload_device
@@ -453,6 +454,7 @@ def apply_group_offloading(
     record_stream: bool = False,
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
+    block_modules: Optional[List[str]] = None,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -510,6 +512,9 @@ def apply_group_offloading(
             If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
+        block_modules (`List[str]`, *optional*):
+            List of module names that should be treated as blocks for offloading. If provided, only these modules
+            will be considered for block-level offloading. If not provided, the default block detection logic will be used.
 
     Example:
         ```python
@@ -561,6 +566,7 @@ def apply_group_offloading(
         record_stream=record_stream,
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
+        block_modules=block_modules,
     )
     _apply_group_offloading(module, config)
 
@@ -576,28 +582,123 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
-    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks. In comparison to
-    the "leaf_level" offloading, which is more fine-grained, this offloading is done at the top-level blocks.
-    """
+    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading
+    is done at the top-level blocks and modules specified in block_modules.
 
+    When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
+    module, we either offload the entire submodule or recursively apply block offloading to it.
+    """
     if config.stream is not None and config.num_blocks_per_group != 1:
         logger.warning(
             f"Using streams is only supported for num_blocks_per_group=1. Got {config.num_blocks_per_group=}. Setting it to 1."
         )
         config.num_blocks_per_group = 1
 
-    # Create module groups for ModuleList and Sequential blocks
+    block_modules = set(config.block_modules) if config.block_modules is not None else set()
+
+    # Create module groups for ModuleList and Sequential blocks, and explicitly defined block modules
     modules_with_group_offloading = set()
     unmatched_modules = []
     matched_module_groups = []
+
     for name, submodule in module.named_children():
-        if not isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+        # Check if this is an explicitly defined block module
+        if name in block_modules:
+            # Apply block offloading to the specified submodule
+            _apply_block_offloading_to_submodule(
+                submodule, name, config, modules_with_group_offloading, matched_module_groups
+            )
+        elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+            # Handle ModuleList and Sequential blocks as before
+            for i in range(0, len(submodule), config.num_blocks_per_group):
+                current_modules = list(submodule[i : i + config.num_blocks_per_group])
+                if len(current_modules) == 0:
+                    continue
+
+                group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+                group = ModuleGroup(
+                    modules=current_modules,
+                    offload_device=config.offload_device,
+                    onload_device=config.onload_device,
+                    offload_to_disk_path=config.offload_to_disk_path,
+                    offload_leader=current_modules[-1],
+                    onload_leader=current_modules[0],
+                    non_blocking=config.non_blocking,
+                    stream=config.stream,
+                    record_stream=config.record_stream,
+                    low_cpu_mem_usage=config.low_cpu_mem_usage,
+                    onload_self=True,
+                    group_id=group_id,
+                )
+                matched_module_groups.append(group)
+                for j in range(i, i + len(current_modules)):
+                    modules_with_group_offloading.add(f"{name}.{j}")
+        else:
+            # This is an unmatched module
             unmatched_modules.append((name, submodule))
-            modules_with_group_offloading.add(name)
-            continue
 
+    # Apply group offloading hooks to the module groups
+    for i, group in enumerate(matched_module_groups):
+        for group_module in group.modules:
+            _apply_group_offloading_hook(group_module, group, config=config)
+
+    # Parameters and Buffers of the top-level module need to be offloaded/onloaded separately
+    # when the forward pass of this module is called. This is because the top-level module is not
+    # part of any group (as doing so would lead to no VRAM savings).
+    parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
+    buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)
+    parameters = [param for _, param in parameters]
+    buffers = [buffer for _, buffer in buffers]
+
+    # Create a group for the remaining unmatched submodules of the top-level
+    # module so that they are on the correct device when the forward pass is called.
+    unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules]
+    if len(unmatched_modules) > 0 or len(parameters) > 0 or len(buffers) > 0:
+        unmatched_group = ModuleGroup(
+            modules=unmatched_modules,
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
+            offload_leader=module,
+            onload_leader=module,
+            parameters=parameters,
+            buffers=buffers,
+            non_blocking=False,
+            stream=None,
+            record_stream=False,
+            onload_self=True,
+            group_id=f"{module.__class__.__name__}_unmatched_group",
+        )
+        if config.stream is None:
+            _apply_group_offloading_hook(module, unmatched_group, config=config)
+        else:
+            _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+
+
+def _apply_block_offloading_to_submodule(
+    submodule: torch.nn.Module,
+    name: str,
+    config: GroupOffloadingConfig,
+    modules_with_group_offloading: Set[str],
+    matched_module_groups: List[ModuleGroup],
+) -> None:
+    r"""
+    Apply block offloading to a explicitly defined submodule. This function either:
+    1. Offloads the entire submodule as a single group ( SIMPLE APPROACH)
+    2. Recursively applies block offloading to the submodule
+
+    For now, we use the simple approach - offload the entire submodule as a single group.
+    """
+    # Simple approach: offload the entire submodule as a single group
+    # Since AEs are typically small, this is usually okay
+    if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+        # If it's a ModuleList or Sequential, apply the normal block-level logic
         for i in range(0, len(submodule), config.num_blocks_per_group):
-            current_modules = submodule[i : i + config.num_blocks_per_group]
+            current_modules = list(submodule[i : i + config.num_blocks_per_group])
+            if len(current_modules) == 0:
+                continue
+
             group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
             group = ModuleGroup(
                 modules=current_modules,
@@ -616,42 +717,24 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             matched_module_groups.append(group)
             for j in range(i, i + len(current_modules)):
                 modules_with_group_offloading.add(f"{name}.{j}")
-
-    # Apply group offloading hooks to the module groups
-    for i, group in enumerate(matched_module_groups):
-        for group_module in group.modules:
-            _apply_group_offloading_hook(group_module, group, config=config)
-
-    # Parameters and Buffers of the top-level module need to be offloaded/onloaded separately
-    # when the forward pass of this module is called. This is because the top-level module is not
-    # part of any group (as doing so would lead to no VRAM savings).
-    parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
-    buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)
-    parameters = [param for _, param in parameters]
-    buffers = [buffer for _, buffer in buffers]
-
-    # Create a group for the unmatched submodules of the top-level module so that they are on the correct
-    # device when the forward pass is called.
-    unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules]
-    unmatched_group = ModuleGroup(
-        modules=unmatched_modules,
-        offload_device=config.offload_device,
-        onload_device=config.onload_device,
-        offload_to_disk_path=config.offload_to_disk_path,
-        offload_leader=module,
-        onload_leader=module,
-        parameters=parameters,
-        buffers=buffers,
-        non_blocking=False,
-        stream=None,
-        record_stream=False,
-        onload_self=True,
-        group_id=f"{module.__class__.__name__}_unmatched_group",
-    )
-    if config.stream is None:
-        _apply_group_offloading_hook(module, unmatched_group, config=config)
     else:
-        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+        # For other modules, treat the entire submodule as a single group
+        group = ModuleGroup(
+            modules=[submodule],
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
+            offload_leader=submodule,
+            onload_leader=submodule,
+            non_blocking=config.non_blocking,
+            stream=config.stream,
+            record_stream=config.record_stream,
+            low_cpu_mem_usage=config.low_cpu_mem_usage,
+            onload_self=True,
+            group_id=name,
+        )
+        matched_module_groups.append(group)
+        modules_with_group_offloading.add(name)
 
 
 def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index ffc8778e7aca..4096b7c07609 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -72,6 +72,7 @@ class AutoencoderKL(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModel
 
     _supports_gradient_checkpointing = True
     _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
+    _group_offload_block_modules = ["quant_conv", "post_quant_conv", "encoder", "decoder"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
index b0b2960aaf18..6b29a6273cd9 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
@@ -964,6 +964,7 @@ class AutoencoderKLWan(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalMo
     # keys toignore when AlignDeviceHook moves inputs/outputs between devices
     # these are shared mutable state modified in-place
     _skip_keys = ["feat_cache", "feat_idx"]
+    _group_offload_block_modules = ["quant_conv", "post_quant_conv", "encoder", "decoder"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index f06822c741ca..5cee737d0b2e 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -570,6 +570,7 @@ def enable_group_offload(
                 f"`_supports_group_offloading` to `True` in the class definition. If you believe this is a mistake, please "
                 f"open an issue at https://github.com/huggingface/diffusers/issues."
             )
+        block_modules = getattr(self, "_group_offload_block_modules", None)
         apply_group_offloading(
             module=self,
             onload_device=onload_device,
@@ -581,6 +582,7 @@ def enable_group_offload(
             record_stream=record_stream,
             low_cpu_mem_usage=low_cpu_mem_usage,
             offload_to_disk_path=offload_to_disk_path,
+            block_modules=block_modules,
         )
 
     def set_attention_backend(self, backend: str) -> None:

From 9c3c14f52aa74f7dc2e93d91e000feeba04239c8 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 04/21] Add pinning support to group offloading hooks

---
 src/diffusers/hooks/group_offloading.py | 112 +++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index f9189443ee0f..8b6d734f1e3f 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -17,7 +17,7 @@
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
 import safetensors.torch
 import torch
@@ -60,6 +60,7 @@ class GroupOffloadingConfig:
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
     block_modules: Optional[List[str]] = None
+    pin_groups: Optional[Union[str, Callable]] = None
 
 
 class ModuleGroup:
@@ -92,6 +93,7 @@ def __init__(
         self.record_stream = record_stream
         self.onload_self = onload_self
         self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.pinned = False
 
         self.offload_to_disk_path = offload_to_disk_path
         self._is_offloaded_to_disk = False
@@ -297,6 +299,24 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
         if self.group.onload_leader is None:
             self.group.onload_leader = module
 
+        if self.group.pinned:
+            if self.group.onload_leader == module and not self._is_group_on_device():
+                self.group.onload_()
+
+            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
+            if should_onload_next_group:
+                self.next_group.onload_()
+
+            should_synchronize = (
+                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+            )
+            if should_synchronize:
+                self.group.stream.synchronize()
+
+            args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
+            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+            return args, kwargs
+
         # If the current module is the onload_leader of the group, we onload the group if it is supposed
         # to onload itself. In the case of using prefetching with streams, we onload the next group if
         # it is not supposed to onload itself.
@@ -325,10 +345,26 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
+        if self.group.pinned:
+            return output
+
         if self.group.offload_leader == module:
             self.group.offload_()
         return output
 
+    def _is_group_on_device(self) -> bool:
+        tensors = []
+        for group_module in self.group.modules:
+            tensors.extend(list(group_module.parameters()))
+            tensors.extend(list(group_module.buffers()))
+        tensors.extend(self.group.parameters)
+        tensors.extend(self.group.buffers)
+
+        if len(tensors) == 0:
+            return True
+
+        return all(t.device == self.group.onload_device for t in tensors)
+
 
 class LazyPrefetchGroupOffloadingHook(ModelHook):
     r"""
@@ -424,6 +460,51 @@ def post_forward(self, module, output):
             group_offloading_hooks[i].next_group = group_offloading_hooks[i + 1].group
             group_offloading_hooks[i].next_group.onload_self = False
 
+        pin_groups = getattr(base_module_registry, "_group_offload_pin_groups", None)
+        if pin_groups is not None and num_executed > 0:
+            param_exec_info = []
+            for idx, ((name, submodule), hook) in enumerate(zip(self.execution_order, group_offloading_hooks)):
+                if hook is None:
+                    continue
+                if next(submodule.parameters(), None) is None and next(submodule.buffers(), None) is None:
+                    continue
+                param_exec_info.append((name, submodule, hook))
+
+            num_param_modules = len(param_exec_info)
+            if num_param_modules > 0:
+                pinned_indices = set()
+                if isinstance(pin_groups, str):
+                    if pin_groups == "all":
+                        pinned_indices = set(range(num_param_modules))
+                    elif pin_groups == "first_last":
+                        pinned_indices.add(0)
+                        pinned_indices.add(num_param_modules - 1)
+                elif callable(pin_groups):
+                    for idx, (name, submodule, _) in enumerate(param_exec_info):
+                        should_pin = False
+                        try:
+                            should_pin = bool(pin_groups(submodule))
+                        except TypeError:
+                            try:
+                                should_pin = bool(pin_groups(name, submodule))
+                            except TypeError:
+                                should_pin = bool(pin_groups(name, submodule, idx))
+                        if should_pin:
+                            pinned_indices.add(idx)
+
+                pinned_groups = set()
+                for idx in pinned_indices:
+                    if idx >= num_param_modules:
+                        continue
+                    group = param_exec_info[idx][2].group
+                    if group not in pinned_groups:
+                        group.pinned = True
+                        pinned_groups.add(group)
+
+                for group in pinned_groups:
+                    if group.offload_device != group.onload_device:
+                        group.onload_()
+
         return output
 
 
@@ -455,6 +536,8 @@ def apply_group_offloading(
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
     block_modules: Optional[List[str]] = None,
+    pin_groups: Optional[Union[str, Callable]] = None,
+    pin_first_last: bool = False,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -515,6 +598,12 @@ def apply_group_offloading(
         block_modules (`List[str]`, *optional*):
             List of module names that should be treated as blocks for offloading. If provided, only these modules
             will be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
+            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
+            and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
+            receives a module (and optionally the module name and index) and returns `True` to pin that group.
+        pin_first_last (`bool`, *optional*, defaults to `False`):
+            Deprecated alias for `pin_groups="first_last"`.
 
     Example:
         ```python
@@ -554,7 +643,24 @@ def apply_group_offloading(
     if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
         raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.")
 
+    if pin_first_last:
+        if pin_groups is not None and pin_groups != "first_last":
+            raise ValueError("`pin_first_last` cannot be combined with a different `pin_groups` setting.")
+        pin_groups = "first_last"
+
+    normalized_pin_groups = pin_groups
+    if isinstance(pin_groups, str):
+        normalized_pin_groups = pin_groups.lower()
+        if normalized_pin_groups not in {"first_last", "all"}:
+            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    elif pin_groups is not None and not callable(pin_groups):
+        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+
+    pin_groups = normalized_pin_groups
+
     _raise_error_if_accelerate_model_or_sequential_hook_present(module)
+    registry = HookRegistry.check_if_exists_or_initialize(module)
+    registry._group_offload_pin_groups = pin_groups
 
     config = GroupOffloadingConfig(
         onload_device=onload_device,
@@ -567,11 +673,15 @@ def apply_group_offloading(
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
         block_modules=block_modules,
+        pin_groups=pin_groups,
     )
     _apply_group_offloading(module, config)
 
 
 def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
+    registry = HookRegistry.check_if_exists_or_initialize(module)
+    registry._group_offload_pin_groups = config.pin_groups
+
     if config.offload_type == GroupOffloadingType.BLOCK_LEVEL:
         _apply_group_offloading_block_level(module, config)
     elif config.offload_type == GroupOffloadingType.LEAF_LEVEL:

From 3b3813d7af04194da04144d919a4b86b7fc79dbf Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 05/21] Expose group offload pinning options in API

---
 src/diffusers/models/modeling_utils.py    | 4 ++++
 src/diffusers/pipelines/pipeline_utils.py | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 5cee737d0b2e..86d2024f0a95 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -531,6 +531,8 @@ def enable_group_offload(
         record_stream: bool = False,
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
+        pin_groups: Optional[Union[str, Callable]] = None,
+        pin_first_last: bool = False,
     ) -> None:
         r"""
         Activates group offloading for the current model.
@@ -583,6 +585,8 @@ def enable_group_offload(
             low_cpu_mem_usage=low_cpu_mem_usage,
             offload_to_disk_path=offload_to_disk_path,
             block_modules=block_modules,
+            pin_groups=pin_groups,
+            pin_first_last=pin_first_last,
         )
 
     def set_attention_backend(self, backend: str) -> None:
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 392d5fb3feb4..d0fab44a6187 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1342,6 +1342,8 @@ def enable_group_offload(
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
         exclude_modules: Optional[Union[str, List[str]]] = None,
+        pin_groups: Optional[Union[str, Callable]] = None,
+        pin_first_last: bool = False,
     ) -> None:
         r"""
         Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is,
@@ -1402,6 +1404,11 @@ def enable_group_offload(
                 This option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be
                 useful when the CPU memory is a bottleneck but may counteract the benefits of using streams.
             exclude_modules (`Union[str, List[str]]`, defaults to `None`): List of modules to exclude from offloading.
+            pin_groups (`\"first_last\"` | `\"all\"` | `Callable`, *optional*):
+                Optionally keep selected groups on the onload device permanently. See `ModelMixin.enable_group_offload`
+                for details.
+            pin_first_last (`bool`, *optional*, defaults to `False`):
+                Deprecated alias for `pin_groups=\"first_last\"`.
 
         Example:
             ```python
@@ -1442,6 +1449,8 @@ def enable_group_offload(
             "record_stream": record_stream,
             "low_cpu_mem_usage": low_cpu_mem_usage,
             "offload_to_disk_path": offload_to_disk_path,
+            "pin_groups": pin_groups,
+            "pin_first_last": pin_first_last,
         }
         for name, component in self.components.items():
             if name not in exclude_modules and isinstance(component, torch.nn.Module):

From b9e0994c5f87f6999f7b7704ed8e5c11e8614dfa Mon Sep 17 00:00:00 2001
From: bconstantine <bryan.constantine7@gmail.com>
Date: Fri, 28 Nov 2025 16:02:53 +0800
Subject: [PATCH 06/21] created test for pinning first and last block on device

---
 tests/hooks/test_group_offloading.py | 401 ++++++++++++++-------------
 1 file changed, 208 insertions(+), 193 deletions(-)

diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 236094109d07..58520bef9aa5 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -19,13 +19,14 @@
 import torch
 from parameterized import parameterized
 
-from diffusers import AutoencoderKL
 from diffusers.hooks import HookRegistry, ModelHook
 from diffusers.models import ModelMixin
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.utils import get_logger
 from diffusers.utils.import_utils import compare_versions
 
+from typing import Any, Iterable, List, Optional, Sequence, Union
+
 from ..testing_utils import (
     backend_empty_cache,
     backend_max_memory_allocated,
@@ -148,74 +149,66 @@ def __init__(self):
     def post_forward(self, module, output):
         self.outputs.append(output)
         return output
-
-
-# Model with only standalone computational layers at top level
-class DummyModelWithStandaloneLayers(ModelMixin):
-    def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
-        super().__init__()
-
-        self.layer1 = torch.nn.Linear(in_features, hidden_features)
-        self.activation = torch.nn.ReLU()
-        self.layer2 = torch.nn.Linear(hidden_features, hidden_features)
-        self.layer3 = torch.nn.Linear(hidden_features, out_features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.layer1(x)
-        x = self.activation(x)
-        x = self.layer2(x)
-        x = self.layer3(x)
-        return x
-
-
-# Model with deeply nested structure
-class DummyModelWithDeeplyNestedBlocks(ModelMixin):
-    def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
-        super().__init__()
-
-        self.input_layer = torch.nn.Linear(in_features, hidden_features)
-        self.container = ContainerWithNestedModuleList(hidden_features)
-        self.output_layer = torch.nn.Linear(hidden_features, out_features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.input_layer(x)
-        x = self.container(x)
-        x = self.output_layer(x)
-        return x
-
-
-class ContainerWithNestedModuleList(torch.nn.Module):
-    def __init__(self, features: int) -> None:
-        super().__init__()
-
-        # Top-level computational layer
-        self.proj_in = torch.nn.Linear(features, features)
-
-        # Nested container with ModuleList
-        self.nested_container = NestedContainer(features)
-
-        # Another top-level computational layer
-        self.proj_out = torch.nn.Linear(features, features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.proj_in(x)
-        x = self.nested_container(x)
-        x = self.proj_out(x)
-        return x
-
-
-class NestedContainer(torch.nn.Module):
-    def __init__(self, features: int) -> None:
-        super().__init__()
-
-        self.blocks = torch.nn.ModuleList([torch.nn.Linear(features, features), torch.nn.Linear(features, features)])
-        self.norm = torch.nn.LayerNorm(features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        for block in self.blocks:
-            x = block(x)
-        x = self.norm(x)
-        return x
+    
+
+# Test for https://github.com/huggingface/diffusers/pull/12747
+class DummyCallableBySubmodule:
+    """
+    Callable group offloading pinner that pins first and last DummyBlock
+    called in the program by callable(submodule)
+    """
+    def __init__(self, pin_targets: Iterable[torch.nn.Module]) -> None:
+        self.pin_targets = set(pin_targets)
+        self.calls_track = [] # testing only
+
+    def __call__(self, submodule: torch.nn.Module) -> bool:
+        self.calls_track.append(submodule)
+        return self._normalize_module_type(submodule) in self.pin_targets
+    
+    def _normalize_module_type(self, obj: Any) -> Optional[torch.nn.Module]:
+        # group might be a single module, or a container of modules
+        # The group-offloading code may pass either:
+        #   - a single `torch.nn.Module`, or
+        #   - a container (list/tuple) of modules.
+
+        # Only return a module when the mapping is unambiguous:
+        #   - if `obj` is a module -> return it
+        #   - if `obj` is a list/tuple containing exactly one module -> return that module
+        #   - otherwise -> return None (won't be considered as a target candidate)
+        if isinstance(obj, torch.nn.Module):
+            return obj
+        if isinstance(obj, (list, tuple)):
+            mods = [m for m in obj if isinstance(m, torch.nn.Module)]
+            return mods[0] if len(mods) == 1 else None
+        return None
+    
+class DummyCallableByNameSubmodule(DummyCallableBySubmodule):
+    """
+    Callable group offloading pinner that pins first and last DummyBlock
+    Same behaviour with DummyCallableBySubmodule, only with different call signature
+    called in the program by callable(name, submodule)
+    """
+    def __call__(self, name: str, submodule: torch.nn.Module) -> bool:
+        self.calls_track.append((name, submodule))
+        return self._normalize_module_type(submodule) in self.pin_targets
+    
+class DummyCallableByNameSubmoduleIdx(DummyCallableBySubmodule):
+    """
+    Callable group offloading pinner that pins first and last DummyBlock.
+    Same behaviour with DummyCallableBySubmodule, only with different call signature
+    Called in the program by callable(name, submodule, idx)
+    """
+    def __call__(self, name: str, submodule: torch.nn.Module, idx: int) -> bool:
+        self.calls_track.append((name, submodule, idx))
+        return self._normalize_module_type(submodule) in self.pin_targets
+    
+class DummyInvalidCallable(DummyCallableBySubmodule):
+    """
+    Callable group offloading pinner that uses invalid call signature
+    """
+    def __call__(self, name: str, submodule: torch.nn.Module, idx: int, extra: Any) -> bool:
+        self.calls_track.append((name, submodule, idx, extra))
+        return self._normalize_module_type(submodule) in self.pin_targets
 
 
 @require_torch_accelerator
@@ -409,7 +402,7 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
         out = model(x)
         self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match.")
 
-        num_repeats = 2
+        num_repeats = 4
         for i in range(num_repeats):
             out_ref = model_ref(x)
             out = model(x)
@@ -431,138 +424,160 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
             self.assertLess(
                 cumulated_absmax, 1e-5, f"Output differences for {name} exceeded threshold: {cumulated_absmax:.5f}"
             )
-
-    def test_vae_like_model_without_streams(self):
-        """Test VAE-like model with block-level offloading but without streams."""
+    
+    def test_block_level_offloading_with_pin_groups_stay_on_device(self):
         if torch.device(torch_device).type not in ["cuda", "xpu"]:
             return
 
-        config = self.get_autoencoder_kl_config()
-        model = AutoencoderKL(**config)
-
-        model_ref = AutoencoderKL(**config)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=False)
-
-        x = torch.randn(2, 3, 32, 32).to(torch_device)
-
-        with torch.no_grad():
-            out_ref = model_ref(x).sample
-            out = model(x).sample
+        def assert_all_modules_on_expected_device(modules: Sequence[torch.nn.Module],
+                                                  expected_device: Union[torch.device, str],
+                                                  header_error_msg: str = "") -> None:
+            def first_param_device(modules: torch.nn.Module) -> torch.device:
+                p = next(modules.parameters(), None)
+                self.assertIsNotNone(p, f"No parameters found for module {modules}")
+                return p.device
+
+            if isinstance(expected_device, torch.device):
+                expected_device = expected_device.type
+            
+            bad = []
+            for i, m in enumerate(modules):
+                dev_type = first_param_device(m).type
+                if dev_type != expected_device:
+                    bad.append((i, m.__class__.__name__, dev_type))
+            self.assertTrue(
+                len(bad) == 0,
+                (header_error_msg + "\n" if header_error_msg else "")
+                + f"Expected all modules on {expected_device}, but found mismatches: {bad}",
+            )
 
-        self.assertTrue(
-            torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match for VAE-like model without streams."
+        def get_param_modules_from_execution_order(model: DummyModel) -> List[torch.nn.Module]:
+            model.eval()
+            root_registry = HookRegistry.check_if_exists_or_initialize(model)
+
+            lazy_hook = root_registry.get_hook("lazy_prefetch_group_offloading")
+            self.assertIsNotNone(lazy_hook, "lazy_prefetch_group_offloading hook was not registered")
+
+            #record execution order with first forward
+            with torch.no_grad():
+                model(self.input)
+
+            mods = [m for _, m in lazy_hook.execution_order]
+            param_modules = [m for m in mods if next(m.parameters(), None) is not None]
+            return param_modules
+        
+        def assert_callables_offloading_tests(
+            param_modules: Sequence[torch.nn.Module],
+            callable: Any,
+            header_error_msg: str = "",
+        ) -> None:
+            pinned_modules = [m for m in param_modules if m in callable.pin_targets]
+            unpinned_modules = [m for m in param_modules if m not in callable.pin_targets]
+            self.assertTrue(len(callable.calls_track) > 0, f"{header_error_msg}: callable should have been called at least once")
+            assert_all_modules_on_expected_device(pinned_modules, torch_device, f"{header_error_msg}: pinned blocks should stay on device")
+            assert_all_modules_on_expected_device(unpinned_modules, "cpu", f"{header_error_msg}: unpinned blocks should be offloaded")
+
+
+        default_parameters = {
+            "onload_device": torch_device,
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
+        }
+        model_default_no_pin = self.get_model()
+        model_default_no_pin.enable_group_offload(
+            **default_parameters
         )
-
-    def test_model_with_only_standalone_layers(self):
-        """Test that models with only standalone layers (no ModuleList/Sequential) work with block-level offloading."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        model = DummyModelWithStandaloneLayers(in_features=64, hidden_features=128, out_features=64)
-
-        model_ref = DummyModelWithStandaloneLayers(in_features=64, hidden_features=128, out_features=64)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
-
-        x = torch.randn(2, 64).to(torch_device)
-
-        with torch.no_grad():
-            for i in range(2):
-                out_ref = model_ref(x)
-                out = model(x)
-                self.assertTrue(
-                    torch.allclose(out_ref, out, atol=1e-5),
-                    f"Outputs do not match at iteration {i} for model with standalone layers.",
-                )
-
-    @parameterized.expand([("block_level",), ("leaf_level",)])
-    def test_standalone_conv_layers_with_both_offload_types(self, offload_type: str):
-        """Test that standalone Conv2d layers work correctly with both block-level and leaf-level offloading."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        config = self.get_autoencoder_kl_config()
-        model = AutoencoderKL(**config)
-
-        model_ref = AutoencoderKL(**config)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type=offload_type, num_blocks_per_group=1, use_stream=True)
-
-        x = torch.randn(2, 3, 32, 32).to(torch_device)
-
-        with torch.no_grad():
-            out_ref = model_ref(x).sample
-            out = model(x).sample
-
-        self.assertTrue(
-            torch.allclose(out_ref, out, atol=1e-5),
-            f"Outputs do not match for standalone Conv layers with {offload_type}.",
+        param_modules = get_param_modules_from_execution_order(model_default_no_pin)
+        assert_all_modules_on_expected_device(param_modules, 
+                                              expected_device="cpu", 
+                                              header_error_msg="default pin_groups: expected ALL modules offloaded to CPU")
+
+        model_pin_all = self.get_model()
+        model_pin_all.enable_group_offload(
+            **default_parameters,
+            pin_groups="all",
         )
+        param_modules = get_param_modules_from_execution_order(model_pin_all)
+        assert_all_modules_on_expected_device(param_modules, 
+                                              expected_device=torch_device, 
+                                              header_error_msg="pin_groups = all: expected ALL layers on accelerator device")
 
-    def test_multiple_invocations_with_vae_like_model(self):
-        """Test that multiple forward passes work correctly with VAE-like model."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        config = self.get_autoencoder_kl_config()
-        model = AutoencoderKL(**config)
 
-        model_ref = AutoencoderKL(**config)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
-
-        x = torch.randn(2, 3, 32, 32).to(torch_device)
-
-        with torch.no_grad():
-            for i in range(2):
-                out_ref = model_ref(x).sample
-                out = model(x).sample
-                self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), f"Outputs do not match at iteration {i}.")
-
-    def test_nested_container_parameters_offloading(self):
-        """Test that parameters from non-computational layers in nested containers are handled correctly."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        model = DummyModelWithDeeplyNestedBlocks(in_features=64, hidden_features=128, out_features=64)
-
-        model_ref = DummyModelWithDeeplyNestedBlocks(in_features=64, hidden_features=128, out_features=64)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
+        model_pin_first_last = self.get_model()
+        model_pin_first_last.enable_group_offload(
+            **default_parameters,
+            pin_groups="first_last",
+        )
+        param_modules = get_param_modules_from_execution_order(model_pin_first_last)
+        assert_all_modules_on_expected_device([param_modules[0], param_modules[-1]], 
+                                              expected_device=torch_device, 
+                                              header_error_msg="pin_groups = first_last: expected first and last layers on accelerator device")
+        assert_all_modules_on_expected_device(param_modules[1:-1], 
+                                              expected_device="cpu", 
+                                              header_error_msg="pin_groups = first_last: expected ALL middle layers offloaded to CPU")
+
+        
+        model = self.get_model()
+        callable_by_submodule = DummyCallableBySubmodule(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(**default_parameters, 
+                                   pin_groups=callable_by_submodule)
+        param_modules = get_param_modules_from_execution_order(model)
+        assert_callables_offloading_tests(param_modules, 
+                                          callable_by_submodule,
+                                          header_error_msg="pin_groups with callable(submodule)")
 
-        x = torch.randn(2, 64).to(torch_device)
+        model = self.get_model()
+        callable_by_name_submodule = DummyCallableByNameSubmodule(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(**default_parameters, 
+                                   pin_groups=callable_by_name_submodule)
+        param_modules = get_param_modules_from_execution_order(model)
+        assert_callables_offloading_tests(param_modules, 
+                                          callable_by_name_submodule,
+                                          header_error_msg="pin_groups with callable(name, submodule)")
 
-        with torch.no_grad():
-            for i in range(2):
-                out_ref = model_ref(x)
-                out = model(x)
-                self.assertTrue(
-                    torch.allclose(out_ref, out, atol=1e-5),
-                    f"Outputs do not match at iteration {i} for nested parameters.",
-                )
+        model = self.get_model()
+        callable_by_name_submodule_idx = DummyCallableByNameSubmoduleIdx(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(**default_parameters, 
+                                   pin_groups=callable_by_name_submodule_idx)
+        param_modules = get_param_modules_from_execution_order(model)
+        assert_callables_offloading_tests(param_modules, 
+                                          callable_by_name_submodule_idx,
+                                          header_error_msg="pin_groups with callable(name, submodule, idx)")
+        
+    def test_error_raised_if_pin_groups_received_invalid_value(self):
+        default_parameters = {
+            "onload_device": torch_device,
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
+        }
+        model = self.get_model()
+        with self.assertRaisesRegex(ValueError, 
+                                    "`pin_groups` must be one of `None`, 'first_last', 'all', or a callable."):
+            model.enable_group_offload(
+                **default_parameters,
+                pin_groups="invalid value",
+            )
 
-    def get_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None):
-        block_out_channels = block_out_channels or [2, 4]
-        norm_num_groups = norm_num_groups or 2
-        init_dict = {
-            "block_out_channels": block_out_channels,
-            "in_channels": 3,
-            "out_channels": 3,
-            "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
-            "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels),
-            "latent_channels": 4,
-            "norm_num_groups": norm_num_groups,
-            "layers_per_block": 1,
+    def test_error_raised_if_pin_groups_received_invalid_callables(self):
+        default_parameters = {
+            "onload_device": torch_device,
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
         }
-        return init_dict
+        model = self.get_model()
+        invalid_callable = DummyInvalidCallable(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(
+            **default_parameters,
+            pin_groups=invalid_callable,
+        )
+        with self.assertRaisesRegex(TypeError, 
+                                    r"missing\s+\d+\s+required\s+positional\s+argument(s)?:"):
+            with torch.no_grad():
+                model(self.input)
+            
+        
+
+        
\ No newline at end of file

From a99755a74d3d586f08778d61f76b53de650652f9 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 07/21] Support explicit block modules in group offloading

---
 src/diffusers/hooks/group_offloading.py | 242 ++++++++++++++++++------
 src/diffusers/models/modeling_utils.py  |   7 +-
 2 files changed, 186 insertions(+), 63 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 47f1f4199615..36b09cb692dc 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -15,9 +15,9 @@
 import hashlib
 import os
 from contextlib import contextmanager, nullcontext
-from dataclasses import dataclass, replace
+from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
 import safetensors.torch
 import torch
@@ -60,8 +60,7 @@ class GroupOffloadingConfig:
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
     block_modules: Optional[List[str]] = None
-    exclude_kwargs: Optional[List[str]] = None
-    module_prefix: Optional[str] = ""
+    pin_groups: Optional[Union[str, Callable]] = None
 
 
 class ModuleGroup:
@@ -94,6 +93,7 @@ def __init__(
         self.record_stream = record_stream
         self.onload_self = onload_self
         self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.pinned = False
 
         self.offload_to_disk_path = offload_to_disk_path
         self._is_offloaded_to_disk = False
@@ -156,27 +156,27 @@ def _pinned_memory_tensors(self):
         finally:
             pinned_dict = None
 
-    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream):
+    def _transfer_tensor_to_device(self, tensor, source_tensor):
         tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
         if self.record_stream:
-            tensor.data.record_stream(default_stream)
+            tensor.data.record_stream(self._torch_accelerator_module.current_stream())
 
-    def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None):
+    def _process_tensors_from_modules(self, pinned_memory=None):
         for group_module in self.modules:
             for param in group_module.parameters():
                 source = pinned_memory[param] if pinned_memory else param.data
-                self._transfer_tensor_to_device(param, source, default_stream)
+                self._transfer_tensor_to_device(param, source)
             for buffer in group_module.buffers():
                 source = pinned_memory[buffer] if pinned_memory else buffer.data
-                self._transfer_tensor_to_device(buffer, source, default_stream)
+                self._transfer_tensor_to_device(buffer, source)
 
         for param in self.parameters:
             source = pinned_memory[param] if pinned_memory else param.data
-            self._transfer_tensor_to_device(param, source, default_stream)
+            self._transfer_tensor_to_device(param, source)
 
         for buffer in self.buffers:
             source = pinned_memory[buffer] if pinned_memory else buffer.data
-            self._transfer_tensor_to_device(buffer, source, default_stream)
+            self._transfer_tensor_to_device(buffer, source)
 
     def _onload_from_disk(self):
         if self.stream is not None:
@@ -211,12 +211,10 @@ def _onload_from_memory(self):
             self.stream.synchronize()
 
         context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
-        default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None
-
         with context:
             if self.stream is not None:
                 with self._pinned_memory_tensors() as pinned_memory:
-                    self._process_tensors_from_modules(pinned_memory, default_stream=default_stream)
+                    self._process_tensors_from_modules(pinned_memory)
             else:
                 self._process_tensors_from_modules(None)
 
@@ -301,6 +299,24 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
         if self.group.onload_leader is None:
             self.group.onload_leader = module
 
+        if self.group.pinned:
+            if self.group.onload_leader == module and not self._is_group_on_device():
+                self.group.onload_()
+
+            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
+            if should_onload_next_group:
+                self.next_group.onload_()
+
+            should_synchronize = (
+                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+            )
+            if should_synchronize:
+                self.group.stream.synchronize()
+
+            args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
+            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+            return args, kwargs
+
         # If the current module is the onload_leader of the group, we onload the group if it is supposed
         # to onload itself. In the case of using prefetching with streams, we onload the next group if
         # it is not supposed to onload itself.
@@ -325,28 +341,30 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.group.stream.synchronize()
 
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-
-        # Some Autoencoder models use a feature cache that is passed through submodules
-        # and modified in place. The `send_to_device` call returns a copy of this feature cache object
-        # which breaks the inplace updates. Use `exclude_kwargs` to mark these cache features
-        exclude_kwargs = self.config.exclude_kwargs or []
-        if exclude_kwargs:
-            moved_kwargs = send_to_device(
-                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
-                self.group.onload_device,
-                non_blocking=self.group.non_blocking,
-            )
-            kwargs.update(moved_kwargs)
-        else:
-            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
-
+        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
+        if self.group.pinned:
+            return output
+
         if self.group.offload_leader == module:
             self.group.offload_()
         return output
 
+    def _is_group_on_device(self) -> bool:
+        tensors = []
+        for group_module in self.group.modules:
+            tensors.extend(list(group_module.parameters()))
+            tensors.extend(list(group_module.buffers()))
+        tensors.extend(self.group.parameters)
+        tensors.extend(self.group.buffers)
+
+        if len(tensors) == 0:
+            return True
+
+        return all(t.device == self.group.onload_device for t in tensors)
+
 
 class LazyPrefetchGroupOffloadingHook(ModelHook):
     r"""
@@ -358,9 +376,10 @@ class LazyPrefetchGroupOffloadingHook(ModelHook):
 
     _is_stateful = False
 
-    def __init__(self):
+    def __init__(self, pin_groups: Optional[Union[str, Callable]] = None):
         self.execution_order: List[Tuple[str, torch.nn.Module]] = []
         self._layer_execution_tracker_module_names = set()
+        self.pin_groups = pin_groups
 
     def initialize_hook(self, module):
         def make_execution_order_update_callback(current_name, current_submodule):
@@ -442,6 +461,50 @@ def post_forward(self, module, output):
             group_offloading_hooks[i].next_group = group_offloading_hooks[i + 1].group
             group_offloading_hooks[i].next_group.onload_self = False
 
+        if self.pin_groups is not None and num_executed > 0:
+            param_exec_info = []
+            for idx, ((name, submodule), hook) in enumerate(zip(self.execution_order, group_offloading_hooks)):
+                if hook is None:
+                    continue
+                if next(submodule.parameters(), None) is None and next(submodule.buffers(), None) is None:
+                    continue
+                param_exec_info.append((name, submodule, hook))
+
+            num_param_modules = len(param_exec_info)
+            if num_param_modules > 0:
+                pinned_indices = set()
+                if isinstance(self.pin_groups, str):
+                    if self.pin_groups == "all":
+                        pinned_indices = set(range(num_param_modules))
+                    elif self.pin_groups == "first_last":
+                        pinned_indices.add(0)
+                        pinned_indices.add(num_param_modules - 1)
+                elif callable(self.pin_groups):
+                    for idx, (name, submodule, _) in enumerate(param_exec_info):
+                        should_pin = False
+                        try:
+                            should_pin = bool(self.pin_groups(submodule))
+                        except TypeError:
+                            try:
+                                should_pin = bool(self.pin_groups(name, submodule))
+                            except TypeError:
+                                should_pin = bool(self.pin_groups(name, submodule, idx))
+                        if should_pin:
+                            pinned_indices.add(idx)
+
+                pinned_groups = set()
+                for idx in pinned_indices:
+                    if idx >= num_param_modules:
+                        continue
+                    group = param_exec_info[idx][2].group
+                    if group not in pinned_groups:
+                        group.pinned = True
+                        pinned_groups.add(group)
+
+                for group in pinned_groups:
+                    if group.offload_device != group.onload_device:
+                        group.onload_()
+
         return output
 
 
@@ -473,7 +536,7 @@ def apply_group_offloading(
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
     block_modules: Optional[List[str]] = None,
-    exclude_kwargs: Optional[List[str]] = None,
+    pin_groups: Optional[Union[str, Callable]] = None,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -532,12 +595,12 @@ def apply_group_offloading(
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
         block_modules (`List[str]`, *optional*):
-            List of module names that should be treated as blocks for offloading. If provided, only these modules will
-            be considered for block-level offloading. If not provided, the default block detection logic will be used.
-        exclude_kwargs (`List[str]`, *optional*):
-            List of kwarg keys that should not be processed by send_to_device. This is useful for mutable state like
-            caching lists that need to maintain their object identity across forward passes. If not provided, will be
-            inferred from the module's `_skip_keys` attribute if it exists.
+            List of module names that should be treated as blocks for offloading. If provided, only these modules
+            will be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
+            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
+            and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
+            receives a module (and optionally the module name and index) and returns `True` to pin that group.
 
     Example:
         ```python
@@ -577,13 +640,17 @@ def apply_group_offloading(
     if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
         raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.")
 
-    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
+    normalized_pin_groups = pin_groups
+    if isinstance(pin_groups, str):
+        normalized_pin_groups = pin_groups.lower()
+        if normalized_pin_groups not in {"first_last", "all"}:
+            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    elif pin_groups is not None and not callable(pin_groups):
+        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
 
-    if block_modules is None:
-        block_modules = getattr(module, "_group_offload_block_modules", None)
+    pin_groups = normalized_pin_groups
 
-    if exclude_kwargs is None:
-        exclude_kwargs = getattr(module, "_skip_keys", None)
+    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
 
     config = GroupOffloadingConfig(
         onload_device=onload_device,
@@ -596,7 +663,7 @@ def apply_group_offloading(
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
         block_modules=block_modules,
-        exclude_kwargs=exclude_kwargs,
+        pin_groups=pin_groups,
     )
     _apply_group_offloading(module, config)
 
@@ -613,11 +680,11 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
-    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is
-    done at the top-level blocks and modules specified in block_modules.
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading
+    is done at the top-level blocks and modules specified in block_modules.
 
     When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
-    module, recursively apply block offloading to it.
+    module, we either offload the entire submodule or recursively apply block offloading to it.
     """
     if config.stream is not None and config.num_blocks_per_group != 1:
         logger.warning(
@@ -635,15 +702,10 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
     for name, submodule in module.named_children():
         # Check if this is an explicitly defined block module
         if name in block_modules:
-            # Track submodule using a prefix to avoid filename collisions during disk offload.
-            # Without this, submodules sharing the same model class would be assigned identical
-            # filenames (derived from the class name).
-            prefix = f"{config.module_prefix}{name}." if config.module_prefix else f"{name}."
-            submodule_config = replace(config, module_prefix=prefix)
-
-            _apply_group_offloading_block_level(submodule, submodule_config)
-            modules_with_group_offloading.add(name)
-
+            # Apply block offloading to the specified submodule
+            _apply_block_offloading_to_submodule(
+                submodule, name, config, modules_with_group_offloading, matched_module_groups
+            )
         elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
             # Handle ModuleList and Sequential blocks as before
             for i in range(0, len(submodule), config.num_blocks_per_group):
@@ -651,7 +713,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
                 if len(current_modules) == 0:
                     continue
 
-                group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
+                group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
                 group = ModuleGroup(
                     modules=current_modules,
                     offload_device=config.offload_device,
@@ -672,6 +734,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
         else:
             # This is an unmatched module
             unmatched_modules.append((name, submodule))
+            modules_with_group_offloading.add(name)
 
     # Apply group offloading hooks to the module groups
     for i, group in enumerate(matched_module_groups):
@@ -703,7 +766,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             stream=None,
             record_stream=False,
             onload_self=True,
-            group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group",
+            group_id=f"{module.__class__.__name__}_unmatched_group",
         )
         if config.stream is None:
             _apply_group_offloading_hook(module, unmatched_group, config=config)
@@ -711,6 +774,67 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
 
 
+def _apply_block_offloading_to_submodule(
+    submodule: torch.nn.Module,
+    name: str,
+    config: GroupOffloadingConfig,
+    modules_with_group_offloading: Set[str],
+    matched_module_groups: List[ModuleGroup],
+) -> None:
+    r"""
+    Apply block offloading to a explicitly defined submodule. This function either:
+    1. Offloads the entire submodule as a single group ( SIMPLE APPROACH)
+    2. Recursively applies block offloading to the submodule
+
+    For now, we use the simple approach - offload the entire submodule as a single group.
+    """
+    # Simple approach: offload the entire submodule as a single group
+    # Since AEs are typically small, this is usually okay
+    if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+        # If it's a ModuleList or Sequential, apply the normal block-level logic
+        for i in range(0, len(submodule), config.num_blocks_per_group):
+            current_modules = list(submodule[i : i + config.num_blocks_per_group])
+            if len(current_modules) == 0:
+                continue
+
+            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+            group = ModuleGroup(
+                modules=current_modules,
+                offload_device=config.offload_device,
+                onload_device=config.onload_device,
+                offload_to_disk_path=config.offload_to_disk_path,
+                offload_leader=current_modules[-1],
+                onload_leader=current_modules[0],
+                non_blocking=config.non_blocking,
+                stream=config.stream,
+                record_stream=config.record_stream,
+                low_cpu_mem_usage=config.low_cpu_mem_usage,
+                onload_self=True,
+                group_id=group_id,
+            )
+            matched_module_groups.append(group)
+            for j in range(i, i + len(current_modules)):
+                modules_with_group_offloading.add(f"{name}.{j}")
+    else:
+        # For other modules, treat the entire submodule as a single group
+        group = ModuleGroup(
+            modules=[submodule],
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
+            offload_leader=submodule,
+            onload_leader=submodule,
+            non_blocking=config.non_blocking,
+            stream=config.stream,
+            record_stream=config.record_stream,
+            low_cpu_mem_usage=config.low_cpu_mem_usage,
+            onload_self=True,
+            group_id=name,
+        )
+        matched_module_groups.append(group)
+        modules_with_group_offloading.add(name)
+
+
 def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of leaf modules in a torch.nn.Module. This method has minimal memory
@@ -837,8 +961,8 @@ def _apply_lazy_group_offloading_hook(
     if registry.get_hook(_GROUP_OFFLOADING) is None:
         hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
-
-    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook()
+        
+    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups = config.pin_groups)
     registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING)
 
 
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 41da95d3a2a2..3263be4e046e 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -531,8 +531,7 @@ def enable_group_offload(
         record_stream: bool = False,
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
-        block_modules: Optional[str] = None,
-        exclude_kwargs: Optional[str] = None,
+        pin_groups: Optional[Union[str, Callable]] = None
     ) -> None:
         r"""
         Activates group offloading for the current model.
@@ -572,7 +571,7 @@ def enable_group_offload(
                 f"`_supports_group_offloading` to `True` in the class definition. If you believe this is a mistake, please "
                 f"open an issue at https://github.com/huggingface/diffusers/issues."
             )
-
+        block_modules = getattr(self, "_group_offload_block_modules", None)
         apply_group_offloading(
             module=self,
             onload_device=onload_device,
@@ -585,7 +584,7 @@ def enable_group_offload(
             low_cpu_mem_usage=low_cpu_mem_usage,
             offload_to_disk_path=offload_to_disk_path,
             block_modules=block_modules,
-            exclude_kwargs=exclude_kwargs,
+            pin_groups=pin_groups
         )
 
     def set_attention_backend(self, backend: str) -> None:

From ffad3163e2a0fdd0a6089a8f09a9f8e9a9727add Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 08/21] Expose group offload pinning options in API

---
 src/diffusers/pipelines/pipeline_utils.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 392d5fb3feb4..d0fab44a6187 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1342,6 +1342,8 @@ def enable_group_offload(
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
         exclude_modules: Optional[Union[str, List[str]]] = None,
+        pin_groups: Optional[Union[str, Callable]] = None,
+        pin_first_last: bool = False,
     ) -> None:
         r"""
         Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is,
@@ -1402,6 +1404,11 @@ def enable_group_offload(
                 This option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be
                 useful when the CPU memory is a bottleneck but may counteract the benefits of using streams.
             exclude_modules (`Union[str, List[str]]`, defaults to `None`): List of modules to exclude from offloading.
+            pin_groups (`\"first_last\"` | `\"all\"` | `Callable`, *optional*):
+                Optionally keep selected groups on the onload device permanently. See `ModelMixin.enable_group_offload`
+                for details.
+            pin_first_last (`bool`, *optional*, defaults to `False`):
+                Deprecated alias for `pin_groups=\"first_last\"`.
 
         Example:
             ```python
@@ -1442,6 +1449,8 @@ def enable_group_offload(
             "record_stream": record_stream,
             "low_cpu_mem_usage": low_cpu_mem_usage,
             "offload_to_disk_path": offload_to_disk_path,
+            "pin_groups": pin_groups,
+            "pin_first_last": pin_first_last,
         }
         for name, component in self.components.items():
             if name not in exclude_modules and isinstance(component, torch.nn.Module):

From 33d8b528a1b8b62795432b280e011d3dfd44633f Mon Sep 17 00:00:00 2001
From: bconstantine <bryan.constantine7@gmail.com>
Date: Sun, 30 Nov 2025 22:47:39 +0800
Subject: [PATCH 09/21] removed deprecated flag pin_first_last

---
 src/diffusers/pipelines/pipeline_utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index d0fab44a6187..a8084688d498 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1343,7 +1343,6 @@ def enable_group_offload(
         offload_to_disk_path: Optional[str] = None,
         exclude_modules: Optional[Union[str, List[str]]] = None,
         pin_groups: Optional[Union[str, Callable]] = None,
-        pin_first_last: bool = False,
     ) -> None:
         r"""
         Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is,
@@ -1407,8 +1406,6 @@ def enable_group_offload(
             pin_groups (`\"first_last\"` | `\"all\"` | `Callable`, *optional*):
                 Optionally keep selected groups on the onload device permanently. See `ModelMixin.enable_group_offload`
                 for details.
-            pin_first_last (`bool`, *optional*, defaults to `False`):
-                Deprecated alias for `pin_groups=\"first_last\"`.
 
         Example:
             ```python
@@ -1450,7 +1447,6 @@ def enable_group_offload(
             "low_cpu_mem_usage": low_cpu_mem_usage,
             "offload_to_disk_path": offload_to_disk_path,
             "pin_groups": pin_groups,
-            "pin_first_last": pin_first_last,
         }
         for name, component in self.components.items():
             if name not in exclude_modules and isinstance(component, torch.nn.Module):

From ed8a97ab790ce8984c6fd1b5a070557bee0d7358 Mon Sep 17 00:00:00 2001
From: bconstantine <bryan.constantine7@gmail.com>
Date: Fri, 28 Nov 2025 16:02:53 +0800
Subject: [PATCH 10/21] created test for pinning first and last block on device

---
 tests/hooks/test_group_offloading.py | 401 ++++++++++++++-------------
 1 file changed, 208 insertions(+), 193 deletions(-)

diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 236094109d07..58520bef9aa5 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -19,13 +19,14 @@
 import torch
 from parameterized import parameterized
 
-from diffusers import AutoencoderKL
 from diffusers.hooks import HookRegistry, ModelHook
 from diffusers.models import ModelMixin
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.utils import get_logger
 from diffusers.utils.import_utils import compare_versions
 
+from typing import Any, Iterable, List, Optional, Sequence, Union
+
 from ..testing_utils import (
     backend_empty_cache,
     backend_max_memory_allocated,
@@ -148,74 +149,66 @@ def __init__(self):
     def post_forward(self, module, output):
         self.outputs.append(output)
         return output
-
-
-# Model with only standalone computational layers at top level
-class DummyModelWithStandaloneLayers(ModelMixin):
-    def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
-        super().__init__()
-
-        self.layer1 = torch.nn.Linear(in_features, hidden_features)
-        self.activation = torch.nn.ReLU()
-        self.layer2 = torch.nn.Linear(hidden_features, hidden_features)
-        self.layer3 = torch.nn.Linear(hidden_features, out_features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.layer1(x)
-        x = self.activation(x)
-        x = self.layer2(x)
-        x = self.layer3(x)
-        return x
-
-
-# Model with deeply nested structure
-class DummyModelWithDeeplyNestedBlocks(ModelMixin):
-    def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
-        super().__init__()
-
-        self.input_layer = torch.nn.Linear(in_features, hidden_features)
-        self.container = ContainerWithNestedModuleList(hidden_features)
-        self.output_layer = torch.nn.Linear(hidden_features, out_features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.input_layer(x)
-        x = self.container(x)
-        x = self.output_layer(x)
-        return x
-
-
-class ContainerWithNestedModuleList(torch.nn.Module):
-    def __init__(self, features: int) -> None:
-        super().__init__()
-
-        # Top-level computational layer
-        self.proj_in = torch.nn.Linear(features, features)
-
-        # Nested container with ModuleList
-        self.nested_container = NestedContainer(features)
-
-        # Another top-level computational layer
-        self.proj_out = torch.nn.Linear(features, features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.proj_in(x)
-        x = self.nested_container(x)
-        x = self.proj_out(x)
-        return x
-
-
-class NestedContainer(torch.nn.Module):
-    def __init__(self, features: int) -> None:
-        super().__init__()
-
-        self.blocks = torch.nn.ModuleList([torch.nn.Linear(features, features), torch.nn.Linear(features, features)])
-        self.norm = torch.nn.LayerNorm(features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        for block in self.blocks:
-            x = block(x)
-        x = self.norm(x)
-        return x
+    
+
+# Test for https://github.com/huggingface/diffusers/pull/12747
+class DummyCallableBySubmodule:
+    """
+    Callable group offloading pinner that pins first and last DummyBlock
+    called in the program by callable(submodule)
+    """
+    def __init__(self, pin_targets: Iterable[torch.nn.Module]) -> None:
+        self.pin_targets = set(pin_targets)
+        self.calls_track = [] # testing only
+
+    def __call__(self, submodule: torch.nn.Module) -> bool:
+        self.calls_track.append(submodule)
+        return self._normalize_module_type(submodule) in self.pin_targets
+    
+    def _normalize_module_type(self, obj: Any) -> Optional[torch.nn.Module]:
+        # group might be a single module, or a container of modules
+        # The group-offloading code may pass either:
+        #   - a single `torch.nn.Module`, or
+        #   - a container (list/tuple) of modules.
+
+        # Only return a module when the mapping is unambiguous:
+        #   - if `obj` is a module -> return it
+        #   - if `obj` is a list/tuple containing exactly one module -> return that module
+        #   - otherwise -> return None (won't be considered as a target candidate)
+        if isinstance(obj, torch.nn.Module):
+            return obj
+        if isinstance(obj, (list, tuple)):
+            mods = [m for m in obj if isinstance(m, torch.nn.Module)]
+            return mods[0] if len(mods) == 1 else None
+        return None
+    
+class DummyCallableByNameSubmodule(DummyCallableBySubmodule):
+    """
+    Callable group offloading pinner that pins first and last DummyBlock
+    Same behaviour with DummyCallableBySubmodule, only with different call signature
+    called in the program by callable(name, submodule)
+    """
+    def __call__(self, name: str, submodule: torch.nn.Module) -> bool:
+        self.calls_track.append((name, submodule))
+        return self._normalize_module_type(submodule) in self.pin_targets
+    
+class DummyCallableByNameSubmoduleIdx(DummyCallableBySubmodule):
+    """
+    Callable group offloading pinner that pins first and last DummyBlock.
+    Same behaviour with DummyCallableBySubmodule, only with different call signature
+    Called in the program by callable(name, submodule, idx)
+    """
+    def __call__(self, name: str, submodule: torch.nn.Module, idx: int) -> bool:
+        self.calls_track.append((name, submodule, idx))
+        return self._normalize_module_type(submodule) in self.pin_targets
+    
+class DummyInvalidCallable(DummyCallableBySubmodule):
+    """
+    Callable group offloading pinner that uses invalid call signature
+    """
+    def __call__(self, name: str, submodule: torch.nn.Module, idx: int, extra: Any) -> bool:
+        self.calls_track.append((name, submodule, idx, extra))
+        return self._normalize_module_type(submodule) in self.pin_targets
 
 
 @require_torch_accelerator
@@ -409,7 +402,7 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
         out = model(x)
         self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match.")
 
-        num_repeats = 2
+        num_repeats = 4
         for i in range(num_repeats):
             out_ref = model_ref(x)
             out = model(x)
@@ -431,138 +424,160 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
             self.assertLess(
                 cumulated_absmax, 1e-5, f"Output differences for {name} exceeded threshold: {cumulated_absmax:.5f}"
             )
-
-    def test_vae_like_model_without_streams(self):
-        """Test VAE-like model with block-level offloading but without streams."""
+    
+    def test_block_level_offloading_with_pin_groups_stay_on_device(self):
         if torch.device(torch_device).type not in ["cuda", "xpu"]:
             return
 
-        config = self.get_autoencoder_kl_config()
-        model = AutoencoderKL(**config)
-
-        model_ref = AutoencoderKL(**config)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=False)
-
-        x = torch.randn(2, 3, 32, 32).to(torch_device)
-
-        with torch.no_grad():
-            out_ref = model_ref(x).sample
-            out = model(x).sample
+        def assert_all_modules_on_expected_device(modules: Sequence[torch.nn.Module],
+                                                  expected_device: Union[torch.device, str],
+                                                  header_error_msg: str = "") -> None:
+            def first_param_device(modules: torch.nn.Module) -> torch.device:
+                p = next(modules.parameters(), None)
+                self.assertIsNotNone(p, f"No parameters found for module {modules}")
+                return p.device
+
+            if isinstance(expected_device, torch.device):
+                expected_device = expected_device.type
+            
+            bad = []
+            for i, m in enumerate(modules):
+                dev_type = first_param_device(m).type
+                if dev_type != expected_device:
+                    bad.append((i, m.__class__.__name__, dev_type))
+            self.assertTrue(
+                len(bad) == 0,
+                (header_error_msg + "\n" if header_error_msg else "")
+                + f"Expected all modules on {expected_device}, but found mismatches: {bad}",
+            )
 
-        self.assertTrue(
-            torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match for VAE-like model without streams."
+        def get_param_modules_from_execution_order(model: DummyModel) -> List[torch.nn.Module]:
+            model.eval()
+            root_registry = HookRegistry.check_if_exists_or_initialize(model)
+
+            lazy_hook = root_registry.get_hook("lazy_prefetch_group_offloading")
+            self.assertIsNotNone(lazy_hook, "lazy_prefetch_group_offloading hook was not registered")
+
+            #record execution order with first forward
+            with torch.no_grad():
+                model(self.input)
+
+            mods = [m for _, m in lazy_hook.execution_order]
+            param_modules = [m for m in mods if next(m.parameters(), None) is not None]
+            return param_modules
+        
+        def assert_callables_offloading_tests(
+            param_modules: Sequence[torch.nn.Module],
+            callable: Any,
+            header_error_msg: str = "",
+        ) -> None:
+            pinned_modules = [m for m in param_modules if m in callable.pin_targets]
+            unpinned_modules = [m for m in param_modules if m not in callable.pin_targets]
+            self.assertTrue(len(callable.calls_track) > 0, f"{header_error_msg}: callable should have been called at least once")
+            assert_all_modules_on_expected_device(pinned_modules, torch_device, f"{header_error_msg}: pinned blocks should stay on device")
+            assert_all_modules_on_expected_device(unpinned_modules, "cpu", f"{header_error_msg}: unpinned blocks should be offloaded")
+
+
+        default_parameters = {
+            "onload_device": torch_device,
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
+        }
+        model_default_no_pin = self.get_model()
+        model_default_no_pin.enable_group_offload(
+            **default_parameters
         )
-
-    def test_model_with_only_standalone_layers(self):
-        """Test that models with only standalone layers (no ModuleList/Sequential) work with block-level offloading."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        model = DummyModelWithStandaloneLayers(in_features=64, hidden_features=128, out_features=64)
-
-        model_ref = DummyModelWithStandaloneLayers(in_features=64, hidden_features=128, out_features=64)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
-
-        x = torch.randn(2, 64).to(torch_device)
-
-        with torch.no_grad():
-            for i in range(2):
-                out_ref = model_ref(x)
-                out = model(x)
-                self.assertTrue(
-                    torch.allclose(out_ref, out, atol=1e-5),
-                    f"Outputs do not match at iteration {i} for model with standalone layers.",
-                )
-
-    @parameterized.expand([("block_level",), ("leaf_level",)])
-    def test_standalone_conv_layers_with_both_offload_types(self, offload_type: str):
-        """Test that standalone Conv2d layers work correctly with both block-level and leaf-level offloading."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        config = self.get_autoencoder_kl_config()
-        model = AutoencoderKL(**config)
-
-        model_ref = AutoencoderKL(**config)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type=offload_type, num_blocks_per_group=1, use_stream=True)
-
-        x = torch.randn(2, 3, 32, 32).to(torch_device)
-
-        with torch.no_grad():
-            out_ref = model_ref(x).sample
-            out = model(x).sample
-
-        self.assertTrue(
-            torch.allclose(out_ref, out, atol=1e-5),
-            f"Outputs do not match for standalone Conv layers with {offload_type}.",
+        param_modules = get_param_modules_from_execution_order(model_default_no_pin)
+        assert_all_modules_on_expected_device(param_modules, 
+                                              expected_device="cpu", 
+                                              header_error_msg="default pin_groups: expected ALL modules offloaded to CPU")
+
+        model_pin_all = self.get_model()
+        model_pin_all.enable_group_offload(
+            **default_parameters,
+            pin_groups="all",
         )
+        param_modules = get_param_modules_from_execution_order(model_pin_all)
+        assert_all_modules_on_expected_device(param_modules, 
+                                              expected_device=torch_device, 
+                                              header_error_msg="pin_groups = all: expected ALL layers on accelerator device")
 
-    def test_multiple_invocations_with_vae_like_model(self):
-        """Test that multiple forward passes work correctly with VAE-like model."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        config = self.get_autoencoder_kl_config()
-        model = AutoencoderKL(**config)
 
-        model_ref = AutoencoderKL(**config)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
-
-        x = torch.randn(2, 3, 32, 32).to(torch_device)
-
-        with torch.no_grad():
-            for i in range(2):
-                out_ref = model_ref(x).sample
-                out = model(x).sample
-                self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), f"Outputs do not match at iteration {i}.")
-
-    def test_nested_container_parameters_offloading(self):
-        """Test that parameters from non-computational layers in nested containers are handled correctly."""
-        if torch.device(torch_device).type not in ["cuda", "xpu"]:
-            return
-
-        model = DummyModelWithDeeplyNestedBlocks(in_features=64, hidden_features=128, out_features=64)
-
-        model_ref = DummyModelWithDeeplyNestedBlocks(in_features=64, hidden_features=128, out_features=64)
-        model_ref.load_state_dict(model.state_dict(), strict=True)
-        model_ref.to(torch_device)
-
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
+        model_pin_first_last = self.get_model()
+        model_pin_first_last.enable_group_offload(
+            **default_parameters,
+            pin_groups="first_last",
+        )
+        param_modules = get_param_modules_from_execution_order(model_pin_first_last)
+        assert_all_modules_on_expected_device([param_modules[0], param_modules[-1]], 
+                                              expected_device=torch_device, 
+                                              header_error_msg="pin_groups = first_last: expected first and last layers on accelerator device")
+        assert_all_modules_on_expected_device(param_modules[1:-1], 
+                                              expected_device="cpu", 
+                                              header_error_msg="pin_groups = first_last: expected ALL middle layers offloaded to CPU")
+
+        
+        model = self.get_model()
+        callable_by_submodule = DummyCallableBySubmodule(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(**default_parameters, 
+                                   pin_groups=callable_by_submodule)
+        param_modules = get_param_modules_from_execution_order(model)
+        assert_callables_offloading_tests(param_modules, 
+                                          callable_by_submodule,
+                                          header_error_msg="pin_groups with callable(submodule)")
 
-        x = torch.randn(2, 64).to(torch_device)
+        model = self.get_model()
+        callable_by_name_submodule = DummyCallableByNameSubmodule(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(**default_parameters, 
+                                   pin_groups=callable_by_name_submodule)
+        param_modules = get_param_modules_from_execution_order(model)
+        assert_callables_offloading_tests(param_modules, 
+                                          callable_by_name_submodule,
+                                          header_error_msg="pin_groups with callable(name, submodule)")
 
-        with torch.no_grad():
-            for i in range(2):
-                out_ref = model_ref(x)
-                out = model(x)
-                self.assertTrue(
-                    torch.allclose(out_ref, out, atol=1e-5),
-                    f"Outputs do not match at iteration {i} for nested parameters.",
-                )
+        model = self.get_model()
+        callable_by_name_submodule_idx = DummyCallableByNameSubmoduleIdx(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(**default_parameters, 
+                                   pin_groups=callable_by_name_submodule_idx)
+        param_modules = get_param_modules_from_execution_order(model)
+        assert_callables_offloading_tests(param_modules, 
+                                          callable_by_name_submodule_idx,
+                                          header_error_msg="pin_groups with callable(name, submodule, idx)")
+        
+    def test_error_raised_if_pin_groups_received_invalid_value(self):
+        default_parameters = {
+            "onload_device": torch_device,
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
+        }
+        model = self.get_model()
+        with self.assertRaisesRegex(ValueError, 
+                                    "`pin_groups` must be one of `None`, 'first_last', 'all', or a callable."):
+            model.enable_group_offload(
+                **default_parameters,
+                pin_groups="invalid value",
+            )
 
-    def get_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None):
-        block_out_channels = block_out_channels or [2, 4]
-        norm_num_groups = norm_num_groups or 2
-        init_dict = {
-            "block_out_channels": block_out_channels,
-            "in_channels": 3,
-            "out_channels": 3,
-            "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
-            "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels),
-            "latent_channels": 4,
-            "norm_num_groups": norm_num_groups,
-            "layers_per_block": 1,
+    def test_error_raised_if_pin_groups_received_invalid_callables(self):
+        default_parameters = {
+            "onload_device": torch_device,
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
         }
-        return init_dict
+        model = self.get_model()
+        invalid_callable = DummyInvalidCallable(pin_targets=[model.blocks[0], model.blocks[-1]])
+        model.enable_group_offload(
+            **default_parameters,
+            pin_groups=invalid_callable,
+        )
+        with self.assertRaisesRegex(TypeError, 
+                                    r"missing\s+\d+\s+required\s+positional\s+argument(s)?:"):
+            with torch.no_grad():
+                model(self.input)
+            
+        
+
+        
\ No newline at end of file

From de3812841545d245b44ff90d9e918c46c32bdf07 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 11/21] Support explicit block modules in group offloading

---
 src/diffusers/hooks/group_offloading.py | 242 ++++++++++++++++++------
 src/diffusers/models/modeling_utils.py  |   7 +-
 2 files changed, 186 insertions(+), 63 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 47f1f4199615..36b09cb692dc 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -15,9 +15,9 @@
 import hashlib
 import os
 from contextlib import contextmanager, nullcontext
-from dataclasses import dataclass, replace
+from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
 import safetensors.torch
 import torch
@@ -60,8 +60,7 @@ class GroupOffloadingConfig:
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
     block_modules: Optional[List[str]] = None
-    exclude_kwargs: Optional[List[str]] = None
-    module_prefix: Optional[str] = ""
+    pin_groups: Optional[Union[str, Callable]] = None
 
 
 class ModuleGroup:
@@ -94,6 +93,7 @@ def __init__(
         self.record_stream = record_stream
         self.onload_self = onload_self
         self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.pinned = False
 
         self.offload_to_disk_path = offload_to_disk_path
         self._is_offloaded_to_disk = False
@@ -156,27 +156,27 @@ def _pinned_memory_tensors(self):
         finally:
             pinned_dict = None
 
-    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream):
+    def _transfer_tensor_to_device(self, tensor, source_tensor):
         tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
         if self.record_stream:
-            tensor.data.record_stream(default_stream)
+            tensor.data.record_stream(self._torch_accelerator_module.current_stream())
 
-    def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None):
+    def _process_tensors_from_modules(self, pinned_memory=None):
         for group_module in self.modules:
             for param in group_module.parameters():
                 source = pinned_memory[param] if pinned_memory else param.data
-                self._transfer_tensor_to_device(param, source, default_stream)
+                self._transfer_tensor_to_device(param, source)
             for buffer in group_module.buffers():
                 source = pinned_memory[buffer] if pinned_memory else buffer.data
-                self._transfer_tensor_to_device(buffer, source, default_stream)
+                self._transfer_tensor_to_device(buffer, source)
 
         for param in self.parameters:
             source = pinned_memory[param] if pinned_memory else param.data
-            self._transfer_tensor_to_device(param, source, default_stream)
+            self._transfer_tensor_to_device(param, source)
 
         for buffer in self.buffers:
             source = pinned_memory[buffer] if pinned_memory else buffer.data
-            self._transfer_tensor_to_device(buffer, source, default_stream)
+            self._transfer_tensor_to_device(buffer, source)
 
     def _onload_from_disk(self):
         if self.stream is not None:
@@ -211,12 +211,10 @@ def _onload_from_memory(self):
             self.stream.synchronize()
 
         context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
-        default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None
-
         with context:
             if self.stream is not None:
                 with self._pinned_memory_tensors() as pinned_memory:
-                    self._process_tensors_from_modules(pinned_memory, default_stream=default_stream)
+                    self._process_tensors_from_modules(pinned_memory)
             else:
                 self._process_tensors_from_modules(None)
 
@@ -301,6 +299,24 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
         if self.group.onload_leader is None:
             self.group.onload_leader = module
 
+        if self.group.pinned:
+            if self.group.onload_leader == module and not self._is_group_on_device():
+                self.group.onload_()
+
+            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
+            if should_onload_next_group:
+                self.next_group.onload_()
+
+            should_synchronize = (
+                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+            )
+            if should_synchronize:
+                self.group.stream.synchronize()
+
+            args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
+            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+            return args, kwargs
+
         # If the current module is the onload_leader of the group, we onload the group if it is supposed
         # to onload itself. In the case of using prefetching with streams, we onload the next group if
         # it is not supposed to onload itself.
@@ -325,28 +341,30 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.group.stream.synchronize()
 
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-
-        # Some Autoencoder models use a feature cache that is passed through submodules
-        # and modified in place. The `send_to_device` call returns a copy of this feature cache object
-        # which breaks the inplace updates. Use `exclude_kwargs` to mark these cache features
-        exclude_kwargs = self.config.exclude_kwargs or []
-        if exclude_kwargs:
-            moved_kwargs = send_to_device(
-                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
-                self.group.onload_device,
-                non_blocking=self.group.non_blocking,
-            )
-            kwargs.update(moved_kwargs)
-        else:
-            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
-
+        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
+        if self.group.pinned:
+            return output
+
         if self.group.offload_leader == module:
             self.group.offload_()
         return output
 
+    def _is_group_on_device(self) -> bool:
+        tensors = []
+        for group_module in self.group.modules:
+            tensors.extend(list(group_module.parameters()))
+            tensors.extend(list(group_module.buffers()))
+        tensors.extend(self.group.parameters)
+        tensors.extend(self.group.buffers)
+
+        if len(tensors) == 0:
+            return True
+
+        return all(t.device == self.group.onload_device for t in tensors)
+
 
 class LazyPrefetchGroupOffloadingHook(ModelHook):
     r"""
@@ -358,9 +376,10 @@ class LazyPrefetchGroupOffloadingHook(ModelHook):
 
     _is_stateful = False
 
-    def __init__(self):
+    def __init__(self, pin_groups: Optional[Union[str, Callable]] = None):
         self.execution_order: List[Tuple[str, torch.nn.Module]] = []
         self._layer_execution_tracker_module_names = set()
+        self.pin_groups = pin_groups
 
     def initialize_hook(self, module):
         def make_execution_order_update_callback(current_name, current_submodule):
@@ -442,6 +461,50 @@ def post_forward(self, module, output):
             group_offloading_hooks[i].next_group = group_offloading_hooks[i + 1].group
             group_offloading_hooks[i].next_group.onload_self = False
 
+        if self.pin_groups is not None and num_executed > 0:
+            param_exec_info = []
+            for idx, ((name, submodule), hook) in enumerate(zip(self.execution_order, group_offloading_hooks)):
+                if hook is None:
+                    continue
+                if next(submodule.parameters(), None) is None and next(submodule.buffers(), None) is None:
+                    continue
+                param_exec_info.append((name, submodule, hook))
+
+            num_param_modules = len(param_exec_info)
+            if num_param_modules > 0:
+                pinned_indices = set()
+                if isinstance(self.pin_groups, str):
+                    if self.pin_groups == "all":
+                        pinned_indices = set(range(num_param_modules))
+                    elif self.pin_groups == "first_last":
+                        pinned_indices.add(0)
+                        pinned_indices.add(num_param_modules - 1)
+                elif callable(self.pin_groups):
+                    for idx, (name, submodule, _) in enumerate(param_exec_info):
+                        should_pin = False
+                        try:
+                            should_pin = bool(self.pin_groups(submodule))
+                        except TypeError:
+                            try:
+                                should_pin = bool(self.pin_groups(name, submodule))
+                            except TypeError:
+                                should_pin = bool(self.pin_groups(name, submodule, idx))
+                        if should_pin:
+                            pinned_indices.add(idx)
+
+                pinned_groups = set()
+                for idx in pinned_indices:
+                    if idx >= num_param_modules:
+                        continue
+                    group = param_exec_info[idx][2].group
+                    if group not in pinned_groups:
+                        group.pinned = True
+                        pinned_groups.add(group)
+
+                for group in pinned_groups:
+                    if group.offload_device != group.onload_device:
+                        group.onload_()
+
         return output
 
 
@@ -473,7 +536,7 @@ def apply_group_offloading(
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
     block_modules: Optional[List[str]] = None,
-    exclude_kwargs: Optional[List[str]] = None,
+    pin_groups: Optional[Union[str, Callable]] = None,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -532,12 +595,12 @@ def apply_group_offloading(
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
         block_modules (`List[str]`, *optional*):
-            List of module names that should be treated as blocks for offloading. If provided, only these modules will
-            be considered for block-level offloading. If not provided, the default block detection logic will be used.
-        exclude_kwargs (`List[str]`, *optional*):
-            List of kwarg keys that should not be processed by send_to_device. This is useful for mutable state like
-            caching lists that need to maintain their object identity across forward passes. If not provided, will be
-            inferred from the module's `_skip_keys` attribute if it exists.
+            List of module names that should be treated as blocks for offloading. If provided, only these modules
+            will be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
+            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
+            and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
+            receives a module (and optionally the module name and index) and returns `True` to pin that group.
 
     Example:
         ```python
@@ -577,13 +640,17 @@ def apply_group_offloading(
     if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
         raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.")
 
-    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
+    normalized_pin_groups = pin_groups
+    if isinstance(pin_groups, str):
+        normalized_pin_groups = pin_groups.lower()
+        if normalized_pin_groups not in {"first_last", "all"}:
+            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    elif pin_groups is not None and not callable(pin_groups):
+        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
 
-    if block_modules is None:
-        block_modules = getattr(module, "_group_offload_block_modules", None)
+    pin_groups = normalized_pin_groups
 
-    if exclude_kwargs is None:
-        exclude_kwargs = getattr(module, "_skip_keys", None)
+    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
 
     config = GroupOffloadingConfig(
         onload_device=onload_device,
@@ -596,7 +663,7 @@ def apply_group_offloading(
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
         block_modules=block_modules,
-        exclude_kwargs=exclude_kwargs,
+        pin_groups=pin_groups,
     )
     _apply_group_offloading(module, config)
 
@@ -613,11 +680,11 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
-    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is
-    done at the top-level blocks and modules specified in block_modules.
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading
+    is done at the top-level blocks and modules specified in block_modules.
 
     When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
-    module, recursively apply block offloading to it.
+    module, we either offload the entire submodule or recursively apply block offloading to it.
     """
     if config.stream is not None and config.num_blocks_per_group != 1:
         logger.warning(
@@ -635,15 +702,10 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
     for name, submodule in module.named_children():
         # Check if this is an explicitly defined block module
         if name in block_modules:
-            # Track submodule using a prefix to avoid filename collisions during disk offload.
-            # Without this, submodules sharing the same model class would be assigned identical
-            # filenames (derived from the class name).
-            prefix = f"{config.module_prefix}{name}." if config.module_prefix else f"{name}."
-            submodule_config = replace(config, module_prefix=prefix)
-
-            _apply_group_offloading_block_level(submodule, submodule_config)
-            modules_with_group_offloading.add(name)
-
+            # Apply block offloading to the specified submodule
+            _apply_block_offloading_to_submodule(
+                submodule, name, config, modules_with_group_offloading, matched_module_groups
+            )
         elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
             # Handle ModuleList and Sequential blocks as before
             for i in range(0, len(submodule), config.num_blocks_per_group):
@@ -651,7 +713,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
                 if len(current_modules) == 0:
                     continue
 
-                group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
+                group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
                 group = ModuleGroup(
                     modules=current_modules,
                     offload_device=config.offload_device,
@@ -672,6 +734,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
         else:
             # This is an unmatched module
             unmatched_modules.append((name, submodule))
+            modules_with_group_offloading.add(name)
 
     # Apply group offloading hooks to the module groups
     for i, group in enumerate(matched_module_groups):
@@ -703,7 +766,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             stream=None,
             record_stream=False,
             onload_self=True,
-            group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group",
+            group_id=f"{module.__class__.__name__}_unmatched_group",
         )
         if config.stream is None:
             _apply_group_offloading_hook(module, unmatched_group, config=config)
@@ -711,6 +774,67 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
 
 
+def _apply_block_offloading_to_submodule(
+    submodule: torch.nn.Module,
+    name: str,
+    config: GroupOffloadingConfig,
+    modules_with_group_offloading: Set[str],
+    matched_module_groups: List[ModuleGroup],
+) -> None:
+    r"""
+    Apply block offloading to a explicitly defined submodule. This function either:
+    1. Offloads the entire submodule as a single group ( SIMPLE APPROACH)
+    2. Recursively applies block offloading to the submodule
+
+    For now, we use the simple approach - offload the entire submodule as a single group.
+    """
+    # Simple approach: offload the entire submodule as a single group
+    # Since AEs are typically small, this is usually okay
+    if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+        # If it's a ModuleList or Sequential, apply the normal block-level logic
+        for i in range(0, len(submodule), config.num_blocks_per_group):
+            current_modules = list(submodule[i : i + config.num_blocks_per_group])
+            if len(current_modules) == 0:
+                continue
+
+            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+            group = ModuleGroup(
+                modules=current_modules,
+                offload_device=config.offload_device,
+                onload_device=config.onload_device,
+                offload_to_disk_path=config.offload_to_disk_path,
+                offload_leader=current_modules[-1],
+                onload_leader=current_modules[0],
+                non_blocking=config.non_blocking,
+                stream=config.stream,
+                record_stream=config.record_stream,
+                low_cpu_mem_usage=config.low_cpu_mem_usage,
+                onload_self=True,
+                group_id=group_id,
+            )
+            matched_module_groups.append(group)
+            for j in range(i, i + len(current_modules)):
+                modules_with_group_offloading.add(f"{name}.{j}")
+    else:
+        # For other modules, treat the entire submodule as a single group
+        group = ModuleGroup(
+            modules=[submodule],
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
+            offload_leader=submodule,
+            onload_leader=submodule,
+            non_blocking=config.non_blocking,
+            stream=config.stream,
+            record_stream=config.record_stream,
+            low_cpu_mem_usage=config.low_cpu_mem_usage,
+            onload_self=True,
+            group_id=name,
+        )
+        matched_module_groups.append(group)
+        modules_with_group_offloading.add(name)
+
+
 def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of leaf modules in a torch.nn.Module. This method has minimal memory
@@ -837,8 +961,8 @@ def _apply_lazy_group_offloading_hook(
     if registry.get_hook(_GROUP_OFFLOADING) is None:
         hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
-
-    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook()
+        
+    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups = config.pin_groups)
     registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING)
 
 
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 41da95d3a2a2..3263be4e046e 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -531,8 +531,7 @@ def enable_group_offload(
         record_stream: bool = False,
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
-        block_modules: Optional[str] = None,
-        exclude_kwargs: Optional[str] = None,
+        pin_groups: Optional[Union[str, Callable]] = None
     ) -> None:
         r"""
         Activates group offloading for the current model.
@@ -572,7 +571,7 @@ def enable_group_offload(
                 f"`_supports_group_offloading` to `True` in the class definition. If you believe this is a mistake, please "
                 f"open an issue at https://github.com/huggingface/diffusers/issues."
             )
-
+        block_modules = getattr(self, "_group_offload_block_modules", None)
         apply_group_offloading(
             module=self,
             onload_device=onload_device,
@@ -585,7 +584,7 @@ def enable_group_offload(
             low_cpu_mem_usage=low_cpu_mem_usage,
             offload_to_disk_path=offload_to_disk_path,
             block_modules=block_modules,
-            exclude_kwargs=exclude_kwargs,
+            pin_groups=pin_groups
         )
 
     def set_attention_backend(self, backend: str) -> None:

From c72ddbc3c70f3b559d218ea490c6841a8eb6b0fd Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 29 Nov 2025 19:28:47 +0530
Subject: [PATCH 12/21] Expose group offload pinning options in API

---
 src/diffusers/pipelines/pipeline_utils.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 392d5fb3feb4..d0fab44a6187 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1342,6 +1342,8 @@ def enable_group_offload(
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
         exclude_modules: Optional[Union[str, List[str]]] = None,
+        pin_groups: Optional[Union[str, Callable]] = None,
+        pin_first_last: bool = False,
     ) -> None:
         r"""
         Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is,
@@ -1402,6 +1404,11 @@ def enable_group_offload(
                 This option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be
                 useful when the CPU memory is a bottleneck but may counteract the benefits of using streams.
             exclude_modules (`Union[str, List[str]]`, defaults to `None`): List of modules to exclude from offloading.
+            pin_groups (`\"first_last\"` | `\"all\"` | `Callable`, *optional*):
+                Optionally keep selected groups on the onload device permanently. See `ModelMixin.enable_group_offload`
+                for details.
+            pin_first_last (`bool`, *optional*, defaults to `False`):
+                Deprecated alias for `pin_groups=\"first_last\"`.
 
         Example:
             ```python
@@ -1442,6 +1449,8 @@ def enable_group_offload(
             "record_stream": record_stream,
             "low_cpu_mem_usage": low_cpu_mem_usage,
             "offload_to_disk_path": offload_to_disk_path,
+            "pin_groups": pin_groups,
+            "pin_first_last": pin_first_last,
         }
         for name, component in self.components.items():
             if name not in exclude_modules and isinstance(component, torch.nn.Module):

From 1cd3355c0c534eabb604664abfbbe1d4146cac5e Mon Sep 17 00:00:00 2001
From: bconstantine <bryan.constantine7@gmail.com>
Date: Sun, 30 Nov 2025 22:47:39 +0800
Subject: [PATCH 13/21] removed deprecated flag pin_first_last

---
 src/diffusers/pipelines/pipeline_utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index d0fab44a6187..a8084688d498 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -1343,7 +1343,6 @@ def enable_group_offload(
         offload_to_disk_path: Optional[str] = None,
         exclude_modules: Optional[Union[str, List[str]]] = None,
         pin_groups: Optional[Union[str, Callable]] = None,
-        pin_first_last: bool = False,
     ) -> None:
         r"""
         Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is,
@@ -1407,8 +1406,6 @@ def enable_group_offload(
             pin_groups (`\"first_last\"` | `\"all\"` | `Callable`, *optional*):
                 Optionally keep selected groups on the onload device permanently. See `ModelMixin.enable_group_offload`
                 for details.
-            pin_first_last (`bool`, *optional*, defaults to `False`):
-                Deprecated alias for `pin_groups=\"first_last\"`.
 
         Example:
             ```python
@@ -1450,7 +1447,6 @@ def enable_group_offload(
             "low_cpu_mem_usage": low_cpu_mem_usage,
             "offload_to_disk_path": offload_to_disk_path,
             "pin_groups": pin_groups,
-            "pin_first_last": pin_first_last,
         }
         for name, component in self.components.items():
             if name not in exclude_modules and isinstance(component, torch.nn.Module):

From 1194a83d425d94c18ff9348030e1fd4798c903ca Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Thu, 11 Dec 2025 00:43:15 +0530
Subject: [PATCH 14/21] Address review feedback for group offload pinning

---
 src/diffusers/hooks/group_offloading.py | 97 +++++++++++++++++--------
 src/diffusers/models/modeling_utils.py  | 12 ++-
 2 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 36b09cb692dc..9fa747194a87 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -60,6 +60,8 @@ class GroupOffloadingConfig:
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
     block_modules: Optional[List[str]] = None
+    exclude_kwargs: Optional[List[str]] = None
+    module_prefix: Optional[str] = ""
     pin_groups: Optional[Union[str, Callable]] = None
 
 
@@ -156,27 +158,27 @@ def _pinned_memory_tensors(self):
         finally:
             pinned_dict = None
 
-    def _transfer_tensor_to_device(self, tensor, source_tensor):
+    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream=None):
         tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
         if self.record_stream:
-            tensor.data.record_stream(self._torch_accelerator_module.current_stream())
+            tensor.data.record_stream(default_stream)
 
-    def _process_tensors_from_modules(self, pinned_memory=None):
+    def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None):
         for group_module in self.modules:
             for param in group_module.parameters():
                 source = pinned_memory[param] if pinned_memory else param.data
-                self._transfer_tensor_to_device(param, source)
+                self._transfer_tensor_to_device(param, source, default_stream)
             for buffer in group_module.buffers():
                 source = pinned_memory[buffer] if pinned_memory else buffer.data
-                self._transfer_tensor_to_device(buffer, source)
+                self._transfer_tensor_to_device(buffer, source, default_stream)
 
         for param in self.parameters:
             source = pinned_memory[param] if pinned_memory else param.data
-            self._transfer_tensor_to_device(param, source)
+            self._transfer_tensor_to_device(param, source, default_stream)
 
         for buffer in self.buffers:
             source = pinned_memory[buffer] if pinned_memory else buffer.data
-            self._transfer_tensor_to_device(buffer, source)
+            self._transfer_tensor_to_device(buffer, source, default_stream)
 
     def _onload_from_disk(self):
         if self.stream is not None:
@@ -211,10 +213,11 @@ def _onload_from_memory(self):
             self.stream.synchronize()
 
         context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
+        default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None
         with context:
             if self.stream is not None:
                 with self._pinned_memory_tensors() as pinned_memory:
-                    self._process_tensors_from_modules(pinned_memory)
+                    self._process_tensors_from_modules(pinned_memory, default_stream=default_stream)
             else:
                 self._process_tensors_from_modules(None)
 
@@ -308,13 +311,16 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.next_group.onload_()
 
             should_synchronize = (
-                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+                not self.group.onload_self
+                and self.group.stream is not None
+                and not should_onload_next_group
+                and not self.group.record_stream
             )
             if should_synchronize:
                 self.group.stream.synchronize()
 
             args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+            kwargs = self._send_kwargs_to_device(kwargs)
             return args, kwargs
 
         # If the current module is the onload_leader of the group, we onload the group if it is supposed
@@ -329,7 +335,10 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.next_group.onload_()
 
             should_synchronize = (
-                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+                not self.group.onload_self
+                and self.group.stream is not None
+                and not should_onload_next_group
+                and not self.group.record_stream
             )
             if should_synchronize:
                 # If this group didn't onload itself, it means it was asynchronously onloaded by the
@@ -341,7 +350,7 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.group.stream.synchronize()
 
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+        kwargs = self._send_kwargs_to_device(kwargs)
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
@@ -360,10 +369,19 @@ def _is_group_on_device(self) -> bool:
         tensors.extend(self.group.parameters)
         tensors.extend(self.group.buffers)
 
-        if len(tensors) == 0:
-            return True
+        return len(tensors) > 0 and all(t.device == self.group.onload_device for t in tensors)
 
-        return all(t.device == self.group.onload_device for t in tensors)
+    def _send_kwargs_to_device(self, kwargs):
+        exclude_kwargs = self.config.exclude_kwargs or []
+        if exclude_kwargs:
+            moved_kwargs = send_to_device(
+                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
+                self.group.onload_device,
+                non_blocking=self.group.non_blocking,
+            )
+            kwargs.update(moved_kwargs)
+            return kwargs
+        return send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
 
 
 class LazyPrefetchGroupOffloadingHook(ModelHook):
@@ -524,6 +542,17 @@ def pre_forward(self, module, *args, **kwargs):
         return args, kwargs
 
 
+def _normalize_pin_groups(pin_groups: Optional[Union[str, Callable]]) -> Optional[Union[str, Callable]]:
+    if isinstance(pin_groups, str):
+        normalized_pin_groups = pin_groups.lower()
+        if normalized_pin_groups not in {"first_last", "all"}:
+            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+        return normalized_pin_groups
+    if pin_groups is not None and not callable(pin_groups):
+        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    return pin_groups
+
+
 def apply_group_offloading(
     module: torch.nn.Module,
     onload_device: Union[str, torch.device],
@@ -536,6 +565,7 @@ def apply_group_offloading(
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
     block_modules: Optional[List[str]] = None,
+    exclude_kwargs: Optional[List[str]] = None,
     pin_groups: Optional[Union[str, Callable]] = None,
 ) -> None:
     r"""
@@ -597,6 +627,10 @@ def apply_group_offloading(
         block_modules (`List[str]`, *optional*):
             List of module names that should be treated as blocks for offloading. If provided, only these modules
             will be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        exclude_kwargs (`List[str]`, *optional*):
+            List of kwarg keys that should not be processed by `send_to_device`. This is useful for mutable state like
+            caching lists that need to maintain their object identity across forward passes. If not provided, will be
+            inferred from the module's `_skip_keys` attribute if it exists.
         pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
             Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
             and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
@@ -640,17 +674,14 @@ def apply_group_offloading(
     if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
         raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.")
 
-    normalized_pin_groups = pin_groups
-    if isinstance(pin_groups, str):
-        normalized_pin_groups = pin_groups.lower()
-        if normalized_pin_groups not in {"first_last", "all"}:
-            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
-    elif pin_groups is not None and not callable(pin_groups):
-        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    pin_groups = _normalize_pin_groups(pin_groups)
+    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
 
-    pin_groups = normalized_pin_groups
+    if block_modules is None:
+        block_modules = getattr(module, "_group_offload_block_modules", None)
 
-    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
+    if exclude_kwargs is None:
+        exclude_kwargs = getattr(module, "_skip_keys", None)
 
     config = GroupOffloadingConfig(
         onload_device=onload_device,
@@ -663,6 +694,8 @@ def apply_group_offloading(
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
         block_modules=block_modules,
+        exclude_kwargs=exclude_kwargs,
+        module_prefix="",
         pin_groups=pin_groups,
     )
     _apply_group_offloading(module, config)
@@ -701,7 +734,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
 
     for name, submodule in module.named_children():
         # Check if this is an explicitly defined block module
-        if name in block_modules:
+        if block_modules and name in block_modules:
             # Apply block offloading to the specified submodule
             _apply_block_offloading_to_submodule(
                 submodule, name, config, modules_with_group_offloading, matched_module_groups
@@ -713,7 +746,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
                 if len(current_modules) == 0:
                     continue
 
-                group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+                group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
                 group = ModuleGroup(
                     modules=current_modules,
                     offload_device=config.offload_device,
@@ -766,7 +799,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             stream=None,
             record_stream=False,
             onload_self=True,
-            group_id=f"{module.__class__.__name__}_unmatched_group",
+            group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group",
         )
         if config.stream is None:
             _apply_group_offloading_hook(module, unmatched_group, config=config)
@@ -797,7 +830,7 @@ def _apply_block_offloading_to_submodule(
             if len(current_modules) == 0:
                 continue
 
-            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+            group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
             group = ModuleGroup(
                 modules=current_modules,
                 offload_device=config.offload_device,
@@ -829,7 +862,7 @@ def _apply_block_offloading_to_submodule(
             record_stream=config.record_stream,
             low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
-            group_id=name,
+            group_id=f"{config.module_prefix}{name}",
         )
         matched_module_groups.append(group)
         modules_with_group_offloading.add(name)
@@ -859,7 +892,7 @@ def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOff
             record_stream=config.record_stream,
             low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
-            group_id=name,
+            group_id=f"{config.module_prefix}{name}",
         )
         _apply_group_offloading_hook(submodule, group, config=config)
         modules_with_group_offloading.add(name)
@@ -906,7 +939,7 @@ def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOff
             record_stream=config.record_stream,
             low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
-            group_id=name,
+            group_id=f"{config.module_prefix}{name}",
         )
         _apply_group_offloading_hook(parent_module, group, config=config)
 
@@ -962,7 +995,7 @@ def _apply_lazy_group_offloading_hook(
         hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
         
-    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups = config.pin_groups)
+    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups=config.pin_groups)
     registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING)
 
 
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 3263be4e046e..0e21d2eb1429 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -531,7 +531,9 @@ def enable_group_offload(
         record_stream: bool = False,
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
-        pin_groups: Optional[Union[str, Callable]] = None
+        block_modules: Optional[str] = None,
+        exclude_kwargs: Optional[str] = None,
+        pin_groups: Optional[Union[str, Callable]] = None,
     ) -> None:
         r"""
         Activates group offloading for the current model.
@@ -571,7 +573,10 @@ def enable_group_offload(
                 f"`_supports_group_offloading` to `True` in the class definition. If you believe this is a mistake, please "
                 f"open an issue at https://github.com/huggingface/diffusers/issues."
             )
-        block_modules = getattr(self, "_group_offload_block_modules", None)
+        if block_modules is None:
+            block_modules = getattr(self, "_group_offload_block_modules", None)
+        if exclude_kwargs is None:
+            exclude_kwargs = getattr(self, "_skip_keys", None)
         apply_group_offloading(
             module=self,
             onload_device=onload_device,
@@ -584,7 +589,8 @@ def enable_group_offload(
             low_cpu_mem_usage=low_cpu_mem_usage,
             offload_to_disk_path=offload_to_disk_path,
             block_modules=block_modules,
-            pin_groups=pin_groups
+            exclude_kwargs=exclude_kwargs,
+            pin_groups=pin_groups,
         )
 
     def set_attention_backend(self, backend: str) -> None:

From 3ef894d42fcf4ef6e019fd95a303235cdebe99d9 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Thu, 11 Dec 2025 04:28:35 +0000
Subject: [PATCH 15/21] Apply style fixes

---
 src/diffusers/hooks/group_offloading.py |  14 +--
 tests/hooks/test_group_offloading.py    | 136 +++++++++++++-----------
 2 files changed, 81 insertions(+), 69 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 9fa747194a87..a22dbd9fc714 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -625,15 +625,15 @@ def apply_group_offloading(
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
         block_modules (`List[str]`, *optional*):
-            List of module names that should be treated as blocks for offloading. If provided, only these modules
-            will be considered for block-level offloading. If not provided, the default block detection logic will be used.
+            List of module names that should be treated as blocks for offloading. If provided, only these modules will
+            be considered for block-level offloading. If not provided, the default block detection logic will be used.
         exclude_kwargs (`List[str]`, *optional*):
             List of kwarg keys that should not be processed by `send_to_device`. This is useful for mutable state like
             caching lists that need to maintain their object identity across forward passes. If not provided, will be
             inferred from the module's `_skip_keys` attribute if it exists.
         pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
-            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
-            and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
+            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first and
+            last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
             receives a module (and optionally the module name and index) and returns `True` to pin that group.
 
     Example:
@@ -713,8 +713,8 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
-    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading
-    is done at the top-level blocks and modules specified in block_modules.
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is
+    done at the top-level blocks and modules specified in block_modules.
 
     When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
     module, we either offload the entire submodule or recursively apply block offloading to it.
@@ -994,7 +994,7 @@ def _apply_lazy_group_offloading_hook(
     if registry.get_hook(_GROUP_OFFLOADING) is None:
         hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
-        
+
     lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups=config.pin_groups)
     registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING)
 
diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 58520bef9aa5..d7c8bf158381 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -15,6 +15,7 @@
 import contextlib
 import gc
 import unittest
+from typing import Any, Iterable, List, Optional, Sequence, Union
 
 import torch
 from parameterized import parameterized
@@ -25,8 +26,6 @@
 from diffusers.utils import get_logger
 from diffusers.utils.import_utils import compare_versions
 
-from typing import Any, Iterable, List, Optional, Sequence, Union
-
 from ..testing_utils import (
     backend_empty_cache,
     backend_max_memory_allocated,
@@ -149,7 +148,7 @@ def __init__(self):
     def post_forward(self, module, output):
         self.outputs.append(output)
         return output
-    
+
 
 # Test for https://github.com/huggingface/diffusers/pull/12747
 class DummyCallableBySubmodule:
@@ -157,14 +156,15 @@ class DummyCallableBySubmodule:
     Callable group offloading pinner that pins first and last DummyBlock
     called in the program by callable(submodule)
     """
+
     def __init__(self, pin_targets: Iterable[torch.nn.Module]) -> None:
         self.pin_targets = set(pin_targets)
-        self.calls_track = [] # testing only
+        self.calls_track = []  # testing only
 
     def __call__(self, submodule: torch.nn.Module) -> bool:
         self.calls_track.append(submodule)
         return self._normalize_module_type(submodule) in self.pin_targets
-    
+
     def _normalize_module_type(self, obj: Any) -> Optional[torch.nn.Module]:
         # group might be a single module, or a container of modules
         # The group-offloading code may pass either:
@@ -181,31 +181,37 @@ def _normalize_module_type(self, obj: Any) -> Optional[torch.nn.Module]:
             mods = [m for m in obj if isinstance(m, torch.nn.Module)]
             return mods[0] if len(mods) == 1 else None
         return None
-    
+
+
 class DummyCallableByNameSubmodule(DummyCallableBySubmodule):
     """
     Callable group offloading pinner that pins first and last DummyBlock
     Same behaviour with DummyCallableBySubmodule, only with different call signature
     called in the program by callable(name, submodule)
     """
+
     def __call__(self, name: str, submodule: torch.nn.Module) -> bool:
         self.calls_track.append((name, submodule))
         return self._normalize_module_type(submodule) in self.pin_targets
-    
+
+
 class DummyCallableByNameSubmoduleIdx(DummyCallableBySubmodule):
     """
     Callable group offloading pinner that pins first and last DummyBlock.
     Same behaviour with DummyCallableBySubmodule, only with different call signature
     Called in the program by callable(name, submodule, idx)
     """
+
     def __call__(self, name: str, submodule: torch.nn.Module, idx: int) -> bool:
         self.calls_track.append((name, submodule, idx))
         return self._normalize_module_type(submodule) in self.pin_targets
-    
+
+
 class DummyInvalidCallable(DummyCallableBySubmodule):
     """
     Callable group offloading pinner that uses invalid call signature
     """
+
     def __call__(self, name: str, submodule: torch.nn.Module, idx: int, extra: Any) -> bool:
         self.calls_track.append((name, submodule, idx, extra))
         return self._normalize_module_type(submodule) in self.pin_targets
@@ -424,14 +430,14 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
             self.assertLess(
                 cumulated_absmax, 1e-5, f"Output differences for {name} exceeded threshold: {cumulated_absmax:.5f}"
             )
-    
+
     def test_block_level_offloading_with_pin_groups_stay_on_device(self):
         if torch.device(torch_device).type not in ["cuda", "xpu"]:
             return
 
-        def assert_all_modules_on_expected_device(modules: Sequence[torch.nn.Module],
-                                                  expected_device: Union[torch.device, str],
-                                                  header_error_msg: str = "") -> None:
+        def assert_all_modules_on_expected_device(
+            modules: Sequence[torch.nn.Module], expected_device: Union[torch.device, str], header_error_msg: str = ""
+        ) -> None:
             def first_param_device(modules: torch.nn.Module) -> torch.device:
                 p = next(modules.parameters(), None)
                 self.assertIsNotNone(p, f"No parameters found for module {modules}")
@@ -439,7 +445,7 @@ def first_param_device(modules: torch.nn.Module) -> torch.device:
 
             if isinstance(expected_device, torch.device):
                 expected_device = expected_device.type
-            
+
             bad = []
             for i, m in enumerate(modules):
                 dev_type = first_param_device(m).type
@@ -458,14 +464,14 @@ def get_param_modules_from_execution_order(model: DummyModel) -> List[torch.nn.M
             lazy_hook = root_registry.get_hook("lazy_prefetch_group_offloading")
             self.assertIsNotNone(lazy_hook, "lazy_prefetch_group_offloading hook was not registered")
 
-            #record execution order with first forward
+            # record execution order with first forward
             with torch.no_grad():
                 model(self.input)
 
             mods = [m for _, m in lazy_hook.execution_order]
             param_modules = [m for m in mods if next(m.parameters(), None) is not None]
             return param_modules
-        
+
         def assert_callables_offloading_tests(
             param_modules: Sequence[torch.nn.Module],
             callable: Any,
@@ -473,10 +479,15 @@ def assert_callables_offloading_tests(
         ) -> None:
             pinned_modules = [m for m in param_modules if m in callable.pin_targets]
             unpinned_modules = [m for m in param_modules if m not in callable.pin_targets]
-            self.assertTrue(len(callable.calls_track) > 0, f"{header_error_msg}: callable should have been called at least once")
-            assert_all_modules_on_expected_device(pinned_modules, torch_device, f"{header_error_msg}: pinned blocks should stay on device")
-            assert_all_modules_on_expected_device(unpinned_modules, "cpu", f"{header_error_msg}: unpinned blocks should be offloaded")
-
+            self.assertTrue(
+                len(callable.calls_track) > 0, f"{header_error_msg}: callable should have been called at least once"
+            )
+            assert_all_modules_on_expected_device(
+                pinned_modules, torch_device, f"{header_error_msg}: pinned blocks should stay on device"
+            )
+            assert_all_modules_on_expected_device(
+                unpinned_modules, "cpu", f"{header_error_msg}: unpinned blocks should be offloaded"
+            )
 
         default_parameters = {
             "onload_device": torch_device,
@@ -485,13 +496,13 @@ def assert_callables_offloading_tests(
             "use_stream": True,
         }
         model_default_no_pin = self.get_model()
-        model_default_no_pin.enable_group_offload(
-            **default_parameters
-        )
+        model_default_no_pin.enable_group_offload(**default_parameters)
         param_modules = get_param_modules_from_execution_order(model_default_no_pin)
-        assert_all_modules_on_expected_device(param_modules, 
-                                              expected_device="cpu", 
-                                              header_error_msg="default pin_groups: expected ALL modules offloaded to CPU")
+        assert_all_modules_on_expected_device(
+            param_modules,
+            expected_device="cpu",
+            header_error_msg="default pin_groups: expected ALL modules offloaded to CPU",
+        )
 
         model_pin_all = self.get_model()
         model_pin_all.enable_group_offload(
@@ -499,10 +510,11 @@ def assert_callables_offloading_tests(
             pin_groups="all",
         )
         param_modules = get_param_modules_from_execution_order(model_pin_all)
-        assert_all_modules_on_expected_device(param_modules, 
-                                              expected_device=torch_device, 
-                                              header_error_msg="pin_groups = all: expected ALL layers on accelerator device")
-
+        assert_all_modules_on_expected_device(
+            param_modules,
+            expected_device=torch_device,
+            header_error_msg="pin_groups = all: expected ALL layers on accelerator device",
+        )
 
         model_pin_first_last = self.get_model()
         model_pin_first_last.enable_group_offload(
@@ -510,41 +522,45 @@ def assert_callables_offloading_tests(
             pin_groups="first_last",
         )
         param_modules = get_param_modules_from_execution_order(model_pin_first_last)
-        assert_all_modules_on_expected_device([param_modules[0], param_modules[-1]], 
-                                              expected_device=torch_device, 
-                                              header_error_msg="pin_groups = first_last: expected first and last layers on accelerator device")
-        assert_all_modules_on_expected_device(param_modules[1:-1], 
-                                              expected_device="cpu", 
-                                              header_error_msg="pin_groups = first_last: expected ALL middle layers offloaded to CPU")
-
-        
+        assert_all_modules_on_expected_device(
+            [param_modules[0], param_modules[-1]],
+            expected_device=torch_device,
+            header_error_msg="pin_groups = first_last: expected first and last layers on accelerator device",
+        )
+        assert_all_modules_on_expected_device(
+            param_modules[1:-1],
+            expected_device="cpu",
+            header_error_msg="pin_groups = first_last: expected ALL middle layers offloaded to CPU",
+        )
+
         model = self.get_model()
         callable_by_submodule = DummyCallableBySubmodule(pin_targets=[model.blocks[0], model.blocks[-1]])
-        model.enable_group_offload(**default_parameters, 
-                                   pin_groups=callable_by_submodule)
+        model.enable_group_offload(**default_parameters, pin_groups=callable_by_submodule)
         param_modules = get_param_modules_from_execution_order(model)
-        assert_callables_offloading_tests(param_modules, 
-                                          callable_by_submodule,
-                                          header_error_msg="pin_groups with callable(submodule)")
+        assert_callables_offloading_tests(
+            param_modules, callable_by_submodule, header_error_msg="pin_groups with callable(submodule)"
+        )
 
         model = self.get_model()
         callable_by_name_submodule = DummyCallableByNameSubmodule(pin_targets=[model.blocks[0], model.blocks[-1]])
-        model.enable_group_offload(**default_parameters, 
-                                   pin_groups=callable_by_name_submodule)
+        model.enable_group_offload(**default_parameters, pin_groups=callable_by_name_submodule)
         param_modules = get_param_modules_from_execution_order(model)
-        assert_callables_offloading_tests(param_modules, 
-                                          callable_by_name_submodule,
-                                          header_error_msg="pin_groups with callable(name, submodule)")
+        assert_callables_offloading_tests(
+            param_modules, callable_by_name_submodule, header_error_msg="pin_groups with callable(name, submodule)"
+        )
 
         model = self.get_model()
-        callable_by_name_submodule_idx = DummyCallableByNameSubmoduleIdx(pin_targets=[model.blocks[0], model.blocks[-1]])
-        model.enable_group_offload(**default_parameters, 
-                                   pin_groups=callable_by_name_submodule_idx)
+        callable_by_name_submodule_idx = DummyCallableByNameSubmoduleIdx(
+            pin_targets=[model.blocks[0], model.blocks[-1]]
+        )
+        model.enable_group_offload(**default_parameters, pin_groups=callable_by_name_submodule_idx)
         param_modules = get_param_modules_from_execution_order(model)
-        assert_callables_offloading_tests(param_modules, 
-                                          callable_by_name_submodule_idx,
-                                          header_error_msg="pin_groups with callable(name, submodule, idx)")
-        
+        assert_callables_offloading_tests(
+            param_modules,
+            callable_by_name_submodule_idx,
+            header_error_msg="pin_groups with callable(name, submodule, idx)",
+        )
+
     def test_error_raised_if_pin_groups_received_invalid_value(self):
         default_parameters = {
             "onload_device": torch_device,
@@ -553,8 +569,9 @@ def test_error_raised_if_pin_groups_received_invalid_value(self):
             "use_stream": True,
         }
         model = self.get_model()
-        with self.assertRaisesRegex(ValueError, 
-                                    "`pin_groups` must be one of `None`, 'first_last', 'all', or a callable."):
+        with self.assertRaisesRegex(
+            ValueError, "`pin_groups` must be one of `None`, 'first_last', 'all', or a callable."
+        ):
             model.enable_group_offload(
                 **default_parameters,
                 pin_groups="invalid value",
@@ -573,11 +590,6 @@ def test_error_raised_if_pin_groups_received_invalid_callables(self):
             **default_parameters,
             pin_groups=invalid_callable,
         )
-        with self.assertRaisesRegex(TypeError, 
-                                    r"missing\s+\d+\s+required\s+positional\s+argument(s)?:"):
+        with self.assertRaisesRegex(TypeError, r"missing\s+\d+\s+required\s+positional\s+argument(s)?:"):
             with torch.no_grad():
                 model(self.input)
-            
-        
-
-        
\ No newline at end of file

From 1bd4539880e8e2d94c8494d20e8a2d704bd6b34d Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Thu, 11 Dec 2025 23:28:16 +0530
Subject: [PATCH 16/21] Fix disk offload block_modules recursion to avoid extra
 files

---
 src/diffusers/hooks/group_offloading.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index eaf195291885..f73a5a470cae 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -15,7 +15,7 @@
 import hashlib
 import os
 from contextlib import contextmanager, nullcontext
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from enum import Enum
 from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
@@ -751,10 +751,14 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
     for name, submodule in module.named_children():
         # Check if this is an explicitly defined block module
         if block_modules and name in block_modules:
-            # Apply block offloading to the specified submodule
-            _apply_block_offloading_to_submodule(
-                submodule, name, config, modules_with_group_offloading, matched_module_groups
-            )
+            # Track submodule using a prefix to avoid filename collisions during disk offload.
+            # Without this, submodules sharing the same model class would be assigned identical
+            # filenames (derived from the class name).
+            prefix = f"{config.module_prefix}{name}." if config.module_prefix else f"{name}."
+            submodule_config = replace(config, module_prefix=prefix)
+
+            _apply_group_offloading_block_level(submodule, submodule_config)
+            modules_with_group_offloading.add(name)
         elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
             # Handle ModuleList and Sequential blocks as before
             for i in range(0, len(submodule), config.num_blocks_per_group):

From 93c253fb0316d25f8879fd23e7e8d1f977eb39a8 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Fri, 12 Dec 2025 14:31:27 +0530
Subject: [PATCH 17/21] Prefix block offload group ids with module prefix

---
 src/diffusers/hooks/group_offloading.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index f73a5a470cae..036b73c189ec 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -766,7 +766,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
                 if len(current_modules) == 0:
                     continue
 
-                group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+                group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
                 group = ModuleGroup(
                     modules=current_modules,
                     offload_device=config.offload_device,
@@ -819,7 +819,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             stream=None,
             record_stream=False,
             onload_self=True,
-            group_id=f"{module.__class__.__name__}_unmatched_group",
+            group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group",
         )
         if config.stream is None:
             _apply_group_offloading_hook(module, unmatched_group, config=config)

From 8d059e60f678698e00828354657b5d19156453c8 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Fri, 12 Dec 2025 23:49:05 +0530
Subject: [PATCH 18/21] Attach group offload hook to root when fully grouped

---
 src/diffusers/hooks/group_offloading.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 036b73c189ec..8a89c58724e0 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -825,6 +825,25 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
             _apply_group_offloading_hook(module, unmatched_group, config=config)
         else:
             _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+    elif config.stream is None and config.offload_to_disk_path is None:
+        # Ensure the top-level module always has a hook when no unmatched modules/params/buffers,
+        # to satisfy hook presence checks in tests. Using an empty group avoids extra offload files.
+        empty_group = ModuleGroup(
+            modules=[],
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=None,
+            offload_leader=module,
+            onload_leader=module,
+            parameters=[],
+            buffers=[],
+            non_blocking=False,
+            stream=None,
+            record_stream=False,
+            onload_self=True,
+            group_id=f"{config.module_prefix}{module.__class__.__name__}_empty_group",
+        )
+        _apply_group_offloading_hook(module, empty_group, config=config)
 
 
 def _apply_block_offloading_to_submodule(

From b950c747b5c07e8835f1a8ca5415f421181e02b5 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 13 Dec 2025 00:43:06 +0530
Subject: [PATCH 19/21] Fix leaf-level group offload root hook

---
 src/diffusers/hooks/group_offloading.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 8a89c58724e0..c300684d5608 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -982,6 +982,28 @@ def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOff
         )
         _apply_group_offloading_hook(parent_module, group, config=config)
 
+    # Ensure the top-level module also has a group_offloading hook so hook presence checks pass,
+    # even when it holds no parameters/buffers itself.
+    if config.stream is None:
+        root_registry = HookRegistry.check_if_exists_or_initialize(module)
+        if root_registry.get_hook(_GROUP_OFFLOADING) is None:
+            empty_group = ModuleGroup(
+                modules=[],
+                offload_device=config.offload_device,
+                onload_device=config.onload_device,
+                offload_to_disk_path=None,
+                offload_leader=module,
+                onload_leader=module,
+                parameters=[],
+                buffers=[],
+                non_blocking=False,
+                stream=None,
+                record_stream=False,
+                onload_self=True,
+                group_id=f"{config.module_prefix}{module.__class__.__name__}_empty_group",
+            )
+            root_registry.register_hook(GroupOffloadingHook(empty_group, config=config), _GROUP_OFFLOADING)
+
     if config.stream is not None:
         # When using streams, we need to know the layer execution order for applying prefetching (to overlap data transfer
         # and computation). Since we don't know the order beforehand, we apply a lazy prefetching hook that will find the

From 8da39a3317fabd2076dd253ddaad71954d5dc025 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 13 Dec 2025 17:40:05 +0530
Subject: [PATCH 20/21] Apply style fixes after lint

---
 src/diffusers/hooks/group_offloading.py | 5 ++---
 tests/hooks/test_group_offloading.py    | 4 +---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index c300684d5608..be5e22c5153c 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -213,7 +213,6 @@ def _onload_from_memory(self):
             self.stream.synchronize()
 
         context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
-        default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None
         with context:
             if self.stream is not None:
                 with self._pinned_memory_tensors() as pinned_memory:
@@ -729,8 +728,8 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
-    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading
-    is done at the top-level blocks and modules specified in block_modules.
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is
+    done at the top-level blocks and modules specified in block_modules.
 
     When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
     module, we either offload the entire submodule or recursively apply block offloading to it.
diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 76bde244c06e..d7c8bf158381 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -26,8 +26,6 @@
 from diffusers.utils import get_logger
 from diffusers.utils.import_utils import compare_versions
 
-from typing import Any, Iterable, List, Optional, Sequence, Union
-
 from ..testing_utils import (
     backend_empty_cache,
     backend_max_memory_allocated,
@@ -150,7 +148,7 @@ def __init__(self):
     def post_forward(self, module, output):
         self.outputs.append(output)
         return output
-    
+
 
 # Test for https://github.com/huggingface/diffusers/pull/12747
 class DummyCallableBySubmodule:

From 6c5e41a731c19fca0d252ba50bda7e37116818b2 Mon Sep 17 00:00:00 2001
From: Aki-07 <akileshramesh2003@gmail.com>
Date: Sat, 13 Dec 2025 18:07:41 +0530
Subject: [PATCH 21/21] Avoid eager offload before adapters load

---
 src/diffusers/hooks/group_offloading.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index be5e22c5153c..4d340e978de2 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -291,8 +291,6 @@ def __init__(self, group: ModuleGroup, *, config: GroupOffloadingConfig) -> None
         self.config = config
 
     def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
-        if self.group.offload_leader == module:
-            self.group.offload_()
         return module
 
     def pre_forward(self, module: torch.nn.Module, *args, **kwargs):