Skip to content

Commit 335dca8

Browse files
committed
Address review feedback for group offload pinning
1 parent 2e8f538 commit 335dca8

File tree

3 files changed

+27
-41
lines changed

3 files changed

+27
-41
lines changed

src/diffusers/hooks/group_offloading.py

Lines changed: 16 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
from .hooks import HookRegistry, ModelHook
2828

2929

30+
VALID_PIN_GROUPS = {"all", "first_last"}
31+
32+
3033
if is_accelerate_available():
3134
from accelerate.hooks import AlignDevicesHook, CpuOffload
3235
from accelerate.utils import send_to_device
@@ -302,36 +305,19 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
302305
# method is the onload_leader of the group.
303306
if self.group.onload_leader is None:
304307
self.group.onload_leader = module
308+
is_leader = self.group.onload_leader == module
309+
should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
310+
should_orchestrate = self.group.pinned or is_leader
311+
312+
if should_orchestrate:
313+
# Pinned groups keep their params on the onload device; orchestrate onload/prefetch/sync every call.
314+
if self.group.pinned:
315+
if is_leader and not self._is_group_on_device():
316+
self.group.onload_()
317+
else:
318+
if is_leader and self.group.onload_self:
319+
self.group.onload_()
305320

306-
if self.group.pinned:
307-
if self.group.onload_leader == module and not self._is_group_on_device():
308-
self.group.onload_()
309-
310-
should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
311-
if should_onload_next_group:
312-
self.next_group.onload_()
313-
314-
should_synchronize = (
315-
not self.group.onload_self
316-
and self.group.stream is not None
317-
and not should_onload_next_group
318-
and not self.group.record_stream
319-
)
320-
if should_synchronize:
321-
self.group.stream.synchronize()
322-
323-
args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
324-
kwargs = self._send_kwargs_to_device(kwargs)
325-
return args, kwargs
326-
327-
# If the current module is the onload_leader of the group, we onload the group if it is supposed
328-
# to onload itself. In the case of using prefetching with streams, we onload the next group if
329-
# it is not supposed to onload itself.
330-
if self.group.onload_leader == module:
331-
if self.group.onload_self:
332-
self.group.onload_()
333-
334-
should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
335321
if should_onload_next_group:
336322
self.next_group.onload_()
337323

@@ -345,9 +331,7 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
345331
# If this group didn't onload itself, it means it was asynchronously onloaded by the
346332
# previous group. We need to synchronize the side stream to ensure parameters
347333
# are completely loaded to proceed with forward pass. Without this, uninitialized
348-
# weights will be used in the computation, leading to incorrect results
349-
# Also, we should only do this synchronization if we don't already do it from the sync call in
350-
# self.next_group.onload_, hence the `not should_onload_next_group` check.
334+
# weights will be used in the computation, leading to incorrect results.
351335
self.group.stream.synchronize()
352336

353337
args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
@@ -546,9 +530,6 @@ def pre_forward(self, module, *args, **kwargs):
546530
return args, kwargs
547531

548532

549-
VALID_PIN_GROUPS = {"all", "first_last"}
550-
551-
552533
def _validate_pin_groups(pin_groups: Optional[Union[str, Callable]]) -> Optional[Union[str, Callable]]:
553534
if pin_groups is None or callable(pin_groups):
554535
return pin_groups
@@ -708,9 +689,6 @@ def apply_group_offloading(
708689

709690

710691
def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
711-
registry = HookRegistry.check_if_exists_or_initialize(module)
712-
registry._group_offload_pin_groups = config.pin_groups
713-
714692
if config.offload_type == GroupOffloadingType.BLOCK_LEVEL:
715693
_apply_group_offloading_block_level(module, config)
716694
elif config.offload_type == GroupOffloadingType.LEAF_LEVEL:

src/diffusers/models/modeling_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
252252
_parallel_config = None
253253
_cp_plan = None
254254
_skip_keys = None
255+
_group_offload_block_modules = None
255256

256257
def __init__(self):
257258
super().__init__()
@@ -556,6 +557,11 @@ def enable_group_offload(
556557
... use_stream=True,
557558
... )
558559
```
560+
561+
Args:
562+
pin_groups (`"first_last"` | `"all"` | `Callable`, *optional*):
563+
Optionally keep selected groups on the onload device permanently. See
564+
[`~hooks.group_offloading.apply_group_offloading`] for details.
559565
"""
560566
from ..hooks import apply_group_offloading
561567

tests/hooks/test_group_offloading.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import contextlib
1616
import gc
1717
import unittest
18+
from typing import Any, Iterable, List, Optional, Sequence, Union
1819

1920
import torch
2021
from parameterized import parameterized
@@ -34,7 +35,6 @@
3435
torch_device,
3536
)
3637

37-
from typing import Any, Iterable, List, Optional, Sequence, Union
3838

3939
class DummyBlock(torch.nn.Module):
4040
def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
@@ -217,8 +217,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
217217
x = block(x)
218218
x = self.norm(x)
219219
return x
220-
220+
221221
# Test for https://github.com/huggingface/diffusers/pull/12747
222+
223+
222224
class DummyCallableBySubmodule:
223225
"""
224226
Callable group offloading pinner that pins first and last DummyBlock
@@ -633,7 +635,7 @@ def get_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=Non
633635
"layers_per_block": 1,
634636
}
635637
return init_dict
636-
638+
637639
def test_block_level_offloading_with_pin_groups_stay_on_device(self):
638640
if torch.device(torch_device).type not in ["cuda", "xpu"]:
639641
return

0 commit comments

Comments (0)