 logger = get_logger(__name__)  # pylint: disable=invalid-name


-class PinnedGroupManager:
-    """
-    Manages a sliding window of pinned module groups to limit total CPU memory usage.
-    Only keeps up to max_pinned_groups pinned at any given time, unpinning oldest
-    ones when the limit is reached.
-    """
-
-    def __init__(self, max_pinned_groups: Optional[int] = None):
-        self.max_pinned_groups = max_pinned_groups
-        self.pinned_groups = []
-
-    def register_group(self, group: "ModuleGroup") -> None:
-        """Register a group with the manager"""
-        if self.max_pinned_groups is not None:
-            group._pinned_group_manager = self
-
-    def on_group_pinned(self, group: "ModuleGroup") -> None:
-        """Called when a group is pinned, manages the sliding window"""
-        if self.max_pinned_groups is None:
-            return
-
-        # Add the newly pinned group to our tracking list
-        if group not in self.pinned_groups:
-            self.pinned_groups.append(group)
-
-        # If we've exceeded our limit, unpin the oldest group(s)
-        while len(self.pinned_groups) > self.max_pinned_groups:
-            oldest_group = self.pinned_groups.pop(0)
-            # Only unpin if it's not the group we just pinned
-            if oldest_group != group and oldest_group.pinned_memory:
-                oldest_group.unpin_memory_()
-
-    def on_group_unpinned(self, group: "ModuleGroup") -> None:
-        """Called when a group is manually unpinned"""
-        if self.max_pinned_groups is None:
-            return
-
-        # Remove the group from our tracking list
-        if group in self.pinned_groups:
-            self.pinned_groups.remove(group)
+# Removed PinnedGroupManager - we no longer use pinned memory to avoid CPU memory spikes


 # fmt: off
@@ -100,7 +61,6 @@ def __init__(
         stream: Optional[torch.cuda.Stream] = None,
         cpu_param_dict: Optional[Dict[torch.nn.Parameter, torch.Tensor]] = None,
         onload_self: bool = True,
-        pinned_group_manager: Optional["PinnedGroupManager"] = None,
     ) -> None:
         self.modules = modules
         self.offload_device = offload_device
@@ -113,58 +73,38 @@ def __init__(
         self.stream = stream
         self.cpu_param_dict = cpu_param_dict
         self.onload_self = onload_self
-        # Track if we've pinned our group's memory
-        self.pinned_memory = False
-        # Reference to the pinned group manager for sliding window functionality
-        self._pinned_group_manager = pinned_group_manager
+        # Track whether the CPU param dict has been prepared (kept for compatibility)
+        self.cpu_dict_prepared = False

         if self.stream is not None and self.cpu_param_dict is None:
-            raise ValueError("cpu_param_dict must be provided when using stream for data transfer.")
+            # Now we'll create the dict on demand
+            self.cpu_param_dict = {}

     def pin_memory_(self):
-        r"""Pin the memory of this group's parameters for faster transfer."""
-        if self.stream is not None and not self.pinned_memory:
-            # Create the pinned memory dict just for this group's parameters
-            self.cpu_param_dict = {}
+        r"""Prepare the CPU parameter dictionary for reference (no pinning)."""
+        if self.stream is not None and not self.cpu_dict_prepared:
+            # Create a reference-only CPU parameter dict without pinning
             for group_module in self.modules:
                 for param in group_module.parameters():
                     if param.device == self.offload_device:
-                        pinned_data = param.data.pin_memory()
-                        self.cpu_param_dict[param] = pinned_data
-                        param.data = pinned_data
+                        # Store a reference without pinning
+                        self.cpu_param_dict[param] = param.data
             if self.parameters is not None:
                 for param in self.parameters:
                     if param.device == self.offload_device:
-                        pinned_data = param.data.pin_memory()
-                        self.cpu_param_dict[param] = pinned_data
-                        param.data = pinned_data
+                        # Store a reference without pinning
+                        self.cpu_param_dict[param] = param.data
+            self.cpu_dict_prepared = True
+            # For API compatibility
             self.pinned_memory = True

-            # Notify the manager that this group has been pinned
-            if hasattr(self, "_pinned_group_manager") and self._pinned_group_manager is not None:
-                self._pinned_group_manager.on_group_pinned(self)
-
     def unpin_memory_(self):
-        r"""Unpin the memory of this group's parameters to free up CPU RAM."""
+        r"""Kept for API compatibility; only resets bookkeeping flags, since memory is no longer pinned."""
+        # No need to unpin since we're not pinning memory anymore
         if self.stream is not None and self.pinned_memory:
-            # Only unpin if we're currently on CPU (i.e., offloaded)
-            for group_module in self.modules:
-                if not any(p.device == self.onload_device for p in group_module.parameters()):
-                    for param in group_module.parameters():
-                        if param in self.cpu_param_dict and param.device == self.offload_device:
-                            # Create a new non-pinned copy and replace
-                            param.data = param.data.clone()
-            if self.parameters is not None:
-                for param in self.parameters:
-                    if param in self.cpu_param_dict and param.device == self.offload_device:
-                        param.data = param.data.clone()
-            # Clear the CPU param dict
-            self.cpu_param_dict = {}
+            # Just mark as unpinned for compatibility
             self.pinned_memory = False
-
-            # Notify the manager that this group has been unpinned
-            if hasattr(self, "_pinned_group_manager") and self._pinned_group_manager is not None:
-                self._pinned_group_manager.on_group_unpinned(self)
+            self.cpu_dict_prepared = False

     def onload_(self):
         r"""Onloads the group of modules to the onload_device."""
@@ -378,7 +318,6 @@ def apply_group_offloading(
     non_blocking: bool = False,
     use_stream: bool = False,
     unpin_after_use: bool = False,
-    max_pinned_groups: Optional[int] = None,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -426,13 +365,7 @@ def apply_group_offloading(
             If True, offloading and onloading is done asynchronously using a CUDA stream. This can be useful for
             overlapping computation and data transfer.
         unpin_after_use (`bool`, defaults to `False`):
-            If True, pinned memory is released after a module group has been offloaded to CPU. This reduces CPU memory
-            usage at the cost of potentially slower data transfer when the group is loaded again. Useful for large models
-            with limited CPU RAM.
-        max_pinned_groups (`int`, *optional*):
-            If set, limits the number of module groups that can have pinned memory at any given time. When this limit
-            is reached, the oldest pinned groups are unpinned to make room for new ones. This implements a sliding window
-            approach to manage CPU memory usage while maintaining good performance for the most recently used groups.
+            Legacy parameter kept for API compatibility. Has no effect as we no longer use pinned memory.

     Example:
         ```python
@@ -450,8 +383,7 @@ def apply_group_offloading(
         ...     offload_type="block_level",
         ...     num_blocks_per_group=2,
         ...     use_stream=True,
-        ...     unpin_after_use=False,  # Set to True to reduce CPU memory usage
-        ...     max_pinned_groups=5,  # Limit to 5 pinned groups at a time
+        ...     unpin_after_use=False,  # Legacy parameter, no effect
         ... )
         ```
     """
@@ -465,11 +397,7 @@ def apply_group_offloading(

     _raise_error_if_accelerate_model_or_sequential_hook_present(module)

-    # Create a pinned group manager if max_pinned_groups is set
-    pinned_group_manager = None
-    if max_pinned_groups is not None and stream is not None:
-        pinned_group_manager = PinnedGroupManager(max_pinned_groups)
-        logger.info(f"Using sliding window approach with maximum of {max_pinned_groups} pinned groups")
+    # We no longer need a pinned group manager as we're not using pinned memory

     if offload_type == "block_level":
         if num_blocks_per_group is None:
@@ -483,11 +411,10 @@ def apply_group_offloading(
             non_blocking,
             stream,
             unpin_after_use,
-            pinned_group_manager,
         )
     elif offload_type == "leaf_level":
         _apply_group_offloading_leaf_level(
-            module, offload_device, onload_device, non_blocking, stream, unpin_after_use, pinned_group_manager
+            module, offload_device, onload_device, non_blocking, stream, unpin_after_use
         )
     else:
         raise ValueError(f"Unsupported offload_type: {offload_type}")
@@ -501,7 +428,6 @@ def _apply_group_offloading_block_level(
     non_blocking: bool,
     stream: Optional[torch.cuda.Stream] = None,
     unpin_after_use: bool = False,
-    pinned_group_manager: Optional[PinnedGroupManager] = None,
 ) -> None:
     r"""
     This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks. In comparison to
@@ -548,7 +474,6 @@ def _apply_group_offloading_block_level(
             stream=stream,
             cpu_param_dict=cpu_param_dict,
             onload_self=stream is None,
-            pinned_group_manager=pinned_group_manager,
         )
         matched_module_groups.append(group)
         for j in range(i, i + len(current_modules)):
@@ -586,7 +511,6 @@ def _apply_group_offloading_block_level(
         stream=None,
         cpu_param_dict=None,
         onload_self=True,
-        pinned_group_manager=pinned_group_manager,
     )
     next_group = matched_module_groups[0] if len(matched_module_groups) > 0 else None
     _apply_group_offloading_hook(module, unmatched_group, next_group, unpin_after_use)
@@ -599,7 +523,6 @@ def _apply_group_offloading_leaf_level(
     non_blocking: bool,
     stream: Optional[torch.cuda.Stream] = None,
     unpin_after_use: bool = False,
-    pinned_group_manager: Optional[PinnedGroupManager] = None,
 ) -> None:
     r"""
     This function applies offloading to groups of leaf modules in a torch.nn.Module. This method has minimal memory
@@ -641,7 +564,6 @@ def _apply_group_offloading_leaf_level(
             stream=stream,
             cpu_param_dict=cpu_param_dict,
             onload_self=True,
-            pinned_group_manager=pinned_group_manager,
         )
         _apply_group_offloading_hook(submodule, group, None, unpin_after_use)
         modules_with_group_offloading.add(name)
@@ -687,7 +609,6 @@ def _apply_group_offloading_leaf_level(
         stream=stream,
         cpu_param_dict=cpu_param_dict,
         onload_self=True,
-        pinned_group_manager=pinned_group_manager,
     )
     _apply_group_offloading_hook(parent_module, group, None, unpin_after_use)

@@ -707,7 +628,6 @@ def _apply_group_offloading_leaf_level(
         stream=None,
         cpu_param_dict=None,
         onload_self=True,
-        pinned_group_manager=pinned_group_manager,
     )
     _apply_lazy_group_offloading_hook(module, unmatched_group, None)

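For background on why dropping the pinning path removes the CPU memory spikes, here is a minimal sketch (not part of the commit; tensor names and shapes are made up, and `pin_memory()` requires a CUDA-enabled PyTorch build). `Tensor.pin_memory()` returns a new page-locked copy, so the old code transiently held two CPU copies of every parameter it pinned, while the new `cpu_param_dict` stores plain references and allocates nothing:

```python
import torch

# Illustrative only: compare the old pinning path with the new reference-only path.
param = torch.nn.Parameter(torch.randn(1024, 1024))  # ~4 MiB of pageable CPU memory

# Old path: pin_memory() allocates a *new* page-locked buffer and copies the data
# into it, so CPU usage temporarily doubles for this parameter until the old
# pageable copy is freed.
pinned = param.data.pin_memory()
assert pinned.is_pinned()
assert pinned.data_ptr() != param.data.data_ptr()  # distinct storage

# New path: cpu_param_dict just keeps a reference to the existing tensor, so no
# extra CPU memory is allocated.
reference = param.data
assert reference.data_ptr() == param.data.data_ptr()  # same storage
```

The trade-off follows directly: pinned buffers are what make `non_blocking=True` host-to-device copies truly asynchronous on the CUDA stream, so keeping only pageable references holds CPU RAM flat at the cost of transfers that effectively synchronize.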