
Commit 954bb7d

handle .to() when group offload applied
1 parent 8804d74 commit 954bb7d

File tree: 4 files changed, +80 −19 lines

src/diffusers/hooks/group_offloading.py

Lines changed: 7 additions & 0 deletions
@@ -662,3 +662,10 @@ def _raise_error_if_accelerate_model_or_sequential_hook_present(module: torch.nn
                 f"offloading strategy from Accelerate. If you want to apply group offloading, please "
                 f"disable the existing offloading strategy first. Offending module: {name} ({type(submodule)})"
             )
+
+
+def _is_group_offload_enabled(module: torch.nn.Module) -> bool:
+    for submodule in module.modules():
+        if hasattr(submodule, "_diffusers_hook") and submodule._diffusers_hook.get_hook(_GROUP_OFFLOADING) is not None:
+            return True
+    return False
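
For context, the new `_is_group_offload_enabled` helper walks every submodule and reports whether a group-offloading hook has been registered in its `_diffusers_hook` registry. A minimal sketch of how it behaves, using an illustrative toy module (not from this commit) and assuming a CUDA device:

# Illustrative sketch only, not part of the diff; the toy module and devices are assumptions.
import torch

from diffusers.hooks import apply_group_offloading
from diffusers.hooks.group_offloading import _is_group_offload_enabled


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A ModuleList gives block_level offloading some blocks to group.
        self.blocks = torch.nn.ModuleList([torch.nn.Linear(8, 8) for _ in range(4)])

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x


model = TinyModel()
print(_is_group_offload_enabled(model))  # False: no hooks attached yet

apply_group_offloading(
    model,
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="block_level",
    num_blocks_per_group=2,
)
print(_is_group_offload_enabled(model))  # True: group-offloading hooks are now registered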

src/diffusers/models/modeling_utils.py

Lines changed: 20 additions & 0 deletions
@@ -1245,8 +1245,21 @@ def cuda(self, *args, **kwargs):
     # Adapted from `transformers`.
     @wraps(torch.nn.Module.to)
     def to(self, *args, **kwargs):
+        from ..hooks.group_offloading import _is_group_offload_enabled
+
+        device_arg_or_kwarg_present = any(isinstance(arg, torch.device) for arg in args) or "device" in kwargs
         dtype_present_in_args = "dtype" in kwargs

+        # Try converting arguments to torch.device in case they are passed as strings
+        for arg in args:
+            if not isinstance(arg, str):
+                continue
+            try:
+                torch.device(arg)
+                device_arg_or_kwarg_present = True
+            except RuntimeError:
+                pass
+
         if not dtype_present_in_args:
             for arg in args:
                 if isinstance(arg, torch.dtype):
@@ -1271,6 +1284,13 @@ def to(self, *args, **kwargs):
                     "Calling `to()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. "
                     f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2."
                 )
+
+        if _is_group_offload_enabled(self) and device_arg_or_kwarg_present:
+            logger.warning(
+                f"The module '{self.__class__.__name__}' is group offloaded and moving it using `.to()` is not supported."
+            )
+            return self
+
         return super().to(*args, **kwargs)

     # Taken from `transformers`.
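
The net effect of this change: calling `.to()` with a device (positional or keyword, `torch.device` or string) on a group-offloaded model now logs a warning and returns the model unchanged, while dtype-only casts still fall through to `torch.nn.Module.to`. A rough usage sketch; the checkpoint ID is illustrative and a CUDA device is assumed:

import torch

from diffusers import UNet2DConditionModel

# Illustrative checkpoint; any ModelMixin subclass behaves the same once group offloading is enabled.
unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="unet")
unet.enable_group_offload(torch.device("cuda"), offload_type="block_level", num_blocks_per_group=3)

# Logs the "is group offloaded and moving it using `.to()` is not supported" warning
# and returns the module as-is; no device transfer happens.
unet = unet.to("cuda")

# Only device arguments trigger the early return, so a dtype-only cast is still
# forwarded to torch.nn.Module.to().
unet = unet.to(dtype=torch.float16)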

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 31 additions & 19 deletions
@@ -394,6 +394,7 @@ def to(self, *args, **kwargs):
             )

         device = device or device_arg
+        device_type = torch.device(device).type if device is not None else None
         pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items())

         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
@@ -424,7 +425,7 @@ def module_is_offloaded(module):
                 "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
             )

-        if device and torch.device(device).type == "cuda":
+        if device_type == "cuda":
             if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
                 raise ValueError(
                     "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
@@ -437,7 +438,7 @@ def module_is_offloaded(module):

         # Display a warning in this case (the operation succeeds but the benefits are lost)
         pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items())
-        if pipeline_is_offloaded and device and torch.device(device).type == "cuda":
+        if pipeline_is_offloaded and device_type == "cuda":
             logger.warning(
                 f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading."
             )
@@ -449,6 +450,7 @@ def module_is_offloaded(module):
         is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded
         for module in modules:
             _, is_loaded_in_4bit_bnb, is_loaded_in_8bit_bnb = _check_bnb_status(module)
+            is_group_offloaded = self._maybe_raise_error_if_group_offload_active(module=module)

             if (is_loaded_in_4bit_bnb or is_loaded_in_8bit_bnb) and dtype is not None:
                 logger.warning(
@@ -460,11 +462,21 @@ def module_is_offloaded(module):
                     f"The module '{module.__class__.__name__}' has been loaded in `bitsandbytes` 8bit and moving it to {device} via `.to()` is not supported. Module is still on {module.device}."
                 )

+            # Note: we also handle this at the ModelMixin level. The reason for doing it here too is that modeling
+            # components can be from outside diffusers too, but still have group offloading enabled.
+            if (
+                self._maybe_raise_error_if_group_offload_active(raise_error=False, module=module)
+                and device is not None
+            ):
+                logger.warning(
+                    f"The module '{module.__class__.__name__}' is group offloaded and moving it to {device} via `.to()` is not supported."
+                )
+
             # This can happen for `transformer` models. CPU placement was added in
             # https://github.com/huggingface/transformers/pull/33122. So, we guard this accordingly.
             if is_loaded_in_4bit_bnb and device is not None and is_transformers_version(">", "4.44.0"):
                 module.to(device=device)
-            elif not is_loaded_in_4bit_bnb and not is_loaded_in_8bit_bnb:
+            elif not is_loaded_in_4bit_bnb and not is_loaded_in_8bit_bnb and not is_group_offloaded:
                 module.to(device, dtype)

             if (
@@ -1075,7 +1087,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 default to "cuda".
         """
-        self._check_group_offloading_inactive_or_raise_error()
+        self._maybe_raise_error_if_group_offload_active(raise_error=True)

         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
@@ -1188,7 +1200,7 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 default to "cuda".
         """
-        self._check_group_offloading_inactive_or_raise_error()
+        self._maybe_raise_error_if_group_offload_active(raise_error=True)

         if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
             from accelerate import cpu_offload
@@ -1914,23 +1926,23 @@ def from_pipe(cls, pipeline, **kwargs):

         return new_pipeline

-    def _check_group_offloading_inactive_or_raise_error(self) -> None:
-        from ..hooks import HookRegistry
-        from ..hooks.group_offloading import _GROUP_OFFLOADING
+    def _maybe_raise_error_if_group_offload_active(
+        self, raise_error: bool = False, module: Optional[torch.nn.Module] = None
+    ) -> bool:
+        from ..hooks.group_offloading import _is_group_offload_enabled

-        for name, component in self.components.items():
-            if not isinstance(component, torch.nn.Module):
-                continue
-            for module in component.modules():
-                if not hasattr(module, "_diffusers_hook"):
-                    continue
-                registry: HookRegistry = module._diffusers_hook
-                if registry.get_hook(_GROUP_OFFLOADING) is not None:
+        components = self.components.values() if module is None else [module]
+        components = [component for component in components if isinstance(component, torch.nn.Module)]
+        for component in components:
+            if _is_group_offload_enabled(component):
+                if raise_error:
                     raise ValueError(
-                        f"You are trying to apply model/sequential CPU offloading to a pipeline that contains "
-                        f"components with group offloading enabled. This is not supported. Please disable group "
-                        f"offloading for the '{name}' component of the pipeline to use other offloading methods."
+                        "You are trying to apply model/sequential CPU offloading to a pipeline that contains components "
+                        "with group offloading enabled. This is not supported. Please disable group offloading for "
+                        "components of the pipeline to use other offloading methods."
                     )
+                return True
+        return False


 class StableDiffusionMixin:
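
At the pipeline level, the same situation is handled per component: `pipeline.to()` warns about and skips any group-offloaded component instead of moving it, and `enable_model_cpu_offload()` / `enable_sequential_cpu_offload()` now raise through `_maybe_raise_error_if_group_offload_active(raise_error=True)`. A hedged sketch of the resulting behavior (checkpoint ID and devices are illustrative, CUDA assumed):

import torch

from diffusers import DiffusionPipeline

# Illustrative checkpoint; the behavior applies to any pipeline containing a group-offloaded component.
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16)
pipe.unet.enable_group_offload(torch.device("cuda"), offload_type="block_level", num_blocks_per_group=3)

# The UNet is skipped with a "is group offloaded and moving it ... via `.to()` is not supported"
# warning; the remaining components are moved as usual.
pipe.to("cuda")

# Model/sequential CPU offloading is mutually exclusive with group offloading, so this raises.
try:
    pipe.enable_model_cpu_offload()
except ValueError as err:
    print(err)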

tests/hooks/test_group_offloading.py

Lines changed: 22 additions & 0 deletions
@@ -19,6 +19,7 @@

 from diffusers.models import ModelMixin
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import get_logger
 from diffusers.utils.testing_utils import require_torch_gpu, torch_device


@@ -153,6 +154,27 @@ def run_forward(model):
         # Memory assertions - offloading should reduce memory usage
         self.assertTrue(mem4 <= mem5 < mem2 < mem3 < mem1 < mem_baseline)

+    def test_warning_logged_if_group_offloaded_module_moved_to_cuda(self):
+        if torch.device(torch_device).type != "cuda":
+            return
+        self.model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=3)
+        logger = get_logger("diffusers.models.modeling_utils")
+        logger.setLevel("INFO")
+        with self.assertLogs(logger, level="WARNING") as cm:
+            self.model.to(torch_device)
+        self.assertIn(f"The module '{self.model.__class__.__name__}' is group offloaded", cm.output[0])
+
+    def test_warning_logged_if_group_offloaded_pipe_moved_to_cuda(self):
+        if torch.device(torch_device).type != "cuda":
+            return
+        pipe = DummyPipeline(self.model)
+        self.model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=3)
+        logger = get_logger("diffusers.pipelines.pipeline_utils")
+        logger.setLevel("INFO")
+        with self.assertLogs(logger, level="WARNING") as cm:
+            pipe.to(torch_device)
+        self.assertIn(f"The module '{self.model.__class__.__name__}' is group offloaded", cm.output[0])
+
     def test_error_raised_if_streams_used_and_no_cuda_device(self):
         original_is_available = torch.cuda.is_available
         torch.cuda.is_available = lambda: False