huggingface
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
Lines changed: 2 additions & 4 deletions
diff --git a/src/diffusers/pipelines/consisid/consisid_utils.py b/src/diffusers/pipelines/consisid/consisid_utils.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
Lines changed: 5 additions & 1 deletion
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
Lines changed: 3 additions & 3 deletions
@@ -1199,11 +1199,11 @@ def apply_rotary_emb(
 
         if use_real_unbind_dim == -1:
             # Used for flux, cogvideox, hunyuan-dit
-            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, H, S, D//2]
             x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
         elif use_real_unbind_dim == -2:
             # Used for Stable Audio, OmniGen, CogView4 and Cosmos
-            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, H, S, D//2]
             x_rotated = torch.cat([-x_imag, x_real], dim=-1)
         else:
             raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
 
@@ -481,7 +481,7 @@ def forward(
 
 def apply_rotary_emb(x, freqs):
     cos, sin = freqs
-    x_real, x_imag = x.unflatten(2, (-1, 2)).unbind(-1)  # [B, S, H, D // 2]
+    x_real, x_imag = x.unflatten(2, (-1, 2)).unbind(-1)  # [B, S, C // 2]
     x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(2)
     out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
     return out
@@ -41,7 +41,7 @@
     replace_example_docstring,
 )
 from ...utils.import_utils import is_transformers_version
-from ...utils.torch_utils import randn_tensor
+from ...utils.torch_utils import empty_device_cache, randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
 
@@ -267,9 +267,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
 
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            device_mod = getattr(torch, device.type, None)
-            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+            empty_device_cache(device.type)
 
         model_sequence = [
             self.text_encoder.text_model,
 
@@ -294,7 +294,7 @@ def prepare_face_models(model_path, device, dtype):
 
     Parameters:
     - model_path: Path to the directory containing model files.
-    - device: The device (e.g., 'cuda', 'cpu') where models will be loaded.
+    - device: The device (e.g., 'cuda', 'xpu', 'cpu') where models will be loaded.
     - dtype: Data type (e.g., torch.float32) for model inference.
 
     Returns:
 
@@ -37,7 +37,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, is_torch_version, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -1339,7 +1339,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
 
@@ -36,7 +36,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -1311,7 +1311,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
 
@@ -38,7 +38,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -1500,7 +1500,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
 
@@ -51,7 +51,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 
@@ -1858,7 +1858,7 @@ def denoising_value_valid(dnv):
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
 
@@ -1465,7 +1465,11 @@ def __call__(
 
                 # Relevant thread:
                 # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
-                if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+                if (
+                    torch.cuda.is_available()
+                    and (is_unet_compiled and is_controlnet_compiled)
+                    and is_torch_higher_equal_2_1
+                ):
                     torch._inductor.cudagraph_mark_step_begin()
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
 
@@ -53,7 +53,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 
@@ -921,7 +921,7 @@ def prepare_latents(
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         image = image.to(device=device, dtype=dtype)
 
@@ -1632,7 +1632,7 @@ def __call__(
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.unet.to("cpu")
             self.controlnet.to("cpu")
-            torch.cuda.empty_cache()
+            empty_device_cache()
 
         if not output_type == "latent":
             # make sure the VAE is in float32 mode, as it overflows in float16