src/diffusers/loaders/textual_inversion.py (4 changes: 2 additions & 2 deletions)

@@ -449,9 +449,9 @@ def load_textual_inversion(

         # 7.5 Offload the model again
         if is_model_cpu_offload:
-            self.enable_model_cpu_offload()
+            self.enable_model_cpu_offload(device=device)
         elif is_sequential_cpu_offload:
-            self.enable_sequential_cpu_offload()
+            self.enable_sequential_cpu_offload(device=device)

         # / Unsafe Code >
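The re-enable path above now honors the caller's accelerator instead of the previous hard-coded CUDA default. A minimal usage sketch of the behavior this enables; the checkpoint, the concept repo, and the "xpu" device string are illustrative assumptions, not taken from this diff:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# Offload onto an Intel GPU; use "cuda" on NVIDIA hardware.
pipe.enable_model_cpu_offload(device="xpu")

# load_textual_inversion() temporarily removes the offload hooks, then
# restores them with enable_model_cpu_offload(device=...) on the same
# accelerator, rather than silently falling back to "cuda".
pipe.load_textual_inversion("sd-concepts-library/cat-toy")
image = pipe("a <cat-toy> sitting on a bench", num_inference_steps=20).images[0]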
src/diffusers/utils/testing_utils.py (40 changes: 40 additions & 0 deletions)

@@ -320,6 +320,21 @@ def require_torch_multi_gpu(test_case):
     return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)


+def require_torch_multi_accelerator(test_case):
+    """
+    Decorator marking a test that requires a multi-accelerator setup (in PyTorch). These tests are skipped on a
+    machine without multiple hardware accelerators.
+    """
+    if not is_torch_available():
+        return unittest.skip("test requires PyTorch")(test_case)
+
+    import torch
+
+    return unittest.skipUnless(
+        torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators"
+    )(test_case)
+
+
 def require_torch_accelerator_with_fp16(test_case):
     """Decorator marking a test that requires an accelerator with support for the FP16 data type."""
     return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")(
@@ -354,6 +369,31 @@ def require_big_gpu_with_torch_cuda(test_case):
     )(test_case)


+def require_big_accelerator(test_case):
+    """
+    Decorator marking a test that requires a big hardware accelerator (at least 24GB of memory) for execution. Some
+    example pipelines: Flux, SD3, Cog, etc.
+    """
+    if not is_torch_available():
+        return unittest.skip("test requires PyTorch")(test_case)
+
+    import torch
+
+    if not (torch.cuda.is_available() or torch.xpu.is_available()):
+        return unittest.skip("test requires a CUDA or XPU hardware accelerator")(test_case)
+
+    if torch.xpu.is_available():
+        device_properties = torch.xpu.get_device_properties(0)
+    else:
+        device_properties = torch.cuda.get_device_properties(0)
+
+    total_memory = device_properties.total_memory / (1024**3)
+    return unittest.skipUnless(
+        total_memory >= BIG_GPU_MEMORY,
+        f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
+    )(test_case)
+
+
 def require_torch_accelerator_with_training(test_case):
     """Decorator marking a test that requires an accelerator with support for training."""
     return unittest.skipUnless(
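A sketch of how the two new decorators would be used in a test module, assuming a PyTorch build that exposes torch.xpu; the test bodies are placeholders, not tests from this PR:

import unittest

import torch

from diffusers.utils.testing_utils import (
    require_big_accelerator,
    require_torch_multi_accelerator,
    torch_device,
)


class ExampleDeviceAgnosticTests(unittest.TestCase):
    @require_torch_multi_accelerator
    def test_needs_two_accelerators(self):
        # Skipped unless torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1.
        self.assertGreater(max(torch.cuda.device_count(), torch.xpu.device_count()), 1)

    @require_big_accelerator
    def test_needs_a_24gb_device(self):
        # Skipped unless the first CUDA or XPU device has at least BIG_GPU_MEMORY GB.
        tensor = torch.ones(1024, device=torch_device)
        self.assertEqual(tensor.sum().item(), 1024)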
@@ -124,7 +124,7 @@ def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x
         return model

     def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             return torch.Generator(device=generator_device).manual_seed(seed)
         return torch.manual_seed(seed)
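The helper now simply seeds a generator on the active test device (the "mps" branch below it still falls back to the default CPU generator). For reference, a short sketch of what this resolves to at runtime, assuming a PyTorch build that exposes torch.xpu:

import torch

# Illustrative only: pick whichever accelerator is present, mirroring how the
# test suite's torch_device is chosen.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
else:
    device = "cpu"

generator = torch.Generator(device=device).manual_seed(0)
noise = torch.randn(1, 4, 32, 32, generator=generator, device=device)
print(noise.device)  # e.g. cuda:0, xpu:0, or cpu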
tests/models/autoencoders/test_models_autoencoder_kl.py (4 changes: 2 additions & 2 deletions)

@@ -165,7 +165,7 @@ def test_output_pretrained(self):
         model.eval()

         # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             generator = torch.Generator(device=generator_device).manual_seed(0)
         else:
@@ -263,7 +263,7 @@ def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False)
         return model

     def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             return torch.Generator(device=generator_device).manual_seed(seed)
         return torch.manual_seed(seed)
@@ -183,7 +183,7 @@ def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp
         return model

     def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             return torch.Generator(device=generator_device).manual_seed(seed)
         return torch.manual_seed(seed)
tests/models/test_modeling_common.py (4 changes: 2 additions & 2 deletions)

@@ -63,7 +63,7 @@
     require_torch_accelerator,
     require_torch_accelerator_with_training,
     require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
     run_test_in_subprocess,
     torch_all_close,
     torch_device,
@@ -1227,7 +1227,7 @@ def test_disk_offload_with_safetensors(self):

         self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))

-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_model_parallelism(self):
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
tests/pipelines/controlnet_sd3/test_controlnet_sd3.py (15 changes: 8 additions & 7 deletions)

@@ -31,9 +31,10 @@
 from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -219,20 +220,20 @@ def test_xformers_attention_forwardGenerator_pass(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3ControlNetPipeline

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_canny(self):
         controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
@@ -272,7 +273,7 @@ def test_pose(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -304,7 +305,7 @@ def test_tile(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -338,7 +339,7 @@ def test_multi_controlnet(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
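The setUp/tearDown changes above swap the hard-coded torch.cuda.empty_cache() for backend_empty_cache(torch_device), which flushes whichever backend the test device names. A rough, simplified sketch of the idea; the real helper in diffusers.utils.testing_utils dispatches through a per-backend table, so this is an approximation, not the library code:

import torch


def empty_cache_for(device: str) -> None:
    """Simplified stand-in for backend_empty_cache: flush the allocator cache
    of whichever accelerator backend the device string names."""
    device_type = str(device).split(":")[0]  # "cuda:0" -> "cuda"
    if device_type == "cuda":
        torch.cuda.empty_cache()
    elif device_type == "xpu":
        torch.xpu.empty_cache()
    elif device_type == "mps":
        torch.mps.empty_cache()
    # "cpu" has no allocator cache to flush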
tests/pipelines/flux/test_pipeline_flux.py (10 changes: 5 additions & 5 deletions)

@@ -12,7 +12,7 @@
     backend_empty_cache,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -204,7 +204,7 @@ def test_flux_true_cfg(self):


 @nightly
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxPipeline
@@ -292,7 +292,7 @@ def test_flux_inference(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxPipeline
@@ -304,12 +304,12 @@ class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
tests/pipelines/flux/test_pipeline_flux_redux.py (11 changes: 6 additions & 5 deletions)

@@ -8,15 +8,16 @@
 from diffusers import FluxPipeline, FluxPriorReduxPipeline
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxReduxSlowTests(unittest.TestCase):
     pipeline_class = FluxPriorReduxPipeline
@@ -27,12 +28,12 @@ class FluxReduxSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         init_image = load_image(
@@ -59,7 +60,7 @@ def test_flux_redux_inference(self):
             self.base_repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
         )
         pipe_redux.to(torch_device)
-        pipe_base.enable_model_cpu_offload()
+        pipe_base.enable_model_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device)
         base_pipeline_inputs = self.get_base_pipeline_inputs(torch_device)
tests/pipelines/pag/test_pag_sd3_img2img.py (2 changes: 1 addition & 1 deletion)

@@ -262,7 +262,7 @@ def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(
             self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.(4|17)"]
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0, pag_scale=1.8)
tests/pipelines/stable_diffusion/test_stable_diffusion.py (10 changes: 5 additions & 5 deletions)

@@ -57,7 +57,7 @@
     require_accelerate_version_greater,
     require_torch_2,
     require_torch_accelerator,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
     run_test_in_subprocess,
     skip_mps,
     slow,
@@ -1409,7 +1409,7 @@ def test_stable_diffusion_euler(self):

 # (sayakpaul): This test suite was run in the DGX with two GPUs (1, 2).
 @slow
-@require_torch_multi_gpu
+@require_torch_multi_accelerator
 @require_accelerate_version_greater("0.27.0")
 class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
     def tearDown(self):
@@ -1497,7 +1497,7 @@ def test_reset_device_map_to(self):
         assert sd_pipe_with_device_map.hf_device_map is None

         # Make sure `to()` can be used and the pipeline can be called.
-        pipe = sd_pipe_with_device_map.to("cuda")
+        pipe = sd_pipe_with_device_map.to(torch_device)
         _ = pipe("hello", num_inference_steps=2)

     def test_reset_device_map_enable_model_cpu_offload(self):
@@ -1509,7 +1509,7 @@ def test_reset_device_map_enable_model_cpu_offload(self):
         assert sd_pipe_with_device_map.hf_device_map is None

         # Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
-        sd_pipe_with_device_map.enable_model_cpu_offload()
+        sd_pipe_with_device_map.enable_model_cpu_offload(device=torch_device)
         _ = sd_pipe_with_device_map("hello", num_inference_steps=2)

     def test_reset_device_map_enable_sequential_cpu_offload(self):
@@ -1521,5 +1521,5 @@ def test_reset_device_map_enable_sequential_cpu_offload(self):
         assert sd_pipe_with_device_map.hf_device_map is None

         # Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
-        sd_pipe_with_device_map.enable_sequential_cpu_offload()
+        sd_pipe_with_device_map.enable_sequential_cpu_offload(device=torch_device)
         _ = sd_pipe_with_device_map("hello", num_inference_steps=2)
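The three tests above all exercise the same flow: a sharded pipeline has its device map reset, after which plain placement calls become legal again on whatever accelerator is active. A condensed sketch of that flow; the checkpoint id, prompt, and step count are illustrative:

import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import torch_device

# Shard the pipeline across available accelerators.
pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    device_map="balanced",
    torch_dtype=torch.float16,
)

pipe.reset_device_map()       # hf_device_map becomes None
pipe = pipe.to(torch_device)  # .to(), or either offload helper, now works again
_ = pipe("hello", num_inference_steps=2)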
@@ -10,7 +10,7 @@
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -232,7 +232,7 @@ def test_skip_guidance_layers(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3PipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Pipeline
@@ -18,7 +18,7 @@
     backend_empty_cache,
     floats_tensor,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -166,7 +166,7 @@ def test_multi_vae(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
@@ -202,11 +202,10 @@ def get_inputs(self, device, seed=0):
         }

     def test_sd3_img2img_inference(self):
-        torch.manual_seed(0)
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device)
-
         image = pipe(**inputs).images[0]
         image_slice = image[0, :10, :10]
         expected_slice = np.array(