Commit f6ae056

enable on xpu

1 parent c091bcc
File tree

16 files changed: +114 additions, -102 deletions


src/diffusers/utils/testing_utils.py

Lines changed: 2 additions & 0 deletions
@@ -1137,6 +1137,8 @@ def backend_device_count(device: str):
 def backend_reset_peak_memory_stats(device: str):
     return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS)
 
+def backend_reset_max_memory_allocated(device: str):
+    return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)
 
 def backend_max_memory_allocated(device: str):
     return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED)
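
The new helper mirrors its siblings: it routes through _device_agnostic_dispatch so the caller never names a backend. As a rough sketch of the mechanism, assuming a per-device function table (the table below is illustrative, not the actual diffusers source):

    import torch

    # Illustrative dispatch table; the real one in diffusers.utils.testing_utils
    # may cover more backends (e.g. XPU) and choose different fallbacks.
    BACKEND_RESET_MAX_MEMORY_ALLOCATED = {
        "cuda": torch.cuda.reset_max_memory_allocated,
        "default": None,
    }

    def _device_agnostic_dispatch(device: str, dispatch_table: dict, *args, **kwargs):
        # Pick the backend-specific function; unknown devices fall back to "default".
        fn = dispatch_table.get(device, dispatch_table["default"])
        if fn is None:
            return None  # no-op on backends that do not expose this statistic
        return fn(*args, **kwargs)

    def backend_reset_max_memory_allocated(device: str):
        return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)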

tests/models/unets/test_models_unet_2d_condition.py

Lines changed: 29 additions & 26 deletions
@@ -36,6 +36,9 @@
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
+    backend_max_memory_allocated,
     enable_full_determinism,
     floats_tensor,
     is_peft_available,
@@ -1014,7 +1017,7 @@ def test_load_sharded_checkpoint_from_hub_local(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1025,7 +1028,7 @@ def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @parameterized.expand(
         [
             ("hf-internal-testing/unet2d-sharded-dummy", None),
@@ -1040,7 +1043,7 @@ def test_load_sharded_checkpoint_device_map_from_hub(self, repo_id, variant):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @parameterized.expand(
         [
             ("hf-internal-testing/unet2d-sharded-dummy-subfolder", None),
@@ -1055,7 +1058,7 @@ def test_load_sharded_checkpoint_device_map_from_hub_subfolder(self, repo_id, va
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_device_map_from_hub_local(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
@@ -1065,7 +1068,7 @@ def test_load_sharded_checkpoint_device_map_from_hub_local(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_device_map_from_hub_local_subfolder(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1165,11 +1168,11 @@ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
 
         return model
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_auto(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         unet = self.get_unet_model()
         unet.set_attention_slice("auto")
@@ -1181,15 +1184,15 @@ def test_set_attention_slice_auto(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes < 5 * 10**9
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_max(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         unet = self.get_unet_model()
         unet.set_attention_slice("max")
@@ -1201,15 +1204,15 @@ def test_set_attention_slice_max(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-
+        mem_bytes = backend_max_memory_allocated(torch_device)
+
         assert mem_bytes < 5 * 10**9
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_int(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         unet = self.get_unet_model()
         unet.set_attention_slice(2)
@@ -1221,15 +1224,15 @@ def test_set_attention_slice_int(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes < 5 * 10**9
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_list(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         # there are 32 sliceable layers
         slice_list = 16 * [2, 3]
@@ -1243,7 +1246,7 @@ def test_set_attention_slice_list(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes < 5 * 10**9
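
Every attention-slicing test above now follows the same device-agnostic recipe: clear the cache, reset both memory counters, run inference, then read the peak. A condensed sketch of that pattern (measure_peak_memory is a hypothetical helper, not part of the test file):

    from diffusers.utils.testing_utils import (
        backend_empty_cache,
        backend_max_memory_allocated,
        backend_reset_max_memory_allocated,
        backend_reset_peak_memory_stats,
        torch_device,
    )

    def measure_peak_memory(run_inference):
        # Start from a clean slate so the reading reflects only this run.
        backend_empty_cache(torch_device)
        backend_reset_max_memory_allocated(torch_device)
        backend_reset_peak_memory_stats(torch_device)
        run_inference()
        # Peak bytes allocated on whichever accelerator torch_device names.
        return backend_max_memory_allocated(torch_device)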

tests/pipelines/controlnet/test_controlnet_sdxl.py

Lines changed: 2 additions & 2 deletions
@@ -222,12 +222,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
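
Passing the device explicitly matters here: enable_model_cpu_offload and enable_sequential_cpu_offload default to CUDA, so on an XPU-only machine the offload hooks would otherwise target a device that does not exist. Usage:

    # torch_device resolves to "cuda", "xpu", etc. depending on the host.
    sd_pipe.enable_model_cpu_offload(device=torch_device)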

tests/pipelines/flux/test_pipeline_flux.py

Lines changed: 4 additions & 3 deletions
@@ -9,6 +9,7 @@
 
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     slow,
@@ -219,12 +220,12 @@ class FluxPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -254,7 +255,7 @@ def test_flux_inference(self):
         pipe = self.pipeline_class.from_pretrained(
             self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device)
 
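
For context, torch_device is the device string that diffusers.utils.testing_utils exports for the current host. A rough sketch of how such a value could be resolved (an assumption for illustration; the library's actual detection logic may differ):

    import torch

    # Prefer CUDA, then Intel XPU, then Apple MPS, else CPU.
    if torch.cuda.is_available():
        torch_device = "cuda"
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch_device = "xpu"
    elif torch.backends.mps.is_available():
        torch_device = "mps"
    else:
        torch_device = "cpu"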

tests/pipelines/pag/test_pag_sdxl_img2img.py

Lines changed: 7 additions & 6 deletions
@@ -39,10 +39,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -267,19 +268,19 @@ def test_pag_inference(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
 
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = (
@@ -303,7 +304,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0)
 
     def test_pag_cfg(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -320,7 +321,7 @@
 
     def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
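
The class-level swap from require_torch_gpu to require_torch_accelerator is what lets this integration suite run on XPU at all. A hypothetical sketch of such a decorator (the real diffusers implementation may key off torch_device and support additional backends):

    import unittest

    import torch

    def require_torch_accelerator(test_case):
        # Skip unless some supported accelerator is present, not just CUDA.
        has_accelerator = torch.cuda.is_available() or (
            hasattr(torch, "xpu") and torch.xpu.is_available()
        )
        return unittest.skipUnless(has_accelerator, "test requires an accelerator")(test_case)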

tests/pipelines/pag/test_pag_sdxl_inpaint.py

Lines changed: 7 additions & 6 deletions
@@ -40,10 +40,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -272,19 +273,19 @@ def test_pag_inference(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase):
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
 
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -309,7 +310,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0)
 
     def test_pag_cfg(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -326,7 +327,7 @@
 
     def test_pag_uncond(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)

tests/pipelines/stable_diffusion/test_stable_diffusion.py

Lines changed: 1 addition & 1 deletion
@@ -988,7 +988,7 @@ def test_stable_diffusion_attention_slicing(self):
         assert max_diff < 1e-3
 
     def test_stable_diffusion_vae_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)

0 commit comments
