enable big gpu cases

faaany · faaany · commit a7cd73d0a196 · 2025-03-14T02:24:41.000-07:00
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
@@ -322,15 +322,17 @@ def require_torch_multi_gpu(test_case):
 
 def require_torch_multi_accelerator(test_case):
     """
-    Decorator marking a test that requires a multi-accelerator setup (in PyTorch). These tests are skipped on a machine without
-    multiple hardware accelerators.
+    Decorator marking a test that requires a multi-accelerator setup (in PyTorch). These tests are skipped on a machine
+    without multiple hardware accelerators.
     """
     if not is_torch_available():
         return unittest.skip("test requires PyTorch")(test_case)
 
     import torch
 
-    return unittest.skipUnless(torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators")(test_case)
+    return unittest.skipUnless(
+        torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators"
+    )(test_case)
 
 
 def require_torch_accelerator_with_fp16(test_case):
@@ -367,6 +369,31 @@ def require_big_gpu_with_torch_cuda(test_case):
     )(test_case)
 
 
+def require_big_accelerator(test_case):
+    """
+    Decorator marking a test that requires a bigger hardware accelerator (24GB) for execution. Some example pipelines:
+    Flux, SD3, Cog, etc.
+    """
+    if not is_torch_available():
+        return unittest.skip("test requires PyTorch")(test_case)
+
+    import torch
+
+    if not (torch.cuda.is_available() or torch.xpu.is_available()):
+        return unittest.skip("test requires PyTorch CUDA")(test_case)
+
+    if torch.xpu.is_available():
+        device_properties = torch.xpu.get_device_properties(0)
+    else:
+        device_properties = torch.cuda.get_device_properties(0)
+
+    total_memory = device_properties.total_memory / (1024**3)
+    return unittest.skipUnless(
+        total_memory >= BIG_GPU_MEMORY,
+        f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
+    )(test_case)
+
+
 def require_torch_accelerator_with_training(test_case):
     """Decorator marking a test that requires an accelerator with support for training."""
     return unittest.skipUnless(
diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -31,9 +31,10 @@
 from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -219,20 +220,20 @@ def test_xformers_attention_forwardGenerator_pass(self):
 
 
 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3ControlNetPipeline
 
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_canny(self):
         controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
@@ -272,7 +273,7 @@ def test_pose(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -304,7 +305,7 @@ def test_tile(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -338,7 +339,7 @@ def test_multi_controlnet(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         generator = torch.Generator(device="cpu").manual_seed(0)
diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py
@@ -12,7 +12,7 @@
     backend_empty_cache,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -204,7 +204,7 @@ def test_flux_true_cfg(self):
 
 
 @nightly
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxPipeline
@@ -292,7 +292,7 @@ def test_flux_inference(self):
 
 
 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxPipeline
@@ -304,12 +304,12 @@ class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
diff --git a/tests/pipelines/flux/test_pipeline_flux_redux.py b/tests/pipelines/flux/test_pipeline_flux_redux.py
@@ -8,15 +8,16 @@
 from diffusers import FluxPipeline, FluxPriorReduxPipeline
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
 
 
 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxReduxSlowTests(unittest.TestCase):
     pipeline_class = FluxPriorReduxPipeline
@@ -27,12 +28,12 @@ class FluxReduxSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         init_image = load_image(
@@ -59,7 +60,7 @@ def test_flux_redux_inference(self):
             self.base_repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
         )
         pipe_redux.to(torch_device)
-        pipe_base.enable_model_cpu_offload()
+        pipe_base.enable_model_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device)
         base_pipeline_inputs = self.get_base_pipeline_inputs(torch_device)
diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py
@@ -10,7 +10,7 @@
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -232,7 +232,7 @@ def test_skip_guidance_layers(self):
 
 
 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3PipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Pipeline
diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py
@@ -18,7 +18,7 @@
     backend_empty_cache,
     floats_tensor,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -166,7 +166,7 @@ def test_multi_vae(self):
 
 
 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
@@ -202,11 +202,10 @@ def get_inputs(self, device, seed=0):
         }
 
     def test_sd3_img2img_inference(self):
+        torch.manual_seed(0)
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
         pipe.enable_model_cpu_offload(device=torch_device)
-
         inputs = self.get_inputs(torch_device)
-
         image = pipe(**inputs).images[0]
         image_slice = image[0, :10, :10]
         expected_slice = np.array(