diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py
index 860aa6511689..4a74db95f528 100644
--- a/tests/lora/test_lora_layers_flux.py
+++ b/tests/lora/test_lora_layers_flux.py
@@ -31,13 +31,14 @@
 from diffusers.utils import load_image, logging
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     floats_tensor,
     is_peft_available,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -809,10 +810,10 @@ def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self):
 
 
 @slow
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class FluxLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on audace.
 
@@ -827,7 +828,7 @@ def setUp(self):
         super().setUp()
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         self.pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
 
@@ -836,13 +837,13 @@ def tearDown(self):
         del self.pipeline
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_flux_the_last_ben(self):
         self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI
+        # Instead of calling `enable_model_cpu_offload()`, we do an accelerator placement here because the CI
         # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with
         # `enable_model_cpu_offload()`. We repeat this for the other tests, too.
         self.pipeline = self.pipeline.to(torch_device)
 
@@ -956,10 +957,10 @@ def test_flux_xlabs_load_lora_with_single_blocks(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class FluxControlLoRAIntegrationTests(unittest.TestCase):
     num_inference_steps = 10
     seed = 0
 
@@ -969,17 +970,17 @@ def setUp(self):
         super().setUp()
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         self.pipeline = FluxControlPipeline.from_pretrained(
             "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
-        ).to("cuda")
+        ).to(torch_device)
 
     def tearDown(self):
         super().tearDown()
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @parameterized.expand(["black-forest-labs/FLUX.1-Canny-dev-lora", "black-forest-labs/FLUX.1-Depth-dev-lora"])
     def test_lora(self, lora_ckpt_id):
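The substitution that repeats throughout this file, torch.cuda.empty_cache() -> backend_empty_cache(torch_device), is what makes the fixtures device-agnostic. The real helper is imported from diffusers.utils.testing_utils; the following is only a minimal sketch of the dispatch it is expected to perform, assuming recent torch builds that expose per-backend empty_cache hooks:

    import torch

    def backend_empty_cache(device: str) -> None:
        # Illustrative re-implementation only; the tests import the real
        # helper from diffusers.utils.testing_utils. Dispatching on the
        # device string lets one call serve CUDA, XPU, and MPS runners.
        device_type = device.split(":")[0]
        if device_type == "cuda":
            torch.cuda.empty_cache()
        elif device_type == "xpu" and hasattr(torch, "xpu"):
            torch.xpu.empty_cache()
        elif device_type == "mps" and hasattr(torch, "mps"):
            torch.mps.empty_cache()
        # "cpu" falls through: there is no allocator cache to release.

Paired with gc.collect() in setUp/tearDown, this keeps integration runs from accumulating allocator state across tests regardless of the backend.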
diff --git a/tests/lora/test_lora_layers_hunyuanvideo.py b/tests/lora/test_lora_layers_hunyuanvideo.py
index d2015d8b0711..87c3100b59c6 100644
--- a/tests/lora/test_lora_layers_hunyuanvideo.py
+++ b/tests/lora/test_lora_layers_hunyuanvideo.py
@@ -28,13 +28,16 @@
     HunyuanVideoTransformer3DModel,
 )
 from diffusers.utils.testing_utils import (
+    Expectations,
+    backend_empty_cache,
     floats_tensor,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
+    torch_device,
 )
 
 
@@ -192,10 +195,10 @@ def test_simple_inference_with_text_lora_save_load(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on DGX.
 
@@ -210,7 +213,7 @@ def setUp(self):
         super().setUp()
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         model_id = "hunyuanvideo-community/HunyuanVideo"
         transformer = HunyuanVideoTransformer3DModel.from_pretrained(
@@ -218,13 +221,13 @@ def setUp(self):
         )
         self.pipeline = HunyuanVideoPipeline.from_pretrained(
             model_id, transformer=transformer, torch_dtype=torch.float16
-        ).to("cuda")
+        ).to(torch_device)
 
     def tearDown(self):
         super().tearDown()
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_original_format_cseti(self):
         self.pipeline.load_lora_weights(
@@ -249,8 +252,13 @@ def test_original_format_cseti(self):
         out_slice = np.concatenate((out[:8], out[-8:]))
 
         # fmt: off
-        expected_slice = np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815])
+        expected_slices = Expectations(
+            {
+                ("cuda", 7): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
+            }
+        )
         # fmt: on
+        expected_slice = expected_slices.get_expectation()
 
         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice)
 
diff --git a/tests/lora/test_lora_layers_sd.py b/tests/lora/test_lora_layers_sd.py
index e50f5316da60..5295535b3cfa 100644
--- a/tests/lora/test_lora_layers_sd.py
+++ b/tests/lora/test_lora_layers_sd.py
@@ -93,12 +93,12 @@ def output_shape(self):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     # Keeping this test here makes sense because it doesn't look any integration
     # (value assertions on logits).
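The Expectations container introduced in the hunyuanvideo hunk stores one reference slice per device, so a new accelerator can register its own numbers without overwriting the CUDA baseline. Below is a sketch of the lookup idea, with the ("cuda", 7) key copied from the hunk above; the (device type, compute-capability major) key layout and the fallback behavior are assumptions about the real class in diffusers.utils.testing_utils:

    import numpy as np
    import torch

    class Expectations:
        # Hypothetical stand-in for the imported class; only the
        # per-device lookup idea is illustrated here.
        def __init__(self, data):
            self.data = data  # keys assumed to be (device_type, cc_major)

        def get_expectation(self):
            key = None
            if torch.cuda.is_available():
                key = ("cuda", torch.cuda.get_device_capability()[0])
            # Fall back to the first registered slice for unlisted devices.
            return self.data.get(key, next(iter(self.data.values())))

    expected_slices = Expectations(
        {("cuda", 7): np.array([0.1013, 0.1924, 0.0078])}  # truncated slice from the hunk above
    )
    expected_slice = expected_slices.get_expectation()

Under this sketch, hardware without a registered entry (say, an XPU runner) would take the fallback slice, which is why per-device entries get added as new backends are validated.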
diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py
index 90aaa3bcfe78..06668c9497bf 100644
--- a/tests/lora/test_lora_layers_sd3.py
+++ b/tests/lora/test_lora_layers_sd3.py
@@ -34,7 +34,7 @@
     is_flaky,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_peft_backend,
     require_torch_accelerator,
     torch_device,
@@ -138,8 +138,8 @@ def test_multiple_wrong_adapter_name_raises_error(self):
 @nightly
 @require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class SD3LoraIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py
index 76d6dc48602b..0a31f214a38c 100644
--- a/tests/lora/test_lora_layers_sdxl.py
+++ b/tests/lora/test_lora_layers_sdxl.py
@@ -37,12 +37,13 @@
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     is_flaky,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -105,12 +106,12 @@ def output_shape(self):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @is_flaky
     def test_multiple_wrong_adapter_name_raises_error(self):
@@ -119,18 +120,18 @@ def test_multiple_wrong_adapter_name_raises_error(self):
 
 @slow
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
 class LoraSDXLIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_sdxl_1_0_lora(self):
         generator = torch.Generator("cpu").manual_seed(0)
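The decorator swaps applied across all five files follow one rule: require_torch_gpu -> require_torch_accelerator and require_big_gpu_with_torch_cuda -> require_big_accelerator, widening the skip condition from "CUDA present" to "any supported accelerator present". A minimal sketch of that gating, assuming a module-level torch_device string like the one the suite imports; the real decorators live in diffusers.utils.testing_utils, and the big variant presumably also gates on device memory:

    import unittest

    import torch

    # Assumed resolution order for torch_device; illustration only.
    if torch.cuda.is_available():
        torch_device = "cuda"
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch_device = "xpu"
    elif torch.backends.mps.is_available():
        torch_device = "mps"
    else:
        torch_device = "cpu"

    def require_torch_accelerator(test_case):
        # Skip unless *some* accelerator backs torch_device, rather than
        # requiring CUDA specifically as require_torch_gpu did.
        return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)

The matching pytest.mark rename (big_gpu_with_torch_cuda -> big_accelerator) keeps marker-based selection such as `-m big_accelerator` working across backends, so CI can slice these integration tests without hardcoding CUDA.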