src/diffusers/loaders/textual_inversion.py (4 changes: 2 additions & 2 deletions)

@@ -449,9 +449,9 @@ def load_textual_inversion(

         # 7.5 Offload the model again
         if is_model_cpu_offload:
-            self.enable_model_cpu_offload()
+            self.enable_model_cpu_offload(device=device)
         elif is_sequential_cpu_offload:
-            self.enable_sequential_cpu_offload()
+            self.enable_sequential_cpu_offload(device=device)

         # / Unsafe Code >
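The re-enable path above now honors the caller's accelerator instead of the previous hard-coded CUDA default. A minimal usage sketch of the behavior this enables; the checkpoint, the concept repo, and the "xpu" device string are illustrative assumptions, not taken from this diff:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# Offload onto an Intel GPU; use "cuda" on NVIDIA hardware.
pipe.enable_model_cpu_offload(device="xpu")

# load_textual_inversion() temporarily removes the offload hooks, then
# restores them with enable_model_cpu_offload(device=...) on the same
# accelerator, rather than silently falling back to "cuda".
pipe.load_textual_inversion("sd-concepts-library/cat-toy")
image = pipe("a <cat-toy> sitting on a bench", num_inference_steps=20).images[0]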
src/diffusers/utils/testing_utils.py (40 changes: 40 additions & 0 deletions)

@@ -320,6 +320,21 @@ def require_torch_multi_gpu(test_case):
     return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)


+def require_torch_multi_accelerator(test_case):
+    """
+    Decorator marking a test that requires a multi-accelerator setup (in PyTorch). These tests are skipped on a
+    machine without multiple hardware accelerators.
+    """
+    if not is_torch_available():
+        return unittest.skip("test requires PyTorch")(test_case)
+
+    import torch
+
+    return unittest.skipUnless(
+        torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators"
+    )(test_case)
+
+
 def require_torch_accelerator_with_fp16(test_case):
     """Decorator marking a test that requires an accelerator with support for the FP16 data type."""
     return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")(
@@ -354,6 +369,31 @@ def require_big_gpu_with_torch_cuda(test_case):
     )(test_case)


+def require_big_accelerator(test_case):
+    """
+    Decorator marking a test that requires a big hardware accelerator (at least 24GB of memory) for execution. Some
+    example pipelines: Flux, SD3, Cog, etc.
+    """
+    if not is_torch_available():
+        return unittest.skip("test requires PyTorch")(test_case)
+
+    import torch
+
+    if not (torch.cuda.is_available() or torch.xpu.is_available()):
+        return unittest.skip("test requires a CUDA or XPU hardware accelerator")(test_case)
+
+    if torch.xpu.is_available():
+        device_properties = torch.xpu.get_device_properties(0)
+    else:
+        device_properties = torch.cuda.get_device_properties(0)
+
+    total_memory = device_properties.total_memory / (1024**3)
+    return unittest.skipUnless(
+        total_memory >= BIG_GPU_MEMORY,
+        f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
+    )(test_case)
+
+
 def require_torch_accelerator_with_training(test_case):
     """Decorator marking a test that requires an accelerator with support for training."""
     return unittest.skipUnless(
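A sketch of how the two new decorators would be used in a test module, assuming a PyTorch build that exposes torch.xpu; the test bodies are placeholders, not tests from this PR:

import unittest

import torch

from diffusers.utils.testing_utils import (
    require_big_accelerator,
    require_torch_multi_accelerator,
    torch_device,
)


class ExampleDeviceAgnosticTests(unittest.TestCase):
    @require_torch_multi_accelerator
    def test_needs_two_accelerators(self):
        # Skipped unless torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1.
        self.assertGreater(max(torch.cuda.device_count(), torch.xpu.device_count()), 1)

    @require_big_accelerator
    def test_needs_a_24gb_device(self):
        # Skipped unless the first CUDA or XPU device has at least BIG_GPU_MEMORY GB.
        tensor = torch.ones(1024, device=torch_device)
        self.assertEqual(tensor.sum().item(), 1024)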
@@ -124,7 +124,7 @@ def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x
         return model

     def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             return torch.Generator(device=generator_device).manual_seed(seed)
         return torch.manual_seed(seed)
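The helper now simply seeds a generator on the active test device (the "mps" branch below it still falls back to the default CPU generator). For reference, a short sketch of what this resolves to at runtime, assuming a PyTorch build that exposes torch.xpu:

import torch

# Illustrative only: pick whichever accelerator is present, mirroring how the
# test suite's torch_device is chosen.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
else:
    device = "cpu"

generator = torch.Generator(device=device).manual_seed(0)
noise = torch.randn(1, 4, 32, 32, generator=generator, device=device)
print(noise.device)  # e.g. cuda:0, xpu:0, or cpu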
tests/models/autoencoders/test_models_autoencoder_kl.py (4 changes: 2 additions & 2 deletions)

@@ -165,7 +165,7 @@ def test_output_pretrained(self):
         model.eval()

         # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             generator = torch.Generator(device=generator_device).manual_seed(0)
         else:
@@ -263,7 +263,7 @@ def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False)
         return model

     def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             return torch.Generator(device=generator_device).manual_seed(seed)
         return torch.manual_seed(seed)
@@ -183,7 +183,7 @@ def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp
         return model

     def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
+        generator_device = torch_device
         if torch_device != "mps":
             return torch.Generator(device=generator_device).manual_seed(seed)
         return torch.manual_seed(seed)
tests/models/test_modeling_common.py (4 changes: 2 additions & 2 deletions)

@@ -63,7 +63,7 @@
     require_torch_accelerator,
     require_torch_accelerator_with_training,
     require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
     run_test_in_subprocess,
     torch_all_close,
     torch_device,
@@ -1227,7 +1227,7 @@ def test_disk_offload_with_safetensors(self):

         self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))

-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_model_parallelism(self):
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
tests/pipelines/controlnet_sd3/test_controlnet_sd3.py (15 changes: 8 additions & 7 deletions)

@@ -31,9 +31,10 @@
 from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -219,20 +220,20 @@ def test_xformers_attention_forwardGenerator_pass(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3ControlNetPipeline

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_canny(self):
         controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
@@ -272,7 +273,7 @@ def test_pose(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -304,7 +305,7 @@ def test_tile(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -338,7 +339,7 @@ def test_multi_controlnet(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
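The setUp/tearDown changes above swap the hard-coded torch.cuda.empty_cache() for backend_empty_cache(torch_device), which flushes whichever backend the test device names. A rough, simplified sketch of the idea; the real helper in diffusers.utils.testing_utils dispatches through a per-backend table, so this is an approximation, not the library code:

import torch


def empty_cache_for(device: str) -> None:
    """Simplified stand-in for backend_empty_cache: flush the allocator cache
    of whichever accelerator backend the device string names."""
    device_type = str(device).split(":")[0]  # "cuda:0" -> "cuda"
    if device_type == "cuda":
        torch.cuda.empty_cache()
    elif device_type == "xpu":
        torch.xpu.empty_cache()
    elif device_type == "mps":
        torch.mps.empty_cache()
    # "cpu" has no allocator cache to flush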
tests/pipelines/flux/test_pipeline_flux.py (10 changes: 5 additions & 5 deletions)

@@ -12,7 +12,7 @@
     backend_empty_cache,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -204,7 +204,7 @@ def test_flux_true_cfg(self):


 @nightly
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxPipeline
@@ -292,7 +292,7 @@ def test_flux_inference(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxPipeline
@@ -304,12 +304,12 @@ class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
tests/pipelines/flux/test_pipeline_flux_redux.py (11 changes: 6 additions & 5 deletions)

@@ -8,15 +8,16 @@
 from diffusers import FluxPipeline, FluxPriorReduxPipeline
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxReduxSlowTests(unittest.TestCase):
     pipeline_class = FluxPriorReduxPipeline
@@ -27,12 +28,12 @@ class FluxReduxSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         init_image = load_image(
@@ -59,7 +60,7 @@ def test_flux_redux_inference(self):
             self.base_repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
         )
         pipe_redux.to(torch_device)
-        pipe_base.enable_model_cpu_offload()
+        pipe_base.enable_model_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device)
         base_pipeline_inputs = self.get_base_pipeline_inputs(torch_device)
tests/pipelines/pag/test_pag_sd3_img2img.py (2 changes: 1 addition & 1 deletion)

@@ -262,7 +262,7 @@ def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(
             self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.(4|17)"]
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0, pag_scale=1.8)
tests/pipelines/stable_diffusion/test_stable_diffusion.py (10 changes: 5 additions & 5 deletions)

@@ -57,7 +57,7 @@
     require_accelerate_version_greater,
     require_torch_2,
     require_torch_accelerator,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
     run_test_in_subprocess,
     skip_mps,
     slow,
@@ -1409,7 +1409,7 @@ def test_stable_diffusion_euler(self):

 # (sayakpaul): This test suite was run in the DGX with two GPUs (1, 2).
 @slow
-@require_torch_multi_gpu
+@require_torch_multi_accelerator
 @require_accelerate_version_greater("0.27.0")
 class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
     def tearDown(self):
@@ -1497,7 +1497,7 @@ def test_reset_device_map_to(self):
         assert sd_pipe_with_device_map.hf_device_map is None

         # Make sure `to()` can be used and the pipeline can be called.
-        pipe = sd_pipe_with_device_map.to("cuda")
+        pipe = sd_pipe_with_device_map.to(torch_device)
         _ = pipe("hello", num_inference_steps=2)

     def test_reset_device_map_enable_model_cpu_offload(self):
@@ -1509,7 +1509,7 @@ def test_reset_device_map_enable_model_cpu_offload(self):
         assert sd_pipe_with_device_map.hf_device_map is None

         # Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
-        sd_pipe_with_device_map.enable_model_cpu_offload()
+        sd_pipe_with_device_map.enable_model_cpu_offload(device=torch_device)
         _ = sd_pipe_with_device_map("hello", num_inference_steps=2)

     def test_reset_device_map_enable_sequential_cpu_offload(self):
@@ -1521,5 +1521,5 @@ def test_reset_device_map_enable_sequential_cpu_offload(self):
         assert sd_pipe_with_device_map.hf_device_map is None

         # Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
-        sd_pipe_with_device_map.enable_sequential_cpu_offload()
+        sd_pipe_with_device_map.enable_sequential_cpu_offload(device=torch_device)
         _ = sd_pipe_with_device_map("hello", num_inference_steps=2)
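The three tests above all exercise the same flow: a sharded pipeline has its device map reset, after which plain placement calls become legal again on whatever accelerator is active. A condensed sketch of that flow; the checkpoint id, prompt, and step count are illustrative:

import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import torch_device

# Shard the pipeline across available accelerators.
pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    device_map="balanced",
    torch_dtype=torch.float16,
)

pipe.reset_device_map()       # hf_device_map becomes None
pipe = pipe.to(torch_device)  # .to(), or either offload helper, now works again
_ = pipe("hello", num_inference_steps=2)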
@@ -10,7 +10,7 @@
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -232,7 +232,7 @@ def test_skip_guidance_layers(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3PipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Pipeline
@@ -18,7 +18,7 @@
     backend_empty_cache,
     floats_tensor,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     slow,
     torch_device,
 )
@@ -166,7 +166,7 @@ def test_multi_vae(self):


 @slow
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @pytest.mark.big_gpu_with_torch_cuda
 class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
@@ -202,11 +202,10 @@ def get_inputs(self, device, seed=0):
         }

     def test_sd3_img2img_inference(self):
-        torch.manual_seed(0)
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device)
-
         image = pipe(**inputs).images[0]
         image_slice = image[0, :10, :10]
         expected_slice = np.array(