diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 79b4f785f30c..f160f50fe791 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -237,3 +237,5 @@ with torch.no_grad(): ``` By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs. + +This workflow is also compatible with LoRAs via [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. However, only LoRAs without text encoder components are currently supported in this workflow. diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py index 286d0a12bc71..4c1beb399212 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -327,12 +327,18 @@ def _optionally_disable_offloading(cls, _pipeline): tuple: A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True. """ + from ..pipelines.pipeline_loading_utils import model_has_device_map + is_model_cpu_offload = False is_sequential_cpu_offload = False if _pipeline is not None and _pipeline.hf_device_map is None: for _, component in _pipeline.components.items(): - if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): + if ( + isinstance(component, nn.Module) + and hasattr(component, "_hf_hook") + and not model_has_device_map(component) + ): if not is_model_cpu_offload: is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) if not is_sequential_cpu_offload: diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 7050968b6de5..27c797a598b8 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -400,12 +400,18 @@ def _optionally_disable_offloading(cls, _pipeline): tuple: A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True. """ + from ..pipelines.pipeline_loading_utils import model_has_device_map + is_model_cpu_offload = False is_sequential_cpu_offload = False if _pipeline is not None and _pipeline.hf_device_map is None: for _, component in _pipeline.components.items(): - if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): + if ( + isinstance(component, nn.Module) + and hasattr(component, "_hf_hook") + and not model_has_device_map(component) + ): if not is_model_cpu_offload: is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) if not is_sequential_cpu_offload: diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 0a7a222ec007..296793b92fbe 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -36,6 +36,7 @@ deprecate, get_class_from_dynamic_module, is_accelerate_available, + is_accelerate_version, is_peft_available, is_transformers_available, logging, @@ -968,3 +969,18 @@ def _get_ignore_patterns( ) return ignore_patterns + + +def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + + # Check if the model has a device map that is not exclusively CPU + # `device_map` can only contain CPU when a model has sharded checkpoints. 
+ # See here: https://github.com/huggingface/diffusers/blob/41e4779d988ead99e7acd78dc8e752de88777d0f/src/diffusers/models/modeling_utils.py#L883 + device_map = getattr(model, "hf_device_map", None) + if device_map is not None: + unique_devices = set(device_map.values()) + return len(unique_devices) > 1 or unique_devices != {"cpu"} + + return False diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index a504184ea2f2..964ce547adae 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -84,6 +84,7 @@ _update_init_kwargs_with_connected_pipeline, load_sub_model, maybe_raise_or_warn, + model_has_device_map, variant_compatible_siblings, warn_deprecated_model_variant, ) @@ -406,6 +407,16 @@ def module_is_offloaded(module): return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload) + # device-mapped modules should not go through any device placements. + device_mapped_components = [ + key for key, component in self.components.items() if model_has_device_map(component) + ] + if device_mapped_components: + raise ValueError( + "The following pipeline components have been found to use a device map: " + f"{device_mapped_components}. This is incompatible with explicitly setting the device using `to()`." + ) + # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer pipeline_is_sequentially_offloaded = any( module_is_sequentially_offloaded(module) for _, module in self.components.items() @@ -1008,6 +1019,16 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will default to "cuda". """ + # device-mapped modules should not go through any device placements. + device_mapped_components = [ + key for key, component in self.components.items() if model_has_device_map(component) + ] + if device_mapped_components: + raise ValueError( + "The following pipeline components have been found to use a device map: " + f"{device_mapped_components}. This is incompatible with `enable_model_cpu_offload()`." + ) + is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1 if is_pipeline_device_mapped: raise ValueError( @@ -1110,6 +1131,16 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will default to "cuda". """ + # device-mapped modules should not go through any device placements. + device_mapped_components = [ + key for key, component in self.components.items() if model_has_device_map(component) + ] + if device_mapped_components: + raise ValueError( + "The following pipeline components have been found to use a device map: " + f"{device_mapped_components}. This is incompatible with `enable_sequential_cpu_offload()`." 
+ ) + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): from accelerate import cpu_offload else: diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index fb550dd3219d..9af49697f913 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -506,9 +506,14 @@ def test_to_dtype(self): model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) + @unittest.skip("Test currently not supported.") def test_sequential_cpu_offload_forward_pass(self): pass + @unittest.skip("Test currently not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + @nightly class AudioLDM2PipelineSlowTests(unittest.TestCase): diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index b12655d989d4..1cb6569716a8 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -514,6 +514,18 @@ def test_inference_multiple_prompt_input(self): assert image.shape == (4, 64, 64, 3) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class StableDiffusionMultiControlNetOneModelPipelineFastTests( IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase @@ -697,6 +709,18 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 7c4ae716b37d..45bc70c809f2 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -389,6 +389,18 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index e49106334c2e..af8ddb7e6b28 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -441,6 +441,18 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + 
@unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index ea7fff5537a5..d6e18598cc6f 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -683,6 +683,18 @@ def test_inference_batch_single_identical(self): def test_save_load_optional_components(self): return self._test_save_load_optional_components() + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class StableDiffusionXLMultiControlNetOneModelPipelineFastTests( PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase @@ -887,6 +899,18 @@ def test_negative_conditions(self): self.assertTrue(np.abs(image_slice_without_neg_cond - image_slice_with_neg_cond).max() > 1e-2) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index df9021ee0adb..36c4ab0f3289 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -8,9 +8,11 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel +from diffusers.image_processor import VaeImageProcessor from diffusers.utils.testing_utils import ( numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, + require_torch_multi_gpu, slow, torch_device, ) @@ -296,3 +298,172 @@ def test_flux_inference(self): max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten()) assert max_diff < 1e-4 + + @require_torch_multi_gpu + @torch.no_grad() + def test_flux_component_sharding(self): + """ + internal note: test was run on `audace`. 
+ """ + + ckpt_id = "black-forest-labs/FLUX.1-dev" + dtype = torch.bfloat16 + prompt = "a photo of a cat with tiger-like look" + + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + transformer=None, + vae=None, + device_map="balanced", + max_memory={0: "16GB", 1: "16GB"}, + torch_dtype=dtype, + ) + prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt( + prompt=prompt, prompt_2=None, max_sequence_length=512 + ) + + del pipeline.text_encoder + del pipeline.text_encoder_2 + del pipeline.tokenizer + del pipeline.tokenizer_2 + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + transformer = FluxTransformer2DModel.from_pretrained( + ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype + ) + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + text_encoder=None, + text_encoder_2=None, + tokenizer=None, + tokenizer_2=None, + vae=None, + transformer=transformer, + torch_dtype=dtype, + ) + + height, width = 768, 1360 + # No need to wrap it up under `torch.no_grad()` as pipeline call method + # is already wrapped under that. + latents = pipeline( + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_inference_steps=10, + guidance_scale=3.5, + height=height, + width=width, + output_type="latent", + generator=torch.manual_seed(0), + ).images + latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy() + expected_slice = np.array([-0.377, -0.3008, -0.5117, -0.252, 0.0615, -0.3477, -0.1309, -0.1914, 0.1533]) + + assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4 + + del pipeline.transformer + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device) + vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + + latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor) + latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor + + image = vae.decode(latents, return_dict=False)[0] + image = image_processor.postprocess(image, output_type="np") + image_slice = image[0, :3, :3, -1].flatten() + expected_slice = np.array([0.127, 0.1113, 0.1055, 0.1172, 0.1172, 0.1074, 0.1191, 0.1191, 0.1152]) + + assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4 + + @require_torch_multi_gpu + @torch.no_grad() + def test_flux_component_sharding_with_lora(self): + """ + internal note: test was run on `audace`. + """ + + ckpt_id = "black-forest-labs/FLUX.1-dev" + dtype = torch.bfloat16 + prompt = "jon snow eating pizza." 
+ + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + transformer=None, + vae=None, + device_map="balanced", + max_memory={0: "16GB", 1: "16GB"}, + torch_dtype=dtype, + ) + prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt( + prompt=prompt, prompt_2=None, max_sequence_length=512 + ) + + del pipeline.text_encoder + del pipeline.text_encoder_2 + del pipeline.tokenizer + del pipeline.tokenizer_2 + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + transformer = FluxTransformer2DModel.from_pretrained( + ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype + ) + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + text_encoder=None, + text_encoder_2=None, + tokenizer=None, + tokenizer_2=None, + vae=None, + transformer=transformer, + torch_dtype=dtype, + ) + pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors") + + height, width = 768, 1360 + # No need to wrap it up under `torch.no_grad()` as pipeline call method + # is already wrapped under that. + latents = pipeline( + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_inference_steps=10, + guidance_scale=3.5, + height=height, + width=width, + output_type="latent", + generator=torch.manual_seed(0), + ).images + latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy() + expected_slice = np.array([-0.6523, -0.4961, -0.9141, -0.5, -0.2129, -0.6914, -0.375, -0.5664, -0.1699]) + + assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4 + + del pipeline.transformer + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device) + vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + + latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor) + latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor + + image = vae.decode(latents, return_dict=False)[0] + image = image_processor.postprocess(image, output_type="np") + image_slice = image[0, :3, :3, -1].flatten() + expected_slice = np.array([0.1211, 0.1094, 0.1035, 0.1094, 0.1113, 0.1074, 0.1133, 0.1133, 0.1094]) + + assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4 diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index 607a47e08e58..739f8676cbd3 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -139,6 +139,18 @@ def test_float16_inference(self): def test_dict_tuple_outputs_equivalent(self): super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyImg2ImgCombinedPipeline @@ -248,6 +260,18 @@ def test_dict_tuple_outputs_equivalent(self): def test_save_load_optional_components(self): super().test_save_load_optional_components(expected_max_difference=5e-4) + 
@unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyInpaintCombinedPipeline @@ -363,3 +387,15 @@ def test_save_load_optional_components(self): def test_save_load_local(self): super().test_save_load_local(expected_max_difference=5e-3) + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 5f42447bd9d5..7545ec5bb5d3 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import tempfile import unittest import numpy as np @@ -28,11 +30,16 @@ ) from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME +from diffusers.utils.testing_utils import enable_full_determinism, is_accelerate_available, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + enable_full_determinism() @@ -236,3 +243,31 @@ def test_attention_slicing_forward_pass(self): test_max_difference=test_max_difference, test_mean_pixel_difference=test_mean_pixel_difference, ) + + # It needs a different sharding ratio than the standard 0.75. So, we override it. + def test_sharded_components_can_be_device_placed(self): + components = self.get_dummy_components() + + component_selected = None + for component_name in components: + if isinstance(components[component_name], ModelMixin) and hasattr( + components[component_name], "load_config" + ): + component_to_be_sharded = components[component_name] + component_cls = component_to_be_sharded.__class__ + component_selected = component_name + break + + assert component_selected, "No component selected that can be sharded." 
+ + model_size = compute_module_sizes(component_to_be_sharded)[""] + max_shard_size = int((model_size * 0.45) / (2**10)) + + with tempfile.TemporaryDirectory() as tmp_dir: + component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB") + self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) + + loaded_sharded_component = component_cls.from_pretrained(tmp_dir) + _ = components.pop(component_selected) + components.update({component_selected: loaded_sharded_component}) + _ = self.pipeline_class(**components).to(torch_device) diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py index dbba0831397b..cf2b70f4c990 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py @@ -159,6 +159,18 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyV22Img2ImgCombinedPipeline @@ -281,6 +293,18 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyV22InpaintCombinedPipeline @@ -404,3 +428,15 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py index be0bc238d4da..b3047e2a248e 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py @@ -14,6 +14,8 @@ # limitations under the License. 
import inspect +import os +import tempfile import unittest import numpy as np @@ -29,11 +31,17 @@ ) from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME +from diffusers.utils.testing_utils import enable_full_determinism, is_accelerate_available, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + + enable_full_determinism() @@ -277,3 +285,31 @@ def callback_inputs_test(pipe, i, t, callback_kwargs): output = pipe(**inputs)[0] assert output.abs().sum() == 0 + + # It needs a different sharding ratio than the standard 0.75. So, we override it. + def test_sharded_components_can_be_device_placed(self): + components = self.get_dummy_components() + + component_selected = None + for component_name in components: + if isinstance(components[component_name], ModelMixin) and hasattr( + components[component_name], "load_config" + ): + component_to_be_sharded = components[component_name] + component_cls = component_to_be_sharded.__class__ + component_selected = component_name + break + + assert component_selected, "No component selected that can be sharded." + + model_size = compute_module_sizes(component_to_be_sharded)[""] + max_shard_size = int((model_size * 0.45) / (2**10)) + + with tempfile.TemporaryDirectory() as tmp_dir: + component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB") + self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) + + loaded_sharded_component = component_cls.from_pretrained(tmp_dir) + _ = components.pop(component_selected) + components.update({component_selected: loaded_sharded_component}) + _ = self.pipeline_class(**components).to(torch_device) diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py index e898824e2d17..760a77b68b75 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py @@ -13,7 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random +import tempfile import unittest import numpy as np @@ -30,9 +32,12 @@ ) from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME from diffusers.utils.testing_utils import ( enable_full_determinism, floats_tensor, + is_accelerate_available, skip_mps, torch_device, ) @@ -40,6 +45,10 @@ from ..test_pipelines_common import PipelineTesterMixin +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + + enable_full_determinism() @@ -240,3 +249,31 @@ def test_attention_slicing_forward_pass(self): test_max_difference=test_max_difference, test_mean_pixel_difference=test_mean_pixel_difference, ) + + # It needs a different sharding ratio than the standard 0.75. So, we override it. 
+ def test_sharded_components_can_be_device_placed(self): + components = self.get_dummy_components() + + component_selected = None + for component_name in components: + if isinstance(components[component_name], ModelMixin) and hasattr( + components[component_name], "load_config" + ): + component_to_be_sharded = components[component_name] + component_cls = component_to_be_sharded.__class__ + component_selected = component_name + break + + assert component_selected, "No component selected that can be sharded." + + model_size = compute_module_sizes(component_to_be_sharded)[""] + max_shard_size = int((model_size * 0.45) / (2**10)) + + with tempfile.TemporaryDirectory() as tmp_dir: + component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB") + self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) + + loaded_sharded_component = component_cls.from_pretrained(tmp_dir) + _ = components.pop(component_selected) + components.update({component_selected: loaded_sharded_component}) + _ = self.pipeline_class(**components).to(torch_device) diff --git a/tests/pipelines/musicldm/test_musicldm.py b/tests/pipelines/musicldm/test_musicldm.py index e51f5103933a..70765d981bbc 100644 --- a/tests/pipelines/musicldm/test_musicldm.py +++ b/tests/pipelines/musicldm/test_musicldm.py @@ -404,6 +404,10 @@ def test_to_dtype(self): model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) + @unittest.skip("Test currently not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + @nightly @require_torch_gpu diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py index d256deed376c..d799ae6e623a 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py @@ -279,3 +279,15 @@ def test_stable_cascade_combined_prompt_embeds(self): ) assert np.abs(output_prompt.images - output_prompt_embeds.images).max() < 1e-5 + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py index 2a1e691e9e8f..996afbb9d323 100644 --- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py @@ -593,6 +593,18 @@ def test_inference_batch_single_identical( if test_mean_pixel_difference: assert_mean_pixel_difference(output_batch[0][0], output[0][0]) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py 
b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 2091af9c0383..61b5b754c44c 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -642,9 +642,6 @@ def test_adapter_sdxl_lcm(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448]) - debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()] - print(",".join(debug)) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 def test_adapter_sdxl_lcm_custom_timesteps(self): @@ -667,7 +664,16 @@ def test_adapter_sdxl_lcm_custom_timesteps(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448]) - debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()] - print(",".join(debug)) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index bb54d212a786..9740d28b0b14 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -1,4 +1,6 @@ import gc +import os +import tempfile import unittest import torch @@ -12,8 +14,17 @@ StableUnCLIPPipeline, UNet2DConditionModel, ) +from diffusers.models.modeling_utils import ModelMixin from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer -from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device +from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME +from diffusers.utils.testing_utils import ( + enable_full_determinism, + is_accelerate_available, + load_numpy, + nightly, + require_torch_gpu, + torch_device, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import ( @@ -24,6 +35,10 @@ ) +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + + enable_full_determinism() @@ -184,6 +199,46 @@ def test_attention_slicing_forward_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=1e-3) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + + # It needs a different sharding ratio than the standard 0.75. So, we override it. 
+ def test_sharded_components_can_be_device_placed(self): + components = self.get_dummy_components() + + component_selected = None + for component_name in components: + if isinstance(components[component_name], ModelMixin) and hasattr( + components[component_name], "load_config" + ): + component_to_be_sharded = components[component_name] + component_cls = component_to_be_sharded.__class__ + component_selected = component_name + break + + assert component_selected, "No component selected that can be sharded." + + model_size = compute_module_sizes(component_to_be_sharded)[""] + max_shard_size = int((model_size * 0.45) / (2**10)) + + with tempfile.TemporaryDirectory() as tmp_dir: + component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB") + self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) + + loaded_sharded_component = component_cls.from_pretrained(tmp_dir) + _ = components.pop(component_selected) + components.update({component_selected: loaded_sharded_component}) + _ = self.pipeline_class(**components).to(torch_device) + @nightly @require_torch_gpu diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index a5cbf7761501..1a662819b00f 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -205,6 +205,18 @@ def test_inference_batch_single_identical(self): def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @nightly @require_torch_gpu diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 4d2b534c9a28..995c1d02c202 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -41,10 +41,14 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( CaptureLogger, + is_accelerate_available, + nightly, require_accelerate_version_greater, require_accelerator, require_torch, + require_torch_multi_gpu, skip_mps, + slow, torch_device, ) @@ -61,6 +65,10 @@ from ..others.test_utils import TOKEN, USER, is_staging_test +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + + def to_np(tensor): if isinstance(tensor, torch.Tensor): tensor = tensor.detach().cpu().numpy() @@ -1902,6 +1910,78 @@ def test_StableDiffusionMixin_component(self): ) ) + @require_torch_multi_gpu + @slow + @nightly + def test_calling_to_raises_error_device_mapped_components(self, safe_serialization=True): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + max_model_size = max( + compute_module_sizes(module)[""] + for _, module in pipe.components.items() + if isinstance(module, torch.nn.Module) + ) + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization) + max_memory = {0: max_model_size, 1: max_model_size} + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) + + 
with self.assertRaises(ValueError) as err_context: + loaded_pipe.to(torch_device) + + self.assertTrue( + "The following pipeline components have been found" in str(err_context.exception) + and "This is incompatible with explicitly setting the device using `to()`" in str(err_context.exception) + ) + + @require_torch_multi_gpu + @slow + @nightly + def test_calling_mco_raises_error_device_mapped_components(self, safe_serialization=True): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + max_model_size = max( + compute_module_sizes(module)[""] + for _, module in pipe.components.items() + if isinstance(module, torch.nn.Module) + ) + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization) + max_memory = {0: max_model_size, 1: max_model_size} + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) + + with self.assertRaises(ValueError) as err_context: + loaded_pipe.enable_model_cpu_offload() + + self.assertTrue( + "The following pipeline components have been found" in str(err_context.exception) + and "This is incompatible with `enable_model_cpu_offload()`" in str(err_context.exception) + ) + + @require_torch_multi_gpu + @slow + @nightly + def test_calling_sco_raises_error_device_mapped_components(self, safe_serialization=True): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + max_model_size = max( + compute_module_sizes(module)[""] + for _, module in pipe.components.items() + if isinstance(module, torch.nn.Module) + ) + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization) + max_memory = {0: max_model_size, 1: max_model_size} + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) + + with self.assertRaises(ValueError) as err_context: + loaded_pipe.enable_sequential_cpu_offload() + + self.assertTrue( + "The following pipeline components have been found" in str(err_context.exception) + and "This is incompatible with `enable_sequential_cpu_offload()`" in str(err_context.exception) + ) + @is_staging_test class PipelinePushToHubTester(unittest.TestCase): diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index 07590c9db458..235418571b30 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -14,6 +14,8 @@ # limitations under the License. 
import gc +import os +import tempfile import unittest import numpy as np @@ -21,9 +23,12 @@ from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel +from diffusers.models.modeling_utils import ModelMixin from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel +from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME from diffusers.utils.testing_utils import ( enable_full_determinism, + is_accelerate_available, load_numpy, nightly, require_torch_gpu, @@ -35,6 +40,9 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + enable_full_determinism() @@ -418,6 +426,34 @@ def test_save_load_optional_components(self): def test_float16_inference(self): super().test_float16_inference(expected_max_diff=1.0) + # It needs a different sharding ratio than the standard 0.75. So, we override it. + def test_sharded_components_can_be_device_placed(self): + components = self.get_dummy_components() + + component_selected = None + for component_name in components: + if isinstance(components[component_name], ModelMixin) and hasattr( + components[component_name], "load_config" + ): + component_to_be_sharded = components[component_name] + component_cls = component_to_be_sharded.__class__ + component_selected = component_name + break + + assert component_selected, "No component selected that can be sharded." + + model_size = compute_module_sizes(component_to_be_sharded)[""] + max_shard_size = int((model_size * 0.45) / (2**10)) + + with tempfile.TemporaryDirectory() as tmp_dir: + component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB") + self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) + + loaded_sharded_component = component_cls.from_pretrained(tmp_dir) + _ = components.pop(component_selected) + components.update({component_selected: loaded_sharded_component}) + _ = self.pipeline_class(**components).to(torch_device) + @nightly class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase): diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index 2e0ba1cfb8eb..5cf017029fdf 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -576,6 +576,15 @@ def test_unidiffuser_default_img2text_v1_cuda_fp16(self): expected_text_prefix = '" This This' assert text[0][: len(expected_text_prefix)] == expected_text_prefix + def test_calling_mco_raises_error_device_mapped_components(self): + super().test_calling_mco_raises_error_device_mapped_components(safe_serialization=False) + + def test_calling_to_raises_error_device_mapped_components(self): + super().test_calling_to_raises_error_device_mapped_components(safe_serialization=False) + + def test_calling_sco_raises_error_device_mapped_components(self): + super().test_calling_sco_raises_error_device_mapped_components(safe_serialization=False) + @nightly @require_torch_gpu diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py index 0caed159100a..cd7891767f65 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py @@ -237,3 +237,15 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + + 
@unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass
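
For context, the snippet below is a minimal sketch (not part of the patch) of the behavior the new guards enforce. It reuses the FLUX.1-dev checkpoint and memory limits from the tests above and assumes a machine with at least two GPUs; the exact error text is the one added to `pipeline_utils.py` in this diff.

```python
# Sketch only: illustrates the ValueError raised when a device-mapped pipeline
# is asked to do an explicit device placement or CPU offload.
import torch
from diffusers import FluxPipeline

# Components are sharded across the available GPUs by accelerate.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    device_map="balanced",
    max_memory={0: "16GB", 1: "16GB"},
    torch_dtype=torch.bfloat16,
)

# Explicit device placement is now rejected for device-mapped components.
try:
    pipe.to("cuda")
except ValueError as e:
    print(e)  # "The following pipeline components have been found to use a device map: ..."

# The same applies to model and sequential CPU offloading.
try:
    pipe.enable_model_cpu_offload()
except ValueError as e:
    print(e)  # "... This is incompatible with `enable_model_cpu_offload()`."
```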