[LoRA] fix: lora loading when using with a device_mapped model. #9449
Changes from 22 commits
Changes to the LoRA loader module:

@@ -39,6 +39,7 @@
     get_adapter_name,
     get_peft_kwargs,
     is_accelerate_available,
+    is_accelerate_version,
     is_peft_version,
     is_torch_version,
     logging,
@@ -398,9 +399,18 @@ def _optionally_disable_offloading(cls, _pipeline):
         is_model_cpu_offload = False
         is_sequential_cpu_offload = False

+        def model_has_device_map(model):
+            if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
+                return False
+            return getattr(model, "hf_device_map", None) is not None
+
         if _pipeline is not None and _pipeline.hf_device_map is None:
             for _, component in _pipeline.components.items():
-                if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
+                if (
+                    isinstance(component, nn.Module)
+                    and hasattr(component, "_hf_hook")
+                    and not model_has_device_map(component)
+                ):
                     if not is_model_cpu_offload:
                         is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
                     if not is_sequential_cpu_offload:
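For reference, the helper introduced here keys off the `hf_device_map` attribute that Accelerate attaches to a model loaded with a `device_map`. A minimal, self-contained sketch of the check (the toy `nn.Linear` modules and the hand-set attribute are illustrative only; the real helper also gates on the installed Accelerate version):

```python
import torch.nn as nn


def model_has_device_map(model):
    # A module counts as "device-mapped" only if Accelerate has attached a
    # non-None `hf_device_map` to it during `from_pretrained(..., device_map=...)`.
    return getattr(model, "hf_device_map", None) is not None


plain = nn.Linear(4, 4)
mapped = nn.Linear(4, 4)
mapped.hf_device_map = {"": 0}  # what Accelerate records after a device-mapped load

print(model_has_device_map(plain))   # False
print(model_has_device_map(mapped))  # True
```

Components for which this returns True are now skipped when the LoRA loader decides whether to temporarily disable CPU offloading.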
Changes to the core pipeline module (`DiffusionPipeline`):
@@ -389,6 +389,11 @@ def to(self, *args, **kwargs):

         device = device or device_arg

+        def model_has_device_map(model):
+            if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
+                return False
+            return getattr(model, "hf_device_map", None) is not None
+
         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
         def module_is_sequentially_offloaded(module):
             if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
@@ -406,6 +411,16 @@ def module_is_offloaded(module):

             return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload)

+        # device-mapped modules should not go through any device placements.
+        device_mapped_components = [
+            key for key, component in self.components.items() if model_has_device_map(component)
+        ]
+        if device_mapped_components:
+            raise ValueError(
+                "The following pipeline components have been found to use a device map: "
+                f"{device_mapped_components}. This is incompatible with explicitly setting the device using `to()`."
+            )
+
         # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer
         pipeline_is_sequentially_offloaded = any(
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
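In user terms, the guard turns an implicit conflict into an explicit error. A sketch of what now happens when a device-mapped pipeline is moved with `to()` (the checkpoint id and the multi-GPU setup are assumptions for illustration):

```python
import torch
from diffusers import DiffusionPipeline

# Shard the pipeline's components across the available GPUs.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    device_map="balanced",
)

# Moving a device-mapped pipeline wholesale is now rejected up front
# instead of failing in a harder-to-debug way later.
try:
    pipe.to("cuda")
except ValueError as e:
    print(e)  # lists the device-mapped components and mentions `to()`
```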
@@ -1002,6 +1017,22 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 default to "cuda".
         """
+
+        def model_has_device_map(model):
+            if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
+                return False
+            return getattr(model, "hf_device_map", None) is not None
+
+        # device-mapped modules should not go through any device placements.
+        device_mapped_components = [
+            key for key, component in self.components.items() if model_has_device_map(component)
+        ]
+        if device_mapped_components:
+            raise ValueError(
+                "The following pipeline components have been found to use a device map: "
+                f"{device_mapped_components}. This is incompatible with `enable_model_cpu_offload()`."
+            )
+
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
             raise ValueError(
@@ -1104,6 +1135,22 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 default to "cuda".
         """
+
+        def model_has_device_map(model):
+            if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
+                return False
+            return getattr(model, "hf_device_map", None) is not None
+
+        # device-mapped modules should not go through any device placements.
+        device_mapped_components = [
+            key for key, component in self.components.items() if model_has_device_map(component)
+        ]
+        if device_mapped_components:
+            raise ValueError(
+                "The following pipeline components have been found to use a device map: "
+                f"{device_mapped_components}. This is incompatible with `enable_sequential_cpu_offload()`."
+            )
+
         if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
             from accelerate import cpu_offload
         else:
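The same pattern guards both offloading entry points. A short continuation of the `to()` sketch above (same assumed checkpoint and multi-GPU setup):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, device_map="balanced"
)

# Both offloading modes now refuse to run on a device-mapped pipeline, since CPU
# offload hooks and an existing device map would fight over component placement.
for enable_offload in (pipe.enable_model_cpu_offload, pipe.enable_sequential_cpu_offload):
    try:
        enable_offload()
    except ValueError as e:
        print(e)  # names the device-mapped components and the incompatible method
```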
Changes to the common pipeline tests:
@@ -30,19 +30,24 @@
 )
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import IPAdapterMixin
+from diffusers.models.adapter import MultiAdapter
 from diffusers.models.attention_processor import AttnProcessor
 from diffusers.models.controlnet_xs import UNetControlNetXSModel
 from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel
 from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
 from diffusers.models.unets.unet_motion_model import UNetMotionModel
+from diffusers.pipelines.controlnet import MultiControlNetModel
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    nightly,
     require_torch,
+    require_torch_multi_gpu,
     skip_mps,
+    slow,
     torch_device,
 )

@@ -59,6 +64,10 @@
 from ..others.test_utils import TOKEN, USER, is_staging_test


+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
+
 def to_np(tensor):
     if isinstance(tensor, torch.Tensor):
         tensor = tensor.detach().cpu().numpy()
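The new tests below share one trick worth spelling out: they cap `max_memory` per GPU at the size of the pipeline's largest component, so `device_map="balanced"` is forced to actually spread components across two devices rather than packing everything onto one. A condensed sketch of that setup (the `pipe`, `PipelineClass`, and `saved_dir` names are placeholders):

```python
import torch
from accelerate.utils import compute_module_sizes


def largest_component_size(pipe):
    # compute_module_sizes returns a dict of per-submodule sizes in bytes;
    # the "" key is the whole module, which is what gets compared against
    # the per-GPU memory budget.
    return max(
        compute_module_sizes(module)[""]
        for module in pipe.components.values()
        if isinstance(module, torch.nn.Module)
    )


# max_memory = {0: largest_component_size(pipe), 1: largest_component_size(pipe)}
# reloaded = PipelineClass.from_pretrained(saved_dir, device_map="balanced", max_memory=max_memory)
```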
@@ -1907,6 +1916,99 @@ def test_StableDiffusionMixin_component(self):
             )
         )

+    @require_torch_multi_gpu
+    @slow
+    @nightly
+    def test_calling_to_raises_error_device_mapped_components(self):
+        if "Combined" in self.pipeline_class.__name__:
+            return
+
+        # TODO (sayakpaul): skip these for now. revisit later.
+        components = self.get_dummy_components()
+        if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components):
+            return
+
+        pipe = self.pipeline_class(**components)
+        max_model_size = max(
+            compute_module_sizes(module)[""]
+            for _, module in pipe.components.items()
+            if isinstance(module, torch.nn.Module)
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            max_memory = {0: max_model_size, 1: max_model_size}
+            loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)
+
+        with self.assertRaises(ValueError) as err_context:
+            loaded_pipe.to(torch_device)
+
+        self.assertTrue(
+            "The following pipeline components have been found" in str(err_context.exception)
+            and "This is incompatible with explicitly setting the device using `to()`" in str(err_context.exception)
+        )
+
+    @require_torch_multi_gpu
+    @slow
+    @nightly
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        if "Combined" in self.pipeline_class.__name__:
+            return
+
+        # TODO (sayakpaul): skip these for now. revisit later.
+        components = self.get_dummy_components()
+        if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components):
+            return
+
+        pipe = self.pipeline_class(**components)
+        max_model_size = max(
+            compute_module_sizes(module)[""]
+            for _, module in pipe.components.items()
+            if isinstance(module, torch.nn.Module)
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            max_memory = {0: max_model_size, 1: max_model_size}
+            loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)
+
+        with self.assertRaises(ValueError) as err_context:
+            loaded_pipe.enable_model_cpu_offload()
+
+        self.assertTrue(
+            "The following pipeline components have been found" in str(err_context.exception)
+            and "This is incompatible with `enable_model_cpu_offload()`" in str(err_context.exception)
+        )
+
+    @require_torch_multi_gpu
+    @slow
+    @nightly
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        if "Combined" in self.pipeline_class.__name__:
+            return
+
+        # TODO (sayakpaul): skip these for now. revisit later.
+        components = self.get_dummy_components()
+        if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components):
+            return
+
+        pipe = self.pipeline_class(**components)
+        max_model_size = max(
+            compute_module_sizes(module)[""]
+            for _, module in pipe.components.items()
+            if isinstance(module, torch.nn.Module)
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir)
+            max_memory = {0: max_model_size, 1: max_model_size}
+            loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)
+
+        with self.assertRaises(ValueError) as err_context:
+            loaded_pipe.enable_sequential_cpu_offload()
+
+        self.assertTrue(
+            "The following pipeline components have been found" in str(err_context.exception)
+            and "This is incompatible with `enable_sequential_cpu_offload()`" in str(err_context.exception)
+        )
+

 @is_staging_test
 class PipelinePushToHubTester(unittest.TestCase):
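For completeness, the user-facing scenario the PR title refers to looks roughly like the following. The checkpoint id, LoRA file path, and prompt are placeholders, and a multi-GPU machine with Accelerate installed is assumed:

```python
import torch
from diffusers import DiffusionPipeline

# Shard the pipeline's components across the available GPUs.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    device_map="balanced",
)

# With this fix, LoRA loading no longer trips over the offload-detection logic
# when the pipeline's components carry a device map.
pipe.load_lora_weights("path/to/lora_weights.safetensors")

image = pipe("a photo of an astronaut riding a horse", num_inference_steps=30).images[0]
image.save("sample.png")
```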