From dc1aee2718c685aa6a430618185a07c4d2707faa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 17 Sep 2024 07:22:22 +0530 Subject: [PATCH 01/14] fix: lora loading when using with a device_mapped model. --- src/diffusers/loaders/lora_base.py | 12 ++++++- src/diffusers/pipelines/pipeline_utils.py | 38 +++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py index 89bb498a3acd..9f6ea8ae9362 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -31,6 +31,7 @@ delete_adapter_layers, deprecate, is_accelerate_available, + is_accelerate_version, is_peft_available, is_transformers_available, logging, @@ -214,9 +215,18 @@ def _optionally_disable_offloading(cls, _pipeline): is_model_cpu_offload = False is_sequential_cpu_offload = False + def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + return hasattr(model, "hf_device_map") and model.hf_device_map is not None + if _pipeline is not None and _pipeline.hf_device_map is None: for _, component in _pipeline.components.items(): - if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): + if ( + isinstance(component, nn.Module) + and hasattr(component, "_hf_hook") + and not model_has_device_map(component) + ): if not is_model_cpu_offload: is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) if not is_sequential_cpu_offload: diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index dffd49cb0ce7..cbff7fe99945 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -386,6 +386,11 @@ def to(self, *args, **kwargs): device = device or device_arg + def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + return hasattr(model, "hf_device_map") and model.hf_device_map is not None + # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU. def module_is_sequentially_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): @@ -403,6 +408,13 @@ def module_is_offloaded(module): return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload) + # device-mapped modules should not go through any device placements. + pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) + if pipeline_has_device_mapped_modules: + raise ValueError( + "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." + ) + # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer pipeline_is_sequentially_offloaded = any( module_is_sequentially_offloaded(module) for _, module in self.components.items() @@ -975,6 +987,19 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will default to "cuda". """ + + def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + return hasattr(model, "hf_device_map") and model.hf_device_map is not None + + # device-mapped modules should not go through any device placements. 
+ pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) + if pipeline_has_device_mapped_modules: + raise ValueError( + "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." + ) + is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1 if is_pipeline_device_mapped: raise ValueError( @@ -1068,6 +1093,19 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will default to "cuda". """ + + def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + return hasattr(model, "hf_device_map") and model.hf_device_map is not None + + # device-mapped modules should not go through any device placements. + pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) + if pipeline_has_device_mapped_modules: + raise ValueError( + "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." + ) + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): from accelerate import cpu_offload else: From 949a9298e3b616ca50b0eca290cafd07617baad5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 17 Sep 2024 19:34:32 +0530 Subject: [PATCH 02/14] better attibutung --- src/diffusers/pipelines/pipeline_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index cbff7fe99945..b286b4e6e44c 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -389,7 +389,7 @@ def to(self, *args, **kwargs): def model_has_device_map(model): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(model, "hf_device_map") and model.hf_device_map is not None + return getattr(model, "hf_device_map", None) is not None # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU. def module_is_sequentially_offloaded(module): @@ -991,7 +991,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t def model_has_device_map(model): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(model, "hf_device_map") and model.hf_device_map is not None + return getattr(model, "hf_device_map", None) is not None # device-mapped modules should not go through any device placements. pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) @@ -1097,7 +1097,7 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un def model_has_device_map(model): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(model, "hf_device_map") and model.hf_device_map is not None + return getattr(model, "hf_device_map", None) is not None # device-mapped modules should not go through any device placements. 
pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) From 64b3ad14dadffb683e854c8c1097ec82e9b025da Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 17 Sep 2024 19:36:14 +0530 Subject: [PATCH 03/14] empty Co-authored-by: Benjamin Bossan From 54791980856dcfcdb60eae13ce0a5ad8f73dffee Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 24 Sep 2024 19:53:34 +0530 Subject: [PATCH 04/14] Apply suggestions from code review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/diffusers/loaders/lora_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py index 9f6ea8ae9362..0799af06fb35 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -218,7 +218,7 @@ def _optionally_disable_offloading(cls, _pipeline): def model_has_device_map(model): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(model, "hf_device_map") and model.hf_device_map is not None + return getattr(model, "hf_device_map", None) is not None if _pipeline is not None and _pipeline.hf_device_map is None: for _, component in _pipeline.components.items(): From ea727a3b320df43af86844074f3d2ffc76bf32e6 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 19 Oct 2024 16:05:33 +0530 Subject: [PATCH 05/14] minors --- src/diffusers/pipelines/pipeline_utils.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 1b7e872e826c..91eeb4aa0cd5 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -410,7 +410,9 @@ def module_is_offloaded(module): return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload) # device-mapped modules should not go through any device placements. - pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) + pipeline_has_device_mapped_modules = any( + model_has_device_map(component) for _, component in self.components.items() + ) if pipeline_has_device_mapped_modules: raise ValueError( "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." @@ -1014,10 +1016,12 @@ def model_has_device_map(model): return getattr(model, "hf_device_map", None) is not None # device-mapped modules should not go through any device placements. - pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) + pipeline_has_device_mapped_modules = any( + model_has_device_map(component) for _, component in self.components.items() + ) if pipeline_has_device_mapped_modules: raise ValueError( - "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." + "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `enable_model_cpu_offload()`." ) is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1 @@ -1120,10 +1124,12 @@ def model_has_device_map(model): return getattr(model, "hf_device_map", None) is not None # device-mapped modules should not go through any device placements. 
- pipeline_has_device_mapped_modules = any(model_has_device_map(module) for _, module in self.components.items()) + pipeline_has_device_mapped_modules = any( + model_has_device_map(component) for _, component in self.components.items() + ) if pipeline_has_device_mapped_modules: raise ValueError( - "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." + "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `enable_sequential_cpu_offload()`." ) if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): From 71989e3edf7c68cfea11e356647aa816c7091dda Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 19 Oct 2024 16:17:12 +0530 Subject: [PATCH 06/14] better error messages. --- src/diffusers/pipelines/pipeline_utils.py | 33 ++++++++++++----------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 91eeb4aa0cd5..f918de858781 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -410,12 +410,13 @@ def module_is_offloaded(module): return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload) # device-mapped modules should not go through any device placements. - pipeline_has_device_mapped_modules = any( - model_has_device_map(component) for _, component in self.components.items() - ) - if pipeline_has_device_mapped_modules: + device_mapped_components = [ + key for key, component in self.components.items() if model_has_device_map(component) + ] + if device_mapped_components: raise ValueError( - "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `to()`." + "The following pipeline components have been found to use a device map: " + f"{device_mapped_components}. This is incompatible with explicitly setting the device using `to()`." ) # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer @@ -1016,12 +1017,13 @@ def model_has_device_map(model): return getattr(model, "hf_device_map", None) is not None # device-mapped modules should not go through any device placements. - pipeline_has_device_mapped_modules = any( - model_has_device_map(component) for _, component in self.components.items() - ) - if pipeline_has_device_mapped_modules: + device_mapped_components = [ + key for key, component in self.components.items() if model_has_device_map(component) + ] + if device_mapped_components: raise ValueError( - "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `enable_model_cpu_offload()`." + "The following pipeline components have been found to use a device map: " + f"{device_mapped_components}. This is incompatible with explicitly setting the device using `enable_model_cpu_offload()`." ) is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1 @@ -1124,12 +1126,13 @@ def model_has_device_map(model): return getattr(model, "hf_device_map", None) is not None # device-mapped modules should not go through any device placements. 
- pipeline_has_device_mapped_modules = any( - model_has_device_map(component) for _, component in self.components.items() - ) - if pipeline_has_device_mapped_modules: + device_mapped_components = [ + key for key, component in self.components.items() if model_has_device_map(component) + ] + if device_mapped_components: raise ValueError( - "It seems like you have device-mapped modules in the pipeline which doesn't allow explicit device placement using `enable_sequential_cpu_offload()`." + "The following pipeline components have been found to use a device map: " + f"{device_mapped_components}. This is incompatible with explicitly setting the device using `enable_sequential_cpu_offload()`." ) if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): From f62afac6405e9a89ec3b11e105b7520ffd0faa9e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 19 Oct 2024 16:18:15 +0530 Subject: [PATCH 07/14] fix-copies --- src/diffusers/loaders/unet.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 2fa7732a6a3b..55b1a24e60db 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -39,6 +39,7 @@ get_adapter_name, get_peft_kwargs, is_accelerate_available, + is_accelerate_version, is_peft_version, is_torch_version, logging, @@ -398,9 +399,18 @@ def _optionally_disable_offloading(cls, _pipeline): is_model_cpu_offload = False is_sequential_cpu_offload = False + def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + return getattr(model, "hf_device_map", None) is not None + if _pipeline is not None and _pipeline.hf_device_map is None: for _, component in _pipeline.components.items(): - if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): + if ( + isinstance(component, nn.Module) + and hasattr(component, "_hf_hook") + and not model_has_device_map(component) + ): if not is_model_cpu_offload: is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) if not is_sequential_cpu_offload: From 2334f78c3bae84feec8f55deb0cd737dfa7ceca8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 19 Oct 2024 18:06:41 +0530 Subject: [PATCH 08/14] add: tests, docs. --- .../en/training/distributed_inference.md | 2 + src/diffusers/pipelines/pipeline_utils.py | 4 +- tests/pipelines/audioldm2/test_audioldm2.py | 5 + tests/pipelines/flux/test_pipeline_flux.py | 82 ++++++++++++++ tests/pipelines/musicldm/test_musicldm.py | 4 + tests/pipelines/test_pipelines_common.py | 102 ++++++++++++++++++ 6 files changed, 197 insertions(+), 2 deletions(-) diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 0e1eb7962bf7..63bbec0b1190 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -237,3 +237,5 @@ with torch.no_grad(): ``` By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs. + +This workflow is also compatible when working with LoRAs via `load_lora_weights()`. However, note that only LoRAs not involving any text encoder components are supported in this workflow at the moment. 
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index f918de858781..40e1924d64eb 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1023,7 +1023,7 @@ def model_has_device_map(model): if device_mapped_components: raise ValueError( "The following pipeline components have been found to use a device map: " - f"{device_mapped_components}. This is incompatible with explicitly setting the device using `enable_model_cpu_offload()`." + f"{device_mapped_components}. This is incompatible with `enable_model_cpu_offload()`." ) is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1 @@ -1132,7 +1132,7 @@ def model_has_device_map(model): if device_mapped_components: raise ValueError( "The following pipeline components have been found to use a device map: " - f"{device_mapped_components}. This is incompatible with explicitly setting the device using `enable_sequential_cpu_offload()`." + f"{device_mapped_components}. This is incompatible with `enable_sequential_cpu_offload()`." ) if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index fb550dd3219d..9af49697f913 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -506,9 +506,14 @@ def test_to_dtype(self): model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) + @unittest.skip("Test currently not supported.") def test_sequential_cpu_offload_forward_pass(self): pass + @unittest.skip("Test currently not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + @nightly class AudioLDM2PipelineSlowTests(unittest.TestCase): diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 4caff4030261..0063255ce86b 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -6,9 +6,11 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel +from diffusers.image_processor import VaeImageProcessor from diffusers.utils.testing_utils import ( numpy_cosine_similarity_distance, require_torch_gpu, + require_torch_multi_gpu, slow, torch_device, ) @@ -249,3 +251,83 @@ def test_flux_inference(self): max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten()) assert max_diff < 1e-4 + + @require_torch_multi_gpu + @torch.no_grad() + def test_flux_component_sharding(self): + ckpt_id = "black-forest-labs/FLUX.1-dev" + dtype = torch.bfloat16 + prompt = "a photo of a cat with tiger-like look" + + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + transformer=None, + vae=None, + device_map="balanced", + max_memory={0: "16GB", 1: "16GB"}, + torch_dtype=dtype, + ) + prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt( + prompt=prompt, prompt_2=None, max_sequence_length=512 + ) + + del pipeline.text_encoder + del pipeline.text_encoder_2 + del pipeline.tokenizer + del pipeline.tokenizer_2 + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + transformer = FluxTransformer2DModel.from_pretrained( + ckpt_id, 
subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype + ) + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + text_encoder=None, + text_encoder_2=None, + tokenizer=None, + tokenizer_2=None, + vae=None, + transformer=transformer, + torch_dtype=dtype, + ) + + height, width = 768, 1360 + # No need to wrap it up under `torch.no_grad()` as pipeline call method + # is already wrapped under that. + latents = pipeline( + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_inference_steps=10, + guidance_scale=3.5, + height=height, + width=width, + output_type="latent", + generator=torch.manual_seed(0), + ).images + latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy() + expected_slice = np.array([-0.377, -0.3008, -0.5117, -0.252, 0.0615, -0.3477, -0.1309, -0.1914, 0.1533]) + + assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4 + + del pipeline.transformer + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device) + vae_scale_factor = 2 ** (len(vae.config.block_out_channels)) + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + + latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor) + latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor + + image = vae.decode(latents, return_dict=False)[0] + image = image_processor.postprocess(image, output_type="np") + image_slice = image[0, :3, :3, -1].flatten() + expected_slice = np.array([0.127, 0.1113, 0.1055, 0.1172, 0.1172, 0.1074, 0.1191, 0.1191, 0.1152]) + + assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4 diff --git a/tests/pipelines/musicldm/test_musicldm.py b/tests/pipelines/musicldm/test_musicldm.py index e51f5103933a..70765d981bbc 100644 --- a/tests/pipelines/musicldm/test_musicldm.py +++ b/tests/pipelines/musicldm/test_musicldm.py @@ -404,6 +404,10 @@ def test_to_dtype(self): model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) + @unittest.skip("Test currently not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + @nightly @require_torch_gpu diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 3e6f9d1278e8..7a4891f38a75 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -30,19 +30,24 @@ ) from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import IPAdapterMixin +from diffusers.models.adapter import MultiAdapter from diffusers.models.attention_processor import AttnProcessor from diffusers.models.controlnet_xs import UNetControlNetXSModel from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet from diffusers.models.unets.unet_motion_model import UNetMotionModel +from diffusers.pipelines.controlnet import MultiControlNetModel from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import logging from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available from diffusers.utils.testing_utils import ( CaptureLogger, + nightly, 
require_torch, + require_torch_multi_gpu, skip_mps, + slow, torch_device, ) @@ -59,6 +64,10 @@ from ..others.test_utils import TOKEN, USER, is_staging_test +if is_accelerate_available(): + from accelerate.utils import compute_module_sizes + + def to_np(tensor): if isinstance(tensor, torch.Tensor): tensor = tensor.detach().cpu().numpy() @@ -1907,6 +1916,99 @@ def test_StableDiffusionMixin_component(self): ) ) + @require_torch_multi_gpu + @slow + @nightly + def test_calling_to_raises_error_device_mapped_components(self): + if "Combined" in self.pipeline_class.__name__: + return + + # TODO (sayakpaul): skip these for now. revisit later. + components = self.get_dummy_components() + if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): + return + + pipe = self.pipeline_class(**components) + max_model_size = max( + compute_module_sizes(module)[""] + for _, module in pipe.components.items() + if isinstance(module, torch.nn.Module) + ) + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + max_memory = {0: max_model_size, 1: max_model_size} + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) + + with self.assertRaises(ValueError) as err_context: + loaded_pipe.to(torch_device) + + self.assertTrue( + "The following pipeline components have been found" in str(err_context.exception) + and "This is incompatible with explicitly setting the device using `to()`" in str(err_context.exception) + ) + + @require_torch_multi_gpu + @slow + @nightly + def test_calling_mco_raises_error_device_mapped_components(self): + if "Combined" in self.pipeline_class.__name__: + return + + # TODO (sayakpaul): skip these for now. revisit later. + components = self.get_dummy_components() + if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): + return + + pipe = self.pipeline_class(**components) + max_model_size = max( + compute_module_sizes(module)[""] + for _, module in pipe.components.items() + if isinstance(module, torch.nn.Module) + ) + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + max_memory = {0: max_model_size, 1: max_model_size} + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) + + with self.assertRaises(ValueError) as err_context: + loaded_pipe.enable_model_cpu_offload() + + self.assertTrue( + "The following pipeline components have been found" in str(err_context.exception) + and "This is incompatible with `enable_model_cpu_offload()`" in str(err_context.exception) + ) + + @require_torch_multi_gpu + @slow + @nightly + def test_calling_sco_raises_error_device_mapped_components(self): + if "Combined" in self.pipeline_class.__name__: + return + + # TODO (sayakpaul): skip these for now. revisit later. 
+ components = self.get_dummy_components() + if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): + return + + pipe = self.pipeline_class(**components) + max_model_size = max( + compute_module_sizes(module)[""] + for _, module in pipe.components.items() + if isinstance(module, torch.nn.Module) + ) + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + max_memory = {0: max_model_size, 1: max_model_size} + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) + + with self.assertRaises(ValueError) as err_context: + loaded_pipe.enable_sequential_cpu_offload() + + self.assertTrue( + "The following pipeline components have been found" in str(err_context.exception) + and "This is incompatible with `enable_sequential_cpu_offload()`" in str(err_context.exception) + ) + @is_staging_test class PipelinePushToHubTester(unittest.TestCase): From 5ea1173aeb7e8a66041c222021135be207f616ec Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 19 Oct 2024 18:10:24 +0530 Subject: [PATCH 09/14] add hardware note. --- tests/pipelines/flux/test_pipeline_flux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 0063255ce86b..918abb7bc4f1 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -255,6 +255,10 @@ def test_flux_inference(self): @require_torch_multi_gpu @torch.no_grad() def test_flux_component_sharding(self): + """ + internal note: test was run on `audace`. + """ + ckpt_id = "black-forest-labs/FLUX.1-dev" dtype = torch.bfloat16 prompt = "a photo of a cat with tiger-like look" From c0dee879d4d5bf1b36e81c27869369aa3414a76f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 19 Oct 2024 18:19:59 +0530 Subject: [PATCH 10/14] quality --- tests/pipelines/flux/test_pipeline_flux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 918abb7bc4f1..1927055ca790 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -258,7 +258,7 @@ def test_flux_component_sharding(self): """ internal note: test was run on `audace`. """ - + ckpt_id = "black-forest-labs/FLUX.1-dev" dtype = torch.bfloat16 prompt = "a photo of a cat with tiger-like look" From fe2cca8766df60ef75caa77ce6375a9d0162b1b0 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 23 Oct 2024 12:51:05 +0530 Subject: [PATCH 11/14] Update docs/source/en/training/distributed_inference.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/training/distributed_inference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 63bbec0b1190..8e68b1bed382 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -238,4 +238,4 @@ with torch.no_grad(): By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs. -This workflow is also compatible when working with LoRAs via `load_lora_weights()`. 
However, note that only LoRAs not involving any text encoder components are supported in this workflow at the moment. +This workflow is also compatible with LoRAs via [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. However, only LoRAs without text encoder components are currently supported in this workflow. From 03377b7afc18ae6c75bed65bb1a57ef7bfee71c3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 31 Oct 2024 19:02:45 +0530 Subject: [PATCH 12/14] fixes --- .../pipelines/pipeline_loading_utils.py | 7 ++ src/diffusers/pipelines/pipeline_utils.py | 18 +--- tests/pipelines/flux/test_pipeline_flux.py | 87 ++++++++++++++++++- 3 files changed, 94 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 5eba1952e608..7d42ed5bcba8 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -36,6 +36,7 @@ deprecate, get_class_from_dynamic_module, is_accelerate_available, + is_accelerate_version, is_peft_available, is_transformers_available, logging, @@ -947,3 +948,9 @@ def _get_ignore_patterns( ) return ignore_patterns + + +def model_has_device_map(model): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + return getattr(model, "hf_device_map", None) is not None diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d35fb77feb29..791b3e5e9394 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -85,6 +85,7 @@ _update_init_kwargs_with_connected_pipeline, load_sub_model, maybe_raise_or_warn, + model_has_device_map, variant_compatible_siblings, warn_deprecated_model_variant, ) @@ -389,11 +390,6 @@ def to(self, *args, **kwargs): device = device or device_arg - def model_has_device_map(model): - if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): - return False - return getattr(model, "hf_device_map", None) is not None - # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU. def module_is_sequentially_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): @@ -1017,12 +1013,6 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will default to "cuda". """ - - def model_has_device_map(model): - if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): - return False - return getattr(model, "hf_device_map", None) is not None - # device-mapped modules should not go through any device placements. device_mapped_components = [ key for key, component in self.components.items() if model_has_device_map(component) @@ -1135,12 +1125,6 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will default to "cuda". """ - - def model_has_device_map(model): - if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): - return False - return getattr(model, "hf_device_map", None) is not None - # device-mapped modules should not go through any device placements. 
device_mapped_components = [ key for key, component in self.components.items() if model_has_device_map(component) diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 1927055ca790..81937b0e47d0 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -323,7 +323,7 @@ def test_flux_component_sharding(self): torch.cuda.empty_cache() vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device) - vae_scale_factor = 2 ** (len(vae.config.block_out_channels)) + vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor) @@ -335,3 +335,88 @@ def test_flux_component_sharding(self): expected_slice = np.array([0.127, 0.1113, 0.1055, 0.1172, 0.1172, 0.1074, 0.1191, 0.1191, 0.1152]) assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4 + + @require_torch_multi_gpu + @torch.no_grad() + def test_flux_component_sharding_with_lora(self): + """ + internal note: test was run on `audace`. + """ + + ckpt_id = "black-forest-labs/FLUX.1-dev" + dtype = torch.bfloat16 + prompt = "jon snow eating pizza." + + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + transformer=None, + vae=None, + device_map="balanced", + max_memory={0: "16GB", 1: "16GB"}, + torch_dtype=dtype, + ) + prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt( + prompt=prompt, prompt_2=None, max_sequence_length=512 + ) + + del pipeline.text_encoder + del pipeline.text_encoder_2 + del pipeline.tokenizer + del pipeline.tokenizer_2 + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + transformer = FluxTransformer2DModel.from_pretrained( + ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype + ) + pipeline = FluxPipeline.from_pretrained( + ckpt_id, + text_encoder=None, + text_encoder_2=None, + tokenizer=None, + tokenizer_2=None, + vae=None, + transformer=transformer, + torch_dtype=dtype, + ) + pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors") + + height, width = 768, 1360 + # No need to wrap it up under `torch.no_grad()` as pipeline call method + # is already wrapped under that. 
+ latents = pipeline( + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_inference_steps=10, + guidance_scale=3.5, + height=height, + width=width, + output_type="latent", + generator=torch.manual_seed(0), + ).images + latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy() + expected_slice = np.array([-0.6523, -0.4961, -0.9141, -0.5, -0.2129, -0.6914, -0.375, -0.5664, -0.1699]) + + assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4 + + del pipeline.transformer + del pipeline + + gc.collect() + torch.cuda.empty_cache() + + vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device) + vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + + latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor) + latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor + + image = vae.decode(latents, return_dict=False)[0] + image = image_processor.postprocess(image, output_type="np") + image_slice = image[0, :3, :3, -1].flatten() + expected_slice = np.array([0.1211, 0.1094, 0.1035, 0.1094, 0.1113, 0.1074, 0.1133, 0.1133, 0.1094]) + + assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4 From 0bd40cbff35d01a76674a68f47a644f5e836df9a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 31 Oct 2024 19:10:18 +0530 Subject: [PATCH 13/14] skip properly. --- .../kandinsky/test_kandinsky_combined.py | 12 +++++++ .../kandinsky2_2/test_kandinsky_combined.py | 36 +++++++++++++++++++ .../test_stable_cascade_combined.py | 12 +++++++ tests/pipelines/test_pipelines_common.py | 9 ----- .../wuerstchen/test_wuerstchen_combined.py | 12 +++++++ 5 files changed, 72 insertions(+), 9 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index 607a47e08e58..305e4bc60eeb 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -363,3 +363,15 @@ def test_save_load_optional_components(self): def test_save_load_local(self): super().test_save_load_local(expected_max_difference=5e-3) + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py index dbba0831397b..cf2b70f4c990 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py @@ -159,6 +159,18 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyV22Img2ImgCombinedPipeline @@ -281,6 +293,18 @@ def test_callback_inputs(self): def 
test_callback_cfg(self): pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyV22InpaintCombinedPipeline @@ -404,3 +428,15 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py index d256deed376c..d799ae6e623a 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py @@ -279,3 +279,15 @@ def test_stable_cascade_combined_prompt_embeds(self): ) assert np.abs(output_prompt.images - output_prompt_embeds.images).max() < 1e-5 + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index e59a4dfb1d48..9671efe6c979 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -1921,9 +1921,6 @@ def test_StableDiffusionMixin_component(self): @slow @nightly def test_calling_to_raises_error_device_mapped_components(self): - if "Combined" in self.pipeline_class.__name__: - return - # TODO (sayakpaul): skip these for now. revisit later. components = self.get_dummy_components() if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): @@ -1952,9 +1949,6 @@ def test_calling_to_raises_error_device_mapped_components(self): @slow @nightly def test_calling_mco_raises_error_device_mapped_components(self): - if "Combined" in self.pipeline_class.__name__: - return - # TODO (sayakpaul): skip these for now. revisit later. components = self.get_dummy_components() if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): @@ -1983,9 +1977,6 @@ def test_calling_mco_raises_error_device_mapped_components(self): @slow @nightly def test_calling_sco_raises_error_device_mapped_components(self): - if "Combined" in self.pipeline_class.__name__: - return - # TODO (sayakpaul): skip these for now. revisit later. 
components = self.get_dummy_components() if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py index 0caed159100a..cd7891767f65 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py @@ -237,3 +237,15 @@ def test_callback_inputs(self): def test_callback_cfg(self): pass + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass From a61b754fe54890513c42eed3ec4c93968c246040 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 31 Oct 2024 20:40:59 +0530 Subject: [PATCH 14/14] fixes --- tests/pipelines/controlnet/test_controlnet.py | 24 +++++++++++++++++ .../controlnet/test_controlnet_img2img.py | 12 +++++++++ .../controlnet/test_controlnet_inpaint.py | 12 +++++++++ .../controlnet/test_controlnet_sdxl.py | 24 +++++++++++++++++ .../kandinsky/test_kandinsky_combined.py | 24 +++++++++++++++++ .../test_stable_diffusion_adapter.py | 12 +++++++++ .../test_stable_diffusion_xl_adapter.py | 18 ++++++++----- .../stable_unclip/test_stable_unclip.py | 12 +++++++++ .../test_stable_unclip_img2img.py | 12 +++++++++ tests/pipelines/test_pipelines_common.py | 26 +++++-------------- .../pipelines/unidiffuser/test_unidiffuser.py | 9 +++++++ 11 files changed, 159 insertions(+), 26 deletions(-) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index b12655d989d4..1cb6569716a8 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -514,6 +514,18 @@ def test_inference_multiple_prompt_input(self): assert image.shape == (4, 64, 64, 3) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class StableDiffusionMultiControlNetOneModelPipelineFastTests( IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase @@ -697,6 +709,18 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 7c4ae716b37d..45bc70c809f2 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -389,6 +389,18 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass + @unittest.skip("Test not supported.") + def 
test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index e49106334c2e..af8ddb7e6b28 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -441,6 +441,18 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index c931391ac4d5..a8fa23678fc7 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -683,6 +683,18 @@ def test_inference_batch_single_identical(self): def test_save_load_optional_components(self): return self._test_save_load_optional_components() + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class StableDiffusionXLMultiControlNetOneModelPipelineFastTests( PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase @@ -887,6 +899,18 @@ def test_negative_conditions(self): self.assertTrue(np.abs(image_slice_without_neg_cond - image_slice_with_neg_cond).max() > 1e-2) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index 305e4bc60eeb..739f8676cbd3 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -139,6 +139,18 @@ def test_float16_inference(self): def test_dict_tuple_outputs_equivalent(self): super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyImg2ImgCombinedPipeline @@ -248,6 +260,18 
@@ def test_dict_tuple_outputs_equivalent(self): def test_save_load_optional_components(self): super().test_save_load_optional_components(expected_max_difference=5e-4) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyInpaintCombinedPipeline diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py index 2a1e691e9e8f..996afbb9d323 100644 --- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py @@ -593,6 +593,18 @@ def test_inference_batch_single_identical( if test_mean_pixel_difference: assert_mean_pixel_difference(output_batch[0][0], output[0][0]) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @slow @require_torch_gpu diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 2091af9c0383..61b5b754c44c 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -642,9 +642,6 @@ def test_adapter_sdxl_lcm(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448]) - debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()] - print(",".join(debug)) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 def test_adapter_sdxl_lcm_custom_timesteps(self): @@ -667,7 +664,16 @@ def test_adapter_sdxl_lcm_custom_timesteps(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448]) - debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()] - print(",".join(debug)) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index bb54d212a786..be5e3783ff5c 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -184,6 +184,18 @@ def test_attention_slicing_forward_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=1e-3) + @unittest.skip("Test not supported.") + def 
test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @nightly @require_torch_gpu diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index a5cbf7761501..1a662819b00f 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -205,6 +205,18 @@ def test_inference_batch_single_identical(self): def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) + @unittest.skip("Test not supported.") + def test_calling_mco_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_to_raises_error_device_mapped_components(self): + pass + + @unittest.skip("Test not supported.") + def test_calling_sco_raises_error_device_mapped_components(self): + pass + @nightly @require_torch_gpu diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 9671efe6c979..f5ceda8f2703 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -30,13 +30,11 @@ ) from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import IPAdapterMixin -from diffusers.models.adapter import MultiAdapter from diffusers.models.attention_processor import AttnProcessor from diffusers.models.controlnet_xs import UNetControlNetXSModel from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet from diffusers.models.unets.unet_motion_model import UNetMotionModel -from diffusers.pipelines.controlnet import MultiControlNetModel from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import logging @@ -1920,12 +1918,8 @@ def test_StableDiffusionMixin_component(self): @require_torch_multi_gpu @slow @nightly - def test_calling_to_raises_error_device_mapped_components(self): - # TODO (sayakpaul): skip these for now. revisit later. + def test_calling_to_raises_error_device_mapped_components(self, safe_serialization=True): components = self.get_dummy_components() - if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): - return - pipe = self.pipeline_class(**components) max_model_size = max( compute_module_sizes(module)[""] @@ -1933,7 +1927,7 @@ def test_calling_to_raises_error_device_mapped_components(self): if isinstance(module, torch.nn.Module) ) with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) + pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization) max_memory = {0: max_model_size, 1: max_model_size} loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) @@ -1948,12 +1942,8 @@ def test_calling_to_raises_error_device_mapped_components(self): @require_torch_multi_gpu @slow @nightly - def test_calling_mco_raises_error_device_mapped_components(self): - # TODO (sayakpaul): skip these for now. revisit later. 
+ def test_calling_mco_raises_error_device_mapped_components(self, safe_serialization=True): components = self.get_dummy_components() - if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): - return - pipe = self.pipeline_class(**components) max_model_size = max( compute_module_sizes(module)[""] @@ -1961,7 +1951,7 @@ def test_calling_mco_raises_error_device_mapped_components(self): if isinstance(module, torch.nn.Module) ) with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) + pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization) max_memory = {0: max_model_size, 1: max_model_size} loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) @@ -1976,12 +1966,8 @@ def test_calling_mco_raises_error_device_mapped_components(self): @require_torch_multi_gpu @slow @nightly - def test_calling_sco_raises_error_device_mapped_components(self): - # TODO (sayakpaul): skip these for now. revisit later. + def test_calling_sco_raises_error_device_mapped_components(self, safe_serialization=True): components = self.get_dummy_components() - if any(isinstance(component, (MultiControlNetModel, MultiAdapter)) for component in components): - return - pipe = self.pipeline_class(**components) max_model_size = max( compute_module_sizes(module)[""] @@ -1989,7 +1975,7 @@ def test_calling_sco_raises_error_device_mapped_components(self): if isinstance(module, torch.nn.Module) ) with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) + pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization) max_memory = {0: max_model_size, 1: max_model_size} loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory) diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index 2e0ba1cfb8eb..5cf017029fdf 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -576,6 +576,15 @@ def test_unidiffuser_default_img2text_v1_cuda_fp16(self): expected_text_prefix = '" This This' assert text[0][: len(expected_text_prefix)] == expected_text_prefix + def test_calling_mco_raises_error_device_mapped_components(self): + super().test_calling_mco_raises_error_device_mapped_components(safe_serialization=False) + + def test_calling_to_raises_error_device_mapped_components(self): + super().test_calling_to_raises_error_device_mapped_components(safe_serialization=False) + + def test_calling_sco_raises_error_device_mapped_components(self): + super().test_calling_sco_raises_error_device_mapped_components(safe_serialization=False) + @nightly @require_torch_gpu
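For context, here is a minimal usage sketch of the workflow this patch series targets, distilled from the Flux tests above: the transformer is sharded across GPUs with a device map, a transformer-only LoRA is loaded on top of it, and explicit device placement is then rejected for the device-mapped component. The checkpoint, LoRA repository, and memory limits mirror the test code; a machine with two 16GB+ GPUs is assumed, and the remaining pipeline components are left on CPU for simplicity, so this is an illustrative sketch rather than a supported recipe.

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel

    ckpt_id = "black-forest-labs/FLUX.1-dev"
    dtype = torch.bfloat16

    # Shard only the transformer across the two GPUs via a device map.
    transformer = FluxTransformer2DModel.from_pretrained(
        ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype
    )
    pipeline = FluxPipeline.from_pretrained(ckpt_id, transformer=transformer, torch_dtype=dtype)

    # With these patches, LoRA loading works even though the transformer is device-mapped
    # (LoRAs that touch the text encoders are not supported in this workflow).
    pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")

    # Explicit device placement is rejected for pipelines holding device-mapped components;
    # enable_model_cpu_offload() and enable_sequential_cpu_offload() raise in the same way.
    try:
        pipeline.to("cuda")
    except ValueError as err:
        print(err)  # "The following pipeline components have been found to use a device map: ['transformer'] ..."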