From 997e56cf6b20cccfa8b12b7fa14b00b97211f9a9 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Tue, 17 Dec 2024 07:00:24 +0100
Subject: [PATCH 1/7] add sharded + device_map check

---
 src/diffusers/models/modeling_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index ce5289e3dbfd..23cc6ce2404e 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -803,6 +803,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     subfolder=subfolder or "",
                 )
                 if hf_quantizer is not None:
+                    is_torchao_quantization_method = quantization_config.quant_method == QuantizationMethod.TORCHAO
+                    if device_map is not None and is_torchao_quantization_method:
+                        raise NotImplementedError(
+                            "Loading sharded checkpoints, while passing `device_map`, is not supported with `torchao` quantization. This will be supported in the near future."
+                        )
+
                     model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata)
                     logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.")
                     is_sharded = False

From c129428105436f225a8b9bcd3f6fdf1f5d475659 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 01:51:50 +0100
Subject: [PATCH 2/7] fix

---
 src/diffusers/models/modeling_utils.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 23cc6ce2404e..69e0f0240968 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -802,13 +802,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     revision=revision,
                     subfolder=subfolder or "",
                 )
-                if hf_quantizer is not None:
-                    is_torchao_quantization_method = quantization_config.quant_method == QuantizationMethod.TORCHAO
-                    if device_map is not None and is_torchao_quantization_method:
-                        raise NotImplementedError(
-                            "Loading sharded checkpoints, while passing `device_map`, is not supported with `torchao` quantization. This will be supported in the near future."
-                        )
-
+                if hf_quantizer is not None and is_bnb_quantization_method:
                     model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata)
                     logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.")
                     is_sharded = False

From 739601c3002ce960bb201b76dded7f189a88bf69 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 03:01:32 +0100
Subject: [PATCH 3/7] add test for sharded model

---
 tests/quantization/torchao/test_torchao.py | 71 +++++++++++++++-------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 5c71fc4e0ae7..1f15c58bc156 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -279,13 +279,14 @@ def test_int4wo_quant_bfloat16_conversion(self):
         self.assertEqual(weight.quant_max, 15)
         self.assertTrue(isinstance(weight.layout_type, TensorCoreTiledLayoutType))
 
-    def test_offload(self):
+    def test_device_map(self):
         """
-        Test if the quantized model int4 weight-only is working properly with cpu/disk offload. Also verifies
-        that the device map is correctly set (in the `hf_device_map` attribute of the model).
+        Test if the quantized model int4 weight-only is working properly with "auto" and custom device maps.
+        The custom device map performs cpu/disk offloading as well. Also verifies that the device map is
+        correctly set (in the `hf_device_map` attribute of the model).
         """
 
-        device_map_offload = {
+        custom_device_map_dict = {
             "time_text_embed": torch_device,
             "context_embedder": torch_device,
             "x_embedder": torch_device,
@@ -294,27 +295,51 @@ def test_offload(self):
             "norm_out": torch_device,
             "proj_out": "cpu",
         }
+        device_maps = ["auto", custom_device_map_dict]
 
         inputs = self.get_dummy_tensor_inputs(torch_device)
-
-        with tempfile.TemporaryDirectory() as offload_folder:
-            quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
-            quantized_model = FluxTransformer2DModel.from_pretrained(
-                "hf-internal-testing/tiny-flux-pipe",
-                subfolder="transformer",
-                quantization_config=quantization_config,
-                device_map=device_map_offload,
-                torch_dtype=torch.bfloat16,
-                offload_folder=offload_folder,
-            )
-
-            self.assertTrue(quantized_model.hf_device_map == device_map_offload)
-
-            output = quantized_model(**inputs)[0]
-            output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
-
-            expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
-            self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
+        expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
+
+        for device_map in device_maps:
+            device_map_to_compare = {"": 0} if device_map == "auto" else device_map
+
+            # Test non-sharded model
+            with tempfile.TemporaryDirectory() as offload_folder:
+                quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
+                quantized_model = FluxTransformer2DModel.from_pretrained(
+                    "hf-internal-testing/tiny-flux-pipe",
+                    subfolder="transformer",
+                    quantization_config=quantization_config,
+                    device_map=device_map,
+                    torch_dtype=torch.bfloat16,
+                    offload_folder=offload_folder,
+                )
+
+                self.assertTrue(quantized_model.hf_device_map == device_map_to_compare)
+
+                output = quantized_model(**inputs)[0]
+                output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
+                self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
+
+            # Test sharded model
+            with tempfile.TemporaryDirectory() as offload_folder:
+                quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
+                quantized_model = FluxTransformer2DModel.from_pretrained(
+                    "hf-internal-testing/tiny-flux-sharded",
+                    subfolder="transformer",
+                    quantization_config=quantization_config,
+                    device_map=device_map,
+                    torch_dtype=torch.bfloat16,
+                    offload_folder=offload_folder,
+                )
+
+                self.assertTrue(quantized_model.hf_device_map == device_map_to_compare)
+
+                output = quantized_model(**inputs)[0]
+                output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
+
+                expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
+                self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
 
     def test_modules_to_not_convert(self):
         quantization_config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])

From 9ec70f056d6c8b65d808021dd697753c2ead4bab Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 07:40:56 +0530
Subject: [PATCH 4/7] Update tests/quantization/torchao/test_torchao.py

---
 tests/quantization/torchao/test_torchao.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 16c2d1c43200..6f9980c006ac 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -337,7 +337,6 @@ def test_device_map(self):
                 output = quantized_model(**inputs)[0]
                 output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
 
-                expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
                 self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
 
     def test_modules_to_not_convert(self):

From fe447ba5536417eb014f6ad5822b5af8f87c6224 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 06:41:54 +0100
Subject: [PATCH 5/7] address review comments

---
 src/diffusers/pipelines/pipeline_utils.py | 10 +++++++++-
 src/diffusers/quantizers/torchao/utils.py | 20 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 src/diffusers/quantizers/torchao/utils.py

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index a504184ea2f2..530323df1e73 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -45,6 +45,7 @@
 from ..models.attention_processor import FusedAttnProcessor2_0
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
 from ..quantizers.bitsandbytes.utils import _check_bnb_status
+from ..quantizers.torchao.utils import _check_torchao_status
 from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
     CONFIG_NAME,
@@ -388,6 +389,7 @@ def to(self, *args, **kwargs):
         device = device or device_arg
 
         pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items())
+        pipeline_has_torchao = any(_check_torchao_status(module) for _, module in self.components.items())
 
         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
         def module_is_sequentially_offloaded(module):
@@ -411,7 +413,7 @@ def module_is_offloaded(module):
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
         if device and torch.device(device).type == "cuda":
-            if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
+            if pipeline_is_sequentially_offloaded and not (pipeline_has_bnb or pipeline_has_torchao):
                 raise ValueError(
                     "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
                 )
@@ -420,6 +422,12 @@ def module_is_offloaded(module):
                 raise ValueError(
                     "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation."
                 )
+            elif pipeline_has_torchao:
+                raise ValueError(
+                    "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `torchao`. This is not supported. There are two options on what could be done to fix this error:\n"
+                    "1. Move the individual components of the model to the desired device directly using `.to()` on each.\n"
+                    '2. Pass `device_map="balanced"` when initializing the pipeline to let `accelerate` handle the device placement.'
+                )
 
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
diff --git a/src/diffusers/quantizers/torchao/utils.py b/src/diffusers/quantizers/torchao/utils.py
new file mode 100644
index 000000000000..86cd4ae8c73e
--- /dev/null
+++ b/src/diffusers/quantizers/torchao/utils.py
@@ -0,0 +1,20 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..quantization_config import QuantizationMethod
+
+
+def _check_torchao_status(module) -> bool:
+    is_loaded_in_torchao = getattr(module, "quantization_method", None) == QuantizationMethod.TORCHAO
+    return is_loaded_in_torchao

From 05276c422c29f6eab4270f0c59e7778427c2fab9 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Thu, 19 Dec 2024 22:27:36 +0100
Subject: [PATCH 6/7] revert changes to pipeline utils

---
 src/diffusers/pipelines/pipeline_utils.py | 32 ++++++++++++++++-------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 530323df1e73..c505c5a262a3 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import enum
 import fnmatch
 import importlib
 import inspect
@@ -45,7 +46,6 @@
 from ..models.attention_processor import FusedAttnProcessor2_0
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
 from ..quantizers.bitsandbytes.utils import _check_bnb_status
-from ..quantizers.torchao.utils import _check_torchao_status
 from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
     CONFIG_NAME,
@@ -389,7 +389,6 @@ def to(self, *args, **kwargs):
         device = device or device_arg
 
         pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items())
-        pipeline_has_torchao = any(_check_torchao_status(module) for _, module in self.components.items())
 
         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
         def module_is_sequentially_offloaded(module):
@@ -413,7 +412,7 @@ def module_is_offloaded(module):
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
         if device and torch.device(device).type == "cuda":
-            if pipeline_is_sequentially_offloaded and not (pipeline_has_bnb or pipeline_has_torchao):
+            if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
                 raise ValueError(
                     "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
                 )
@@ -422,12 +421,6 @@ def module_is_offloaded(module):
                 raise ValueError(
                     "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation."
                 )
-            elif pipeline_has_torchao:
-                raise ValueError(
-                    "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `torchao`. This is not supported. There are two options on what could be done to fix this error:\n"
-                    "1. Move the individual components of the model to the desired device directly using `.to()` on each.\n"
-                    '2. Pass `device_map="balanced"` when initializing the pipeline to let `accelerate` handle the device placement.'
-                )
 
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
@@ -819,6 +812,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # in this case they are already instantiated in `kwargs`
         # extract them here
         expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        expected_types = pipeline_class._get_signature_types()
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
         passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
         init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
@@ -841,6 +835,26 @@ def load_module(name, value):
 
         init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
 
+        for key in init_dict.keys():
+            if key not in passed_class_obj:
+                continue
+            if "scheduler" in key:
+                continue
+
+            class_obj = passed_class_obj[key]
+            _expected_class_types = []
+            for expected_type in expected_types[key]:
+                if isinstance(expected_type, enum.EnumMeta):
+                    _expected_class_types.extend(expected_type.__members__.keys())
+                else:
+                    _expected_class_types.append(expected_type.__name__)
+
+            _is_valid_type = class_obj.__class__.__name__ in _expected_class_types
+            if not _is_valid_type:
+                logger.warning(
+                    f"Expected types for {key}: {_expected_class_types}, got {class_obj.__class__.__name__}."
+                )
+
         # Special case: safety_checker must be loaded separately when using `from_flax`
         if from_flax and "safety_checker" in init_dict and "safety_checker" not in passed_class_obj:
             raise NotImplementedError(

From 3822ead4e2bbcf3e6f3bf48455d83cc00d0cd946 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Fri, 20 Dec 2024 01:47:46 +0100
Subject: [PATCH 7/7] remove unused file

---
 src/diffusers/quantizers/torchao/utils.py | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 src/diffusers/quantizers/torchao/utils.py

diff --git a/src/diffusers/quantizers/torchao/utils.py b/src/diffusers/quantizers/torchao/utils.py
deleted file mode 100644
index 86cd4ae8c73e..000000000000
--- a/src/diffusers/quantizers/torchao/utils.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..quantization_config import QuantizationMethod
-
-
-def _check_torchao_status(module) -> bool:
-    is_loaded_in_torchao = getattr(module, "quantization_method", None) == QuantizationMethod.TORCHAO
-    return is_loaded_in_torchao