From 997e56cf6b20cccfa8b12b7fa14b00b97211f9a9 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Tue, 17 Dec 2024 07:00:24 +0100
Subject: [PATCH 1/7] add sharded + device_map check

---
 src/diffusers/models/modeling_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index ce5289e3dbfd..23cc6ce2404e 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -803,6 +803,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     subfolder=subfolder or "",
                 )
                 if hf_quantizer is not None:
+                    is_torchao_quantization_method = quantization_config.quant_method == QuantizationMethod.TORCHAO
+                    if device_map is not None and is_torchao_quantization_method:
+                        raise NotImplementedError(
+                            "Loading sharded checkpoints, while passing `device_map`, is not supported with `torchao` quantization. This will be supported in the near future."
+                        )
+
                     model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata)
                     logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.")
                     is_sharded = False

From c129428105436f225a8b9bcd3f6fdf1f5d475659 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 01:51:50 +0100
Subject: [PATCH 2/7] fix

---
 src/diffusers/models/modeling_utils.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 23cc6ce2404e..69e0f0240968 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -802,13 +802,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     revision=revision,
                     subfolder=subfolder or "",
                 )
-                if hf_quantizer is not None:
-                    is_torchao_quantization_method = quantization_config.quant_method == QuantizationMethod.TORCHAO
-                    if device_map is not None and is_torchao_quantization_method:
-                        raise NotImplementedError(
-                            "Loading sharded checkpoints, while passing `device_map`, is not supported with `torchao` quantization. This will be supported in the near future."
-                        )
-
+                if hf_quantizer is not None and is_bnb_quantization_method:
                     model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata)
                     logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.")
                     is_sharded = False

From 739601c3002ce960bb201b76dded7f189a88bf69 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 03:01:32 +0100
Subject: [PATCH 3/7] add test for sharded model

---
 tests/quantization/torchao/test_torchao.py | 71 +++++++++++++++-------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 5c71fc4e0ae7..1f15c58bc156 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -279,13 +279,14 @@ def test_int4wo_quant_bfloat16_conversion(self):
         self.assertEqual(weight.quant_max, 15)
         self.assertTrue(isinstance(weight.layout_type, TensorCoreTiledLayoutType))
 
-    def test_offload(self):
+    def test_device_map(self):
         """
-        Test if the quantized model int4 weight-only is working properly with cpu/disk offload. Also verifies
-        that the device map is correctly set (in the `hf_device_map` attribute of the model).
+        Test if the quantized model int4 weight-only is working properly with "auto" and custom device maps.
+        The custom device map performs cpu/disk offloading as well. Also verifies that the device map is
+        correctly set (in the `hf_device_map` attribute of the model).
         """
 
-        device_map_offload = {
+        custom_device_map_dict = {
             "time_text_embed": torch_device,
             "context_embedder": torch_device,
             "x_embedder": torch_device,
@@ -294,27 +295,51 @@ def test_offload(self):
             "norm_out": torch_device,
             "proj_out": "cpu",
         }
+        device_maps = ["auto", custom_device_map_dict]
 
         inputs = self.get_dummy_tensor_inputs(torch_device)
-
-        with tempfile.TemporaryDirectory() as offload_folder:
-            quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
-            quantized_model = FluxTransformer2DModel.from_pretrained(
-                "hf-internal-testing/tiny-flux-pipe",
-                subfolder="transformer",
-                quantization_config=quantization_config,
-                device_map=device_map_offload,
-                torch_dtype=torch.bfloat16,
-                offload_folder=offload_folder,
-            )
-
-            self.assertTrue(quantized_model.hf_device_map == device_map_offload)
-
-            output = quantized_model(**inputs)[0]
-            output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
-
-            expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
-            self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
+        expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
+
+        for device_map in device_maps:
+            device_map_to_compare = {"": 0} if device_map == "auto" else device_map
+
+            # Test non-sharded model
+            with tempfile.TemporaryDirectory() as offload_folder:
+                quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
+                quantized_model = FluxTransformer2DModel.from_pretrained(
+                    "hf-internal-testing/tiny-flux-pipe",
+                    subfolder="transformer",
+                    quantization_config=quantization_config,
+                    device_map=device_map,
+                    torch_dtype=torch.bfloat16,
+                    offload_folder=offload_folder,
+                )
+
+                self.assertTrue(quantized_model.hf_device_map == device_map_to_compare)
+
+                output = quantized_model(**inputs)[0]
+                output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
+                self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
+
+            # Test sharded model
+            with tempfile.TemporaryDirectory() as offload_folder:
+                quantization_config = TorchAoConfig("int4_weight_only", group_size=64)
+                quantized_model = FluxTransformer2DModel.from_pretrained(
+                    "hf-internal-testing/tiny-flux-sharded",
+                    subfolder="transformer",
+                    quantization_config=quantization_config,
+                    device_map=device_map,
+                    torch_dtype=torch.bfloat16,
+                    offload_folder=offload_folder,
+                )
+
+                self.assertTrue(quantized_model.hf_device_map == device_map_to_compare)
+
+                output = quantized_model(**inputs)[0]
+                output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
+
+                expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
+                self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
 
     def test_modules_to_not_convert(self):
         quantization_config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])

From 9ec70f056d6c8b65d808021dd697753c2ead4bab Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 07:40:56 +0530
Subject: [PATCH 4/7] Update tests/quantization/torchao/test_torchao.py

---
 tests/quantization/torchao/test_torchao.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 16c2d1c43200..6f9980c006ac 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -337,7 +337,6 @@ def test_device_map(self):
                 output = quantized_model(**inputs)[0]
                 output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
 
-                expected_slice = np.array([0.3457, -0.0366, 0.0105, -0.2275, -0.4941, 0.4395, -0.166, -0.6641, 0.4375])
                 self.assertTrue(np.allclose(output_slice, expected_slice, atol=1e-3, rtol=1e-3))
 
     def test_modules_to_not_convert(self):

From fe447ba5536417eb014f6ad5822b5af8f87c6224 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Wed, 18 Dec 2024 06:41:54 +0100
Subject: [PATCH 5/7] address review comments

---
 src/diffusers/pipelines/pipeline_utils.py | 10 +++++++++-
 src/diffusers/quantizers/torchao/utils.py | 20 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 src/diffusers/quantizers/torchao/utils.py

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index a504184ea2f2..530323df1e73 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -45,6 +45,7 @@
 from ..models.attention_processor import FusedAttnProcessor2_0
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
 from ..quantizers.bitsandbytes.utils import _check_bnb_status
+from ..quantizers.torchao.utils import _check_torchao_status
 from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
     CONFIG_NAME,
@@ -388,6 +389,7 @@ def to(self, *args, **kwargs):
         device = device or device_arg
 
         pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items())
+        pipeline_has_torchao = any(_check_torchao_status(module) for _, module in self.components.items())
 
         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
         def module_is_sequentially_offloaded(module):
@@ -411,7 +413,7 @@ def module_is_offloaded(module):
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
         if device and torch.device(device).type == "cuda":
-            if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
+            if pipeline_is_sequentially_offloaded and not (pipeline_has_bnb or pipeline_has_torchao):
                 raise ValueError(
                     "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
                 )
@@ -420,6 +422,12 @@ def module_is_offloaded(module):
                 raise ValueError(
                     "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation."
                 )
+            elif pipeline_has_torchao:
+                raise ValueError(
+                    "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `torchao`. This is not supported. There are two options on what could be done to fix this error:\n"
+                    "1. Move the individual components of the model to the desired device directly using `.to()` on each.\n"
+                    '2. Pass `device_map="balanced"` when initializing the pipeline to let `accelerate` handle the device placement.'
+                )
 
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
diff --git a/src/diffusers/quantizers/torchao/utils.py b/src/diffusers/quantizers/torchao/utils.py
new file mode 100644
index 000000000000..86cd4ae8c73e
--- /dev/null
+++ b/src/diffusers/quantizers/torchao/utils.py
@@ -0,0 +1,20 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..quantization_config import QuantizationMethod
+
+
+def _check_torchao_status(module) -> bool:
+    is_loaded_in_torchao = getattr(module, "quantization_method", None) == QuantizationMethod.TORCHAO
+    return is_loaded_in_torchao

From 05276c422c29f6eab4270f0c59e7778427c2fab9 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Thu, 19 Dec 2024 22:27:36 +0100
Subject: [PATCH 6/7] revert changes to pipeline utils

---
 src/diffusers/pipelines/pipeline_utils.py | 32 ++++++++++++++++-------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 530323df1e73..c505c5a262a3 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import enum
 import fnmatch
 import importlib
 import inspect
@@ -45,7 +46,6 @@
 from ..models.attention_processor import FusedAttnProcessor2_0
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
 from ..quantizers.bitsandbytes.utils import _check_bnb_status
-from ..quantizers.torchao.utils import _check_torchao_status
 from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
     CONFIG_NAME,
@@ -389,7 +389,6 @@ def to(self, *args, **kwargs):
         device = device or device_arg
 
         pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items())
-        pipeline_has_torchao = any(_check_torchao_status(module) for _, module in self.components.items())
 
         # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
         def module_is_sequentially_offloaded(module):
@@ -413,7 +412,7 @@ def module_is_offloaded(module):
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
         if device and torch.device(device).type == "cuda":
-            if pipeline_is_sequentially_offloaded and not (pipeline_has_bnb or pipeline_has_torchao):
+            if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
                 raise ValueError(
                     "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
                 )
@@ -422,12 +421,6 @@ def module_is_offloaded(module):
                 raise ValueError(
                     "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation."
                 )
-            elif pipeline_has_torchao:
-                raise ValueError(
-                    "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `torchao`. This is not supported. There are two options on what could be done to fix this error:\n"
-                    "1. Move the individual components of the model to the desired device directly using `.to()` on each.\n"
-                    '2. Pass `device_map="balanced"` when initializing the pipeline to let `accelerate` handle the device placement.'
-                )
 
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
@@ -819,6 +812,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # in this case they are already instantiated in `kwargs`
         # extract them here
         expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        expected_types = pipeline_class._get_signature_types()
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
         passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
         init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
@@ -841,6 +835,26 @@ def load_module(name, value):
 
         init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
 
+        for key in init_dict.keys():
+            if key not in passed_class_obj:
+                continue
+            if "scheduler" in key:
+                continue
+
+            class_obj = passed_class_obj[key]
+            _expected_class_types = []
+            for expected_type in expected_types[key]:
+                if isinstance(expected_type, enum.EnumMeta):
+                    _expected_class_types.extend(expected_type.__members__.keys())
+                else:
+                    _expected_class_types.append(expected_type.__name__)
+
+            _is_valid_type = class_obj.__class__.__name__ in _expected_class_types
+            if not _is_valid_type:
+                logger.warning(
+                    f"Expected types for {key}: {_expected_class_types}, got {class_obj.__class__.__name__}."
+                )
+
         # Special case: safety_checker must be loaded separately when using `from_flax`
         if from_flax and "safety_checker" in init_dict and "safety_checker" not in passed_class_obj:
             raise NotImplementedError(

From 3822ead4e2bbcf3e6f3bf48455d83cc00d0cd946 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Fri, 20 Dec 2024 01:47:46 +0100
Subject: [PATCH 7/7] remove unused file

---
 src/diffusers/quantizers/torchao/utils.py | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 src/diffusers/quantizers/torchao/utils.py

diff --git a/src/diffusers/quantizers/torchao/utils.py b/src/diffusers/quantizers/torchao/utils.py
deleted file mode 100644
index 86cd4ae8c73e..000000000000
--- a/src/diffusers/quantizers/torchao/utils.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..quantization_config import QuantizationMethod
-
-
-def _check_torchao_status(module) -> bool:
-    is_loaded_in_torchao = getattr(module, "quantization_method", None) == QuantizationMethod.TORCHAO
-    return is_loaded_in_torchao