
Commit 748a002

update
1 parent ee084a5 commit 748a002

File tree

7 files changed (+67, -27 lines)

docs/source/en/api/quantization.md

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui

[[autodoc]] BitsAndBytesConfig

+## TorchAoConfig
+
+[[autodoc]] TorchAoConfig
+
## DiffusersQuantizer

[[autodoc]] quantizers.base.DiffusersQuantizer

docs/source/en/quantization/overview.md

Lines changed: 1 addition & 1 deletion
@@ -32,4 +32,4 @@ If you are new to the quantization field, we recommend you to check out these be

## When to use what?

-This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
+This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes` and `torchao`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# torchao
+
+[TorchAO](https://github.com/pytorch/ao) is an architecture optimization library for PyTorch. It provides high-performance dtypes, optimization techniques, and kernels for inference and training, and it composes with native PyTorch features such as `torch.compile` and FSDP. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks).
+
+Before you begin, make sure you have PyTorch 2.5 or above and TorchAO installed:
+
+```bash
+pip install -U torch torchao
+```
+
+## Usage
+
+Now you can quantize a model by passing a [`TorchAoConfig`] to [`~ModelMixin.from_pretrained`]. This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
+
+## Resources
+
+- [TorchAO Quantization API]()
+- [Diffusers-TorchAO examples](https://github.com/sayakpaul/diffusers-torchao)
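The usage described in the new doc page above might look like the following in practice. This is a minimal sketch rather than part of the commit: the Flux checkpoint and the `"int8wo"` type string are illustrative choices, and any model with `torch.nn.Linear` layers plus any type key registered in `quantization_config.py` should work the same way.

```python
import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig

# "int8wo" (int8 weight-only) is one of the type strings registered in
# TorchAoConfig's quantization table; see quantization_config.py below.
quantization_config = TorchAoConfig("int8wo")

# Quantization is applied while the checkpoint is loaded. The model id and
# subfolder here are illustrative assumptions for this sketch.
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
```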

src/diffusers/models/model_loading_utils.py

Lines changed: 0 additions & 1 deletion
@@ -25,7 +25,6 @@
import torch
from huggingface_hub.utils import EntryNotFoundError

-from ..quantizers.quantization_config import QuantizationMethod
from ..utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFETENSORS_FILE_EXTENSION,

src/diffusers/models/modeling_utils.py

Lines changed: 0 additions & 3 deletions
@@ -829,9 +829,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
        if device_map is None and not is_sharded:
            # `torch.cuda.current_device()` is fine here when `hf_quantizer` is not None.
            # It would error out during the `validate_environment()` call above in the absence of cuda.
-            is_quant_method_bnb = (
-                getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
-            )
            if hf_quantizer is None:
                param_device = "cpu"
                # TODO (sayakpaul, SunMarc): remove this after model loading refactor

src/diffusers/quantizers/auto.py

Lines changed: 1 addition & 1 deletion
@@ -19,8 +19,8 @@
from typing import Dict, Optional, Union

from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer
-from .torchao import TorchAoHfQuantizer
from .quantization_config import BitsAndBytesConfig, QuantizationConfigMixin, QuantizationMethod, TorchAoConfig
+from .torchao import TorchAoHfQuantizer


AUTO_QUANTIZER_MAPPING = {

src/diffusers/quantizers/quantization_config.py

Lines changed: 30 additions & 21 deletions
@@ -484,32 +484,25 @@ def _get_torchao_quant_type_to_method(cls):
            "int4": int4_weight_only,
            "int4wo": int4_weight_only,
            "int4_weight_only": int4_weight_only,
-            "int4_a16w4": int4_weight_only,
            # int4 weight + int8 activation
            "int4dq": int8_dynamic_activation_int4_weight,
            "int8_dynamic_activation_int4_weight": int8_dynamic_activation_int4_weight,
-            "int4_a8w4": int8_dynamic_activation_int4_weight,
        }

        INT8_QUANTIZATION_TYPES = {
            # int8 weight + bfloat16/float16 activation
            "int8": int8_weight_only,
            "int8wo": int8_weight_only,
            "int8_weight_only": int8_weight_only,
-            "int8_a16w8": int8_weight_only,
            # int8 weight + int8 activation
            "int8dq": int8_dynamic_activation_int8_weight,
            "int8_dynamic_activation_int8_weight": int8_dynamic_activation_int8_weight,
-            "int8_a8w8": int8_dynamic_activation_int8_weight,
        }

        def generate_float8dq_types(dtype: torch.dtype):
            name = "e5m2" if dtype == torch.float8_e5m2 else "e4m3"
            types = {}

-            types[f"float8dq_{name}_a8w8"] = partial(
-                float8_dynamic_activation_float8_weight, activation_dtype=dtype, weight_dtype=dtype
-            )
            for activation_granularity_cls in [PerTensor, PerRow]:
                for weight_granularity_cls in [PerTensor, PerRow]:
                    activation_name = "t" if activation_granularity_cls is PerTensor else "r"

@@ -526,22 +519,15 @@ def generate_fpx_quantization_types(bits: int):
                        weight_dtype=dtype,
                        granularity=(activation_granularity_cls(), weight_granularity_cls()),
                    )
-                    types[f"float8dq_{name}_a{activation_name}w{weight_name}_a8w8"] = partial(
-                        float8_dynamic_activation_float8_weight,
-                        activation_dtype=dtype,
-                        weight_dtype=dtype,
-                        granularity=(activation_granularity_cls(), weight_granularity_cls()),
-                    )

            return types

        def generate_fpx_quantization_types(bits: int):
            types = {}

-            for ebits in range(1, bits):
+            for ebits in range(0, bits):
                mbits = bits - ebits - 1
                types[f"fp{bits}_e{ebits}m{mbits}"] = partial(fpx_weight_only, ebits=ebits, mbits=mbits)
-                types[f"fp{bits}_e{ebits}m{mbits}_a16w{bits}"] = partial(fpx_weight_only, ebits=ebits, mbits=mbits)

            non_sign_bits = bits - 1
            default_ebits = (non_sign_bits + 1) // 2

@@ -550,20 +536,17 @@ def generate_fpx_quantization_types(bits: int):

            return types

-        # TODO(aryan): handle cuda capability and torch 2.2/2.3
+        # TODO(aryan): handle torch 2.2/2.3
        FLOATX_QUANTIZATION_TYPES = {
            # float8_e5m2 weight + bfloat16/float16 activation
            "float8": float8_weight_only,
            "float8_weight_only": float8_weight_only,
            "float8wo": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
-            "float8_a16w8": float8_weight_only,
            "float8_e5m2": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
            "float8wo_e5m2": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
-            "float8_e5m2_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
            # float8_e4m3 weight + bfloat16/float16 activation
            "float8_e4m3": partial(float8_weight_only, weight_dtype=torch.float8_e4m3fn),
            "float8wo_e4m3": partial(float8_weight_only, weight_dtype=torch.float8_e4m3fn),
-            "float8wo_e4m3_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e4m3fn),
            # float8_e5m2 weight + float8 activation (dynamic)
            "float8_dynamic_activation_float8_weight": float8_dynamic_activation_float8_weight,
            "float8dq": float8_dynamic_activation_float8_weight,

@@ -572,7 +555,6 @@ def generate_fpx_quantization_types(bits: int):
                activation_dtype=torch.float8_e5m2,
                weight_dtype=torch.float8_e5m2,
            ),
-            "float8_a8w8": float8_dynamic_activation_float8_weight,
            **generate_float8dq_types(torch.float8_e5m2),
            # float8_e4m3 weight + float8 activation (dynamic)
            "float8dq_e4m3": partial(

@@ -609,7 +591,6 @@ def generate_uintx_quantization_types(bits: int):
            types = {}
            types[f"uint{bits}"] = partial(uintx_weight_only, dtype=UINTX_TO_DTYPE[bits])
            types[f"uint{bits}wo"] = partial(uintx_weight_only, dtype=UINTX_TO_DTYPE[bits])
-            types[f"uint{bits}_a16w{bits}"] = partial(uintx_weight_only, dtype=UINTX_TO_DTYPE[bits])
            return types

        UINTX_QUANTIZATION_DTYPES = {

@@ -625,13 +606,41 @@ def generate_uintx_quantization_types(bits: int):
            **generate_uintx_quantization_types(8),
        }

+        SHORTHAND_QUANTIZATION_TYPES = {
+            "int_a16w4": int4_weight_only,
+            "int_a8w4": int8_dynamic_activation_int4_weight,
+            "int_a16w8": int8_weight_only,
+            "int_a8w8": int8_dynamic_activation_int8_weight,
+            "uint_a16w1": partial(uintx_weight_only, dtype=torch.uint1),
+            "uint_a16w2": partial(uintx_weight_only, dtype=torch.uint2),
+            "uint_a16w3": partial(uintx_weight_only, dtype=torch.uint3),
+            "uint_a16w4": partial(uintx_weight_only, dtype=torch.uint4),
+            "uint_a16w5": partial(uintx_weight_only, dtype=torch.uint5),
+            "uint_a16w6": partial(uintx_weight_only, dtype=torch.uint6),
+            "uint_a16w7": partial(uintx_weight_only, dtype=torch.uint7),
+            "uint_a16w8": partial(uintx_weight_only, dtype=torch.uint8),
+        }
+        SHORTHAND_FLOAT_QUANTIZATION_TYPES = {
+            "float_e5m2_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
+            "float_e4m3_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e4m3fn),
+            "float_a8w8": float8_dynamic_activation_float8_weight,
+            "float_a16w3": partial(fpx_weight_only, ebits=2, mbits=0),
+            "float_a16w4": partial(fpx_weight_only, ebits=2, mbits=1),
+            "float_a16w5": partial(fpx_weight_only, ebits=3, mbits=1),
+            "float_a16w6": partial(fpx_weight_only, ebits=3, mbits=2),
+            "float_a16w7": partial(fpx_weight_only, ebits=4, mbits=2),
+            "float_a16w8": partial(fpx_weight_only, ebits=5, mbits=2),
+        }
+
        QUANTIZATION_TYPES = {}
        QUANTIZATION_TYPES.update(INT4_QUANTIZATION_TYPES)
        QUANTIZATION_TYPES.update(INT8_QUANTIZATION_TYPES)
        QUANTIZATION_TYPES.update(UINTX_QUANTIZATION_DTYPES)
+        QUANTIZATION_TYPES.update(SHORTHAND_QUANTIZATION_TYPES)

        if cls._is_cuda_capability_atleast_8_9():
            QUANTIZATION_TYPES.update(FLOATX_QUANTIZATION_TYPES)
+            QUANTIZATION_TYPES.update(SHORTHAND_FLOAT_QUANTIZATION_TYPES)

            return QUANTIZATION_TYPES
        else:
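The new `SHORTHAND_QUANTIZATION_TYPES` and `SHORTHAND_FLOAT_QUANTIZATION_TYPES` tables encode a scheme as `a<activation bits>w<weight bits>`. Assuming these keys are passed to [`TorchAoConfig`] the same way as the longhand names (they are merged into the same `QUANTIZATION_TYPES` mapping above), selecting one is a one-liner; a sketch:

```python
from diffusers import TorchAoConfig

# Shorthand keys from the table above; aNN = activation bits, wNN = weight bits.
#   "int_a16w4"   -> int4_weight_only (16-bit activations, int4 weights)
#   "int_a8w8"    -> int8_dynamic_activation_int8_weight
#   "float_a16w6" -> fpx_weight_only(ebits=3, mbits=2); the float_* keys are only
#                    registered on GPUs with CUDA capability >= 8.9
quantization_config = TorchAoConfig("int_a8w8")
```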

0 commit comments
