Commit 7d9d1dc

committed
address review comments
1 parent b227189 commit 7d9d1dc

File tree

3 files changed, +14 -61 lines changed


docs/source/en/quantization/torchao.md

Lines changed: 7 additions & 22 deletions
@@ -45,7 +45,7 @@ pipe = FluxPipeline.from_pretrained(
 pipe.to("cuda")
 
 prompt = "A cat holding a sign that says hello world"
-image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
+image = pipe(prompt, num_inference_steps=28, guidance_scale=0.0).images[0]
 image.save("output.png")
 ```
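The edited snippet is the quantized Flux inference example in the torchao docs; the step count is raised from 4 to 28, presumably because the example targets FLUX.1-dev rather than the 4-step schnell variant. Below is a minimal sketch of how the full example could read after this change; the checkpoint id, the `int8wo` quant type, and the quantized-transformer setup are assumptions drawn from the surrounding docs, not part of this diff.

```python
# Minimal sketch of the surrounding docs example after this change.
# Assumptions (not shown in this diff): FLUX.1-dev as the checkpoint and
# `int8wo` as the torchao quant type applied to the transformer.
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig

model_id = "black-forest-labs/FLUX.1-dev"  # assumed checkpoint
quantization_config = TorchAoConfig("int8wo")

transformer = FluxTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(
    model_id, transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.to("cuda")

prompt = "A cat holding a sign that says hello world"
image = pipe(prompt, num_inference_steps=28, guidance_scale=0.0).images[0]
image.save("output.png")
```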

@@ -75,27 +75,12 @@ Dynamic activation quantization stores the model weights in a low-bit dtype, whi
 
 The quantization methods supported are as follows:
 
-- **Integer quantization:**
-  - Full function names: `int4_weight_only`, `int8_dynamic_activation_int4_weight`, `int8_weight_only`, `int8_dynamic_activation_int8_weight`
-  - Shorthands: `int4wo`, `int4dq`, `int8wo`, `int8dq`
-  - Documentation shorthands/Common speak: `int_a16w4`, `int_a8w4`, `int_a16w8`, `int_a8w8`
-
-- **Floating point 8-bit quantization:**
-  - Full function names: `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight`
-  - Shorthands: `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8_e4m3_tensor`, `float8_e4m3_row`, `float8sq`
-  - Documentation shorthands/Common speak: `float8_e5m2_a16w8`, `float8_e4m3_a16w8`, `float_a8w8`, `float_a16w8`
-
-- **Floating point X-bit quantization:**
-  - Full function names: `fpx_weight_only`
-  - Shorthands: `fpX_eAwB`, where `X` is the number of bits (between `1` to `7`), `A` is the number of exponent bits and `B` is the number of mantissa bits. The constraint of `X == A + B + 1` must be satisfied for a given shorthand notation.
-  - Documentation shorthands/Common speak: `float_a16w3`, `float_a16w4`, `float_a16w5`, `float_a16w6`, `float_a16w7`, `float_a16w8`
-
-- **Unsigned Integer quantization:**
-  - Full function names: `uintx_weight_only`
-  - Shorthands: `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo`
-  - Documentation shorthands/Common speak: `uint_a16w1`, `uint_a16w2`, `uint_a16w3`, `uint_a16w4`, `uint_a16w5`, `uint_a16w6`, `uint_a16w7`
-
-The "Documentation shorthands/Common speak" refers to the underlying storage dtype with the number of bits for storing activations and weights, respectively. For example, int_a16w8 stores the activations in 16-bit and the weights in 8-bit.
+| **Category** | **Full Function Names** | **Shorthands** |
+|--------------|-------------------------|----------------|
+| **Integer quantization** | `int4_weight_only`, `int8_dynamic_activation_int4_weight`, `int8_weight_only`, `int8_dynamic_activation_int8_weight` | `int4wo`, `int4dq`, `int8wo`, `int8dq` |
+| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8_e4m3_tensor`, `float8_e4m3_row` |
+| **Floating point X-bit quantization** | `fpx_weight_only` | `fpX_eAwB` where `X` is the number of bits (1-7), `A` is exponent bits, and `B` is mantissa bits. Constraint: `X == A + B + 1` |
+| **Unsigned Integer quantization** | `uintx_weight_only` | `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo` |
 
 Some quantization methods are aliases (for example, `int8wo` is the commonly used shorthand for `int8_weight_only`). This allows using the quantization methods described in the torchao docs as-is, while also making it convenient to remember their shorthand notations.
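The table above is a presentation change only; full function names and their shorthands stay interchangeable when building a config. A minimal usage sketch (the specific quant types shown are illustrative):

```python
# Sketch: a full torchao function name and its shorthand alias request the same
# quantization method when constructing the config.
from diffusers import TorchAoConfig

config_full = TorchAoConfig("int8_weight_only")  # full function name
config_short = TorchAoConfig("int8wo")           # shorthand alias
# Pass either object as `quantization_config=` to a supported model's
# `from_pretrained` call; both select int8 weight-only quantization.
```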

src/diffusers/quantizers/quantization_config.py

Lines changed: 1 addition & 38 deletions
@@ -405,29 +405,22 @@ class TorchAoConfig(QuantizationConfigMixin):
                 - Full function names: `int4_weight_only`, `int8_dynamic_activation_int4_weight`,
                   `int8_weight_only`, `int8_dynamic_activation_int8_weight`
                 - Shorthands: `int4wo`, `int4dq`, `int8wo`, `int8dq`
-                - Documentation shorthands/Common speak: `int_a16w4`, `int_a8w4`, `int_a16w8`, `int_a8w8`
 
             - **Floating point 8-bit quantization:**
                 - Full function names: `float8_weight_only`, `float8_dynamic_activation_float8_weight`,
                   `float8_static_activation_float8_weight`
                 - Shorthands: `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`,
-                  `float8_e4m3_tensor`, `float8_e4m3_row`, `float8sq`
-                - Documentation shorthands/Common speak: `float8_e5m2_a16w8`, `float8_e4m3_a16w8`, `float_a8w8`,
-                  `float_a16w8`
+                  `float8_e4m3_tensor`, `float8_e4m3_row`,
 
             - **Floating point X-bit quantization:**
                 - Full function names: `fpx_weight_only`
                 - Shorthands: `fpX_eAwB`, where `X` is the number of bits (between `1` to `7`), `A` is the number
                   of exponent bits and `B` is the number of mantissa bits. The constraint of `X == A + B + 1` must
                   be satisfied for a given shorthand notation.
-                - Documentation shorthands/Common speak: `float_a16w3`, `float_a16w4`, `float_a16w5`,
-                  `float_a16w6`, `float_a16w7`, `float_a16w8`
 
             - **Unsigned Integer quantization:**
                 - Full function names: `uintx_weight_only`
                 - Shorthands: `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo`
-                - Documentation shorthands/Common speak: `uint_a16w1`, `uint_a16w2`, `uint_a16w3`, `uint_a16w4`,
-                  `uint_a16w5`, `uint_a16w6`, `uint_a16w7`
         modules_to_not_convert (`List[str]`, *optional*, default to `None`):
             The list of modules to not quantize, useful for quantizing models that explicitly require to have some
             modules left in their original precision.
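The docstring hunk above ends at the `modules_to_not_convert` argument; a minimal sketch of passing it (the quant type and module name are illustrative, not taken from this diff):

```python
# Sketch: quantize with int8 weight-only while leaving a named module in its
# original precision. "proj_out" is a hypothetical module name.
from diffusers import TorchAoConfig

quantization_config = TorchAoConfig(
    quant_type="int8wo",
    modules_to_not_convert=["proj_out"],
)
```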
@@ -584,7 +577,6 @@ def generate_fpx_quantization_types(bits: int):
             **generate_float8dq_types(torch.float8_e4m3fn),
             # float8 weight + float8 activation (static)
             "float8_static_activation_float8_weight": float8_static_activation_float8_weight,
-            "float8sq": float8_static_activation_float8_weight,
             # For fpx, only x <= 8 is supported by default. Other dtypes can be explored by users directly
             # fpx weight + bfloat16/float16 activation
             **generate_fpx_quantization_types(3),
@@ -606,42 +598,13 @@ def generate_fpx_quantization_types(bits: int):
             # "uint8wo": partial(uintx_weight_only, dtype=torch.uint8), # uint8 quantization is not supported
         }
 
-        SHORTHAND_QUANTIZATION_TYPES = {
-            "int_a16w4": int4_weight_only,
-            "int_a8w4": int8_dynamic_activation_int4_weight,
-            "int_a16w8": int8_weight_only,
-            "int_a8w8": int8_dynamic_activation_int8_weight,
-            "uint_a16w1": partial(uintx_weight_only, dtype=torch.uint1),
-            "uint_a16w2": partial(uintx_weight_only, dtype=torch.uint2),
-            "uint_a16w3": partial(uintx_weight_only, dtype=torch.uint3),
-            "uint_a16w4": partial(uintx_weight_only, dtype=torch.uint4),
-            "uint_a16w5": partial(uintx_weight_only, dtype=torch.uint5),
-            "uint_a16w6": partial(uintx_weight_only, dtype=torch.uint6),
-            "uint_a16w7": partial(uintx_weight_only, dtype=torch.uint7),
-            # "uint_a16w8": partial(uintx_weight_only, dtype=torch.uint8), # uint8 quantization is not supported
-        }
-
-        SHORTHAND_FLOAT_QUANTIZATION_TYPES = {
-            "float_e5m2_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
-            "float_e4m3_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e4m3fn),
-            "float_a8w8": float8_dynamic_activation_float8_weight,
-            "float_a16w3": partial(fpx_weight_only, ebits=2, mbits=0),
-            "float_a16w4": partial(fpx_weight_only, ebits=2, mbits=1),
-            "float_a16w5": partial(fpx_weight_only, ebits=3, mbits=1),
-            "float_a16w6": partial(fpx_weight_only, ebits=3, mbits=2),
-            "float_a16w7": partial(fpx_weight_only, ebits=4, mbits=2),
-            "float_a16w8": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
-        }
-
         QUANTIZATION_TYPES = {}
         QUANTIZATION_TYPES.update(INT4_QUANTIZATION_TYPES)
         QUANTIZATION_TYPES.update(INT8_QUANTIZATION_TYPES)
         QUANTIZATION_TYPES.update(UINTX_QUANTIZATION_DTYPES)
-        QUANTIZATION_TYPES.update(SHORTHAND_QUANTIZATION_TYPES)
 
         if cls._is_cuda_capability_atleast_8_9():
             QUANTIZATION_TYPES.update(FLOATX_QUANTIZATION_TYPES)
-            QUANTIZATION_TYPES.update(SHORTHAND_FLOAT_QUANTIZATION_TYPES)
 
         return QUANTIZATION_TYPES
     else:
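With the `float8sq` alias and the `SHORTHAND_*` tables removed, the floating point X-bit shorthands still follow the `fpX_eAwB` pattern under the `X == A + B + 1` constraint documented above. The helper below is not the library's implementation, only a sketch of enumerating names that satisfy that constraint:

```python
# Sketch (not diffusers' code): list fpX_eAwB shorthand names that satisfy
# X == A + B + 1, where A is exponent bits, B is mantissa bits, and the +1
# accounts for the sign bit.
def fpx_shorthands(total_bits: int) -> list:
    names = []
    for ebits in range(1, total_bits):
        mbits = total_bits - ebits - 1
        names.append(f"fp{total_bits}_e{ebits}w{mbits}")
    return names

print(fpx_shorthands(6))
# ['fp6_e1w4', 'fp6_e2w3', 'fp6_e3w2', 'fp6_e4w1', 'fp6_e5w0']
```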

tests/quantization/torchao/test_torchao.py

Lines changed: 6 additions & 1 deletion
@@ -33,6 +33,7 @@
     enable_full_determinism,
     is_torch_available,
     is_torchao_available,
+    nightly,
     require_torch,
     require_torch_gpu,
     require_torchao_version_greater,
@@ -280,7 +281,8 @@ def test_int4wo_quant_bfloat16_conversion(self):
 
     def test_offload(self):
         """
-        Test if the quantized model int4 weight-only is working properly with cpu/disk offload.
+        Test if the quantized model int4 weight-only is working properly with cpu/disk offload. Also verifies
+        that the device map is correctly set (in the `hf_device_map` attribute of the model).
         """
 
         device_map_offload = {
@@ -306,6 +308,8 @@ def test_offload(self):
             offload_folder=offload_folder,
         )
 
+        self.assertTrue(quantized_model.hf_device_map == device_map_offload)
+
         output = quantized_model(**inputs)[0]
         output_slice = output.flatten()[-9:].detach().float().cpu().numpy()
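The strengthened `test_offload` now also asserts that the device map used for cpu/disk offload is reflected on the loaded model. A minimal sketch of the same check (the checkpoint, quant type, and single-entry device map below are illustrative placeholders, not the test's fixtures, which use int4 weight-only and a per-module map):

```python
# Sketch: load a torchao-quantized model with an explicit device map and check
# that the resolved map is exposed on `hf_device_map`. Checkpoint, quant type,
# and the trivial all-CPU map are assumptions for illustration only.
import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig

device_map_offload = {"": "cpu"}  # hypothetical: place the whole model on CPU

quantized_model = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=TorchAoConfig("int8wo"),
    device_map=device_map_offload,
    torch_dtype=torch.bfloat16,
)
assert quantized_model.hf_device_map == device_map_offload
```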

@@ -539,6 +543,7 @@ class TorchAoSerializationINTA16W8CPUTest(TorchAoSerializationTest):
 @require_torch_gpu
 @require_torchao_version_greater("0.6.0")
 @slow
+@nightly
 class SlowTorchAoTests(unittest.TestCase):
     def tearDown(self):
         gc.collect()
