first commit

MekkCyber · MekkCyber · commit d561f6552d26 · 2025-06-03T08:01:42.000Z
diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
@@ -473,6 +473,9 @@ jobs:
             additional_deps: []
           - backend: "optimum_quanto"
             test_location: "quanto"
             additional_deps: []
+          - backend: "finegrained_fp8"
+            test_location: "finegrained_fp8"
+            additional_deps: []
     runs-on:
       group: aws-g6e-xlarge-plus
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -174,6 +174,8 @@
     title: torchao
   - local: quantization/quanto
     title: quanto
+  - local: quantization/finegrained_fp8
+    title: finegrained_fp8
   title: Quantization Methods
 - sections:
   - local: optimization/fp16
diff --git a/docs/source/en/api/quantization.md b/docs/source/en/api/quantization.md
@@ -41,6 +41,11 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui
 
 [[autodoc]] TorchAoConfig
 
+## FinegrainedFP8Config
+
+[[autodoc]] FinegrainedFP8Config
+
 ## DiffusersQuantizer
 
 [[autodoc]] quantizers.base.DiffusersQuantizer
+
diff --git a/docs/source/en/quantization/finegrained_fp8.md b/docs/source/en/quantization/finegrained_fp8.md
@@ -0,0 +1,15 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# FinegrainedFP8
+
+## Overview
+
+## Usage
+
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
@@ -37,7 +37,8 @@ Diffusers currently supports the following quantization methods.
 - [TorchAO](./torchao)
 - [GGUF](./gguf)
 - [Quanto](./quanto.md)
+- [FinegrainedFP8](./finegrained_fp8.md)
 
 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
 
 ## Pipeline-level quantization
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -96,6 +96,8 @@
 else:
     _import_structure["quantizers.quantization_config"].append("TorchAoConfig")
 
+_import_structure["quantizers.quantization_config"].append("FinegrainedFP8Config")
+
 try:
     if not is_torch_available() and not is_accelerate_available() and not is_optimum_quanto_available():
         raise OptionalDependencyNotAvailable()
@@ -724,6 +726,8 @@
     else:
         from .quantizers.quantization_config import QuantoConfig
 
+    from .quantizers.quantization_config import FinegrainedFP8Config
+
     try:
         if not is_onnx_available():
             raise OptionalDependencyNotAvailable()
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
@@ -1238,6 +1238,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         }
 
         # Dispatch model with hooks on all devices if necessary
+        print(model.transformer_blocks[0].attn.to_q.weight)
+        print(model.transformer_blocks[0].attn.to_q.weight_scale_inv)
         if device_map is not None:
             device_map_kwargs = {
                 "device_map": device_map,
diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py
@@ -28,9 +28,11 @@
     QuantizationMethod,
     QuantoConfig,
     TorchAoConfig,
+    FinegrainedFP8Config,
 )
 from .quanto import QuantoQuantizer
 from .torchao import TorchAoHfQuantizer
+from .finegrained_fp8 import FinegrainedFP8Quantizer
 
 
 AUTO_QUANTIZER_MAPPING = {
@@ -39,6 +41,7 @@
     "gguf": GGUFQuantizer,
     "quanto": QuantoQuantizer,
     "torchao": TorchAoHfQuantizer,
+    "finegrained_fp8": FinegrainedFP8Quantizer,
 }
 
 AUTO_QUANTIZATION_CONFIG_MAPPING = {
@@ -47,6 +50,7 @@
     "gguf": GGUFQuantizationConfig,
     "quanto": QuantoConfig,
     "torchao": TorchAoConfig,
+    "finegrained_fp8": FinegrainedFP8Config,
 }
 
 
diff --git a/src/diffusers/quantizers/finegrained_fp8/__init__.py b/src/diffusers/quantizers/finegrained_fp8/__init__.py
@@ -0,0 +1 @@
+from .finegrained_fp8_quantizer import FinegrainedFP8Quantizer
diff --git a/src/diffusers/quantizers/finegrained_fp8/finegrained_fp8_quantizer.py b/src/diffusers/quantizers/finegrained_fp8/finegrained_fp8_quantizer.py
@@ -0,0 +1,206 @@
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from ...utils import is_accelerate_available, is_torch_available, logging
+from ..base import DiffusersQuantizer
+from ...utils import get_module_from_name
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+if TYPE_CHECKING:
+    from ...models.modeling_utils import ModelMixin
+
+class FinegrainedFP8Quantizer(DiffusersQuantizer):
+    """
+    Quantizer that converts the weights of `FP8Linear` modules to `torch.float8_e4m3fn` with block-wise
+    ("fine-grained") scaling.
+
+    Every weight matrix is cut into tiles of `quantization_config.weight_block_size`. Each tile is scaled so that
+    its largest magnitude maps to the FP8 maximum, and the inverse of that scale is stored next to the weight in a
+    `weight_scale_inv` buffer.
+    """
+
+    requires_parameters_quantization = True
+    requires_calibration = False
+    required_packages = ["accelerate"]
+
+    def __init__(self, quantization_config, **kwargs):
+        super().__init__(quantization_config, **kwargs)
+        self.quantization_config = quantization_config
+
+    def validate_environment(self, *args, **kwargs):
+        """
+        Check that the current environment can load or produce FP8 weights.
+
+        Raises:
+            ImportError: If torch or accelerate is not installed.
+            ValueError: If `from_tf`/`from_flax` is requested, if the CUDA device has a compute capability below
+                8.9, or if weights are quantized on the fly with a dict `device_map` containing "cpu" or "disk".
+        """
+        if not is_torch_available():
+            raise ImportError(
+                "Using fp8 quantization requires torch >= 2.1.0. "
+                "Please install the latest version of torch ( pip install --upgrade torch )"
+            )
+
+        if not is_accelerate_available():
+            raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")
+
+        if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
+            raise ValueError(
+                "Converting into FP8 weights from tf/flax weights is currently not supported, "
+                "please make sure the weights are in PyTorch format."
+            )
+
+        cuda_available = torch.cuda.is_available()
+        if cuda_available:
+            major, minor = torch.cuda.get_device_capability()
+            if (major, minor) < (8, 9):
+                raise ValueError(
+                    "FP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100)"
+                    f", actual = `{major}.{minor}`"
+                )
+
+        device_map = kwargs.get("device_map", None)
+        if device_map is None:
+            # The warning talks about an available CUDA device, so only emit it when there is one.
+            if cuda_available:
+                logger.warning_once(
+                    "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
+                    "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. "
+                )
+        elif (
+            not self.pre_quantized
+            and isinstance(device_map, dict)
+            and ("cpu" in device_map.values() or "disk" in device_map.values())
+        ):
+            raise ValueError(
+                "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device. "
+                "This is not supported when the model is quantized on the fly. "
+                "Please use a quantized checkpoint or remove the cpu/disk device from the device_map."
+            )
+
+    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+        """Return `torch_dtype`, falling back to `torch.float32` when the caller passed `None`."""
+        if torch_dtype is None:
+            logger.info("Setting torch_dtype to torch.float32 as no torch_dtype was specified in from_pretrained")
+            torch_dtype = torch.float32
+        return torch_dtype
+
+    def create_quantized_param(
+        self,
+        model: "ModelMixin",
+        param_value: "torch.Tensor",
+        param_name: str,
+        target_device: "torch.device",
+        state_dict: Dict[str, Any],
+        unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Quantize `param_value` to FP8 with one scale per block and store the result on the owning module.
+
+        The last two dimensions of `param_value` are tiled with `quantization_config.weight_block_size`; any
+        leading dimensions are kept as-is. The quantized tensor is written to the module's buffers under the
+        original tensor name, and the per-block inverse scales are written to the `weight_scale_inv` buffer with
+        shape `(*leading_dims, rows // block_size_m, cols // block_size_n)`.
+
+        Raises:
+            ValueError: If the last two dimensions are not divisible by the block sizes.
+        """
+        from accelerate.utils import set_module_tensor_to_device
+
+        set_module_tensor_to_device(model, param_name, target_device, param_value)
+
+        module, tensor_name = get_module_from_name(model, param_name)
+
+        # Representable range of the FP8 format
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_min, fp8_max = fp8_info.min, fp8_info.max
+
+        block_size_m, block_size_n = self.quantization_config.weight_block_size
+
+        rows, cols = param_value.shape[-2:]
+
+        if rows % block_size_m != 0 or cols % block_size_n != 0:
+            raise ValueError(
+                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})"
+            )
+        blocks_m, blocks_n = rows // block_size_m, cols // block_size_n
+        param_value_orig_shape = param_value.shape
+
+        # (batch, blocks_m, blocks_n, block_size_m, block_size_n): the two trailing dims span one tile
+        tiles = param_value.reshape(-1, blocks_m, block_size_m, blocks_n, block_size_n).permute(0, 1, 3, 2, 4)
+
+        # Calculate scaling factor for each block. An all-zero tile would give fp8_max / 0 = inf and then
+        # 0 * inf = NaN weights, so such tiles get a neutral scale of 1 instead.
+        max_abs = torch.amax(torch.abs(tiles), dim=(-1, -2))
+        scale = torch.where(max_abs > 0, fp8_max / max_abs, torch.ones_like(max_abs))
+
+        # Quantize the weights
+        quantized_param = torch.clamp(tiles * scale[..., None, None], min=fp8_min, max=fp8_max).to(
+            torch.float8_e4m3fn
+        )
+
+        # Undo the tiling and restore the original matrix shape
+        quantized_param = quantized_param.permute(0, 1, 3, 2, 4).reshape(param_value_orig_shape)
+
+        # Reshape explicitly rather than with a bare `squeeze()`, which would also drop block-grid dimensions of
+        # size 1 (e.g. when rows == block_size_m) and lose the 2-D block layout.
+        scale_inv = scale.reciprocal().reshape(*param_value_orig_shape[:-2], blocks_m, blocks_n)
+
+        # Load into the model
+        module._buffers[tensor_name] = quantized_param.to(target_device)
+        module._buffers["weight_scale_inv"] = scale_inv.to(target_device)
+
+    def check_if_quantized_param(
+        self,
+        model: "ModelMixin",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ):
+        """
+        Tell whether `param_name` must go through `create_quantized_param`.
+
+        Returns `True` only for non-bias tensors of `FP8Linear` modules when the checkpoint is not pre-quantized.
+
+        Raises:
+            ValueError: If a pre-quantized checkpoint provides a non-FP8 `weight`, or if a `weight_scale_inv`
+                tensor shows up while quantizing on the fly.
+        """
+        from .utils import FP8Linear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        if isinstance(module, FP8Linear):
+            if self.pre_quantized or tensor_name == "bias":
+                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
+                    raise ValueError("Expect quantized weights but got an unquantized weight")
+                return False
+            else:
+                if tensor_name == "weight_scale_inv":
+                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
+                return True
+        return False
+
+    def _process_model_before_weight_loading(
+        self,
+        model: "ModelMixin",
+        keep_in_fp32_modules: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Swap eligible layers for FP8 linear layers via `replace_with_fp8_linear` and attach the quantization
+        config to `model.config`. Modules listed in `quantization_config.modules_to_not_convert` are added to
+        `self.modules_to_not_convert` first.
+        """
+        from .utils import replace_with_fp8_linear
+
+        if self.quantization_config.modules_to_not_convert is not None:
+            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)
+
+        model = replace_with_fp8_linear(
+            model,
+            modules_to_not_convert=self.modules_to_not_convert,
+            quantization_config=self.quantization_config,
+        )
+
+        model.config.quantization_config = self.quantization_config
+
+    def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs):
+        """Return `model` unchanged; nothing is done after the weights are loaded."""
+        return model
+
+    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
+        """
+        Drop from `missing_keys` the entries that belong to `FP8Linear` modules and are neither a `.weight` nor a
+        `.bias`.
+
+        A key is attributed to a module when the module name is a substring of the key or of `f"{prefix}.{key}"`.
+        The order of the remaining keys is preserved.
+        """
+        from .utils import FP8Linear
+
+        fp8_module_names = [name for name, module in model.named_modules() if isinstance(module, FP8Linear)]
+
+        not_missing_keys = set()
+        for missing in missing_keys:
+            if missing.endswith((".weight", ".bias")):
+                continue
+            prefixed = f"{prefix}.{missing}"
+            if any(name in missing or name in prefixed for name in fp8_module_names):
+                not_missing_keys.add(missing)
+        return [k for k in missing_keys if k not in not_missing_keys]
+
+    def is_serializable(self, safe_serialization=None):
+        """Always `True`, regardless of `safe_serialization`."""
+        return True
+
+    @property
+    def is_trainable(self) -> bool:
+        """Always `False`."""
+        return False
+
+    def get_cuda_warm_up_factor(self):
+        """Return the constant factor 2."""
+        # Pre-processing is done cleanly, so we can allocate everything here
+        return 2
diff --git a/src/diffusers/quantizers/finegrained_fp8/utils.py b/src/diffusers/quantizers/finegrained_fp8/utils.py
diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py

Original file line number	Diff line number	Diff line change
`@@ -1238,6 +1238,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P`
`1238`	`1238`	`}`
`1239`	`1239`
`1240`	`1240`	`# Dispatch model with hooks on all devices if necessary`
	`1241`	`+ print(model.transformer_blocks[0].attn.to_q.weight)`
	`1242`	`+ print(model.transformer_blocks[0].attn.to_q.weight_scale_inv)`
`1241`	`1243`	`if device_map is not None:`
`1242`	`1244`	`device_map_kwargs = {`
`1243`	`1245`	`"device_map": device_map,`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .finegrained_fp8_quantizer import FinegrainedFP8Quantizer`