from typing import TYPE_CHECKING, Any, Dict, List, Union

from ...utils import (
    get_module_from_name,
    is_accelerate_available,
    is_nunchaku_available,
    is_torch_available,
    logging,
)
from ...utils.import_utils import is_nunchaku_version
from ...utils.torch_utils import is_fp8_available
from ..base import DiffusersQuantizer


if TYPE_CHECKING:
    from ...models.modeling_utils import ModelMixin


if is_torch_available():
    import torch

if is_nunchaku_available():
    from .utils import replace_with_nunchaku_linear

logger = logging.get_logger(__name__)


class NunchakuQuantizer(DiffusersQuantizer):
    r"""
    Diffusers Quantizer for Nunchaku.
    """

    use_keep_in_fp32_modules = True
    requires_calibration = False
    required_packages = ["nunchaku", "accelerate"]

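    # Map nunchaku precision names to the torch dtypes used to represent the quantized weights
    # (INT4 weights are stored in int8 tensors; NVFP4 uses float8 when available).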
    dtype_map = {"int4": torch.int8}
    if is_fp8_available():
        dtype_map["nvfp4"] = torch.float8_e4m3fn

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

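    # Fail fast before touching any weights: nunchaku needs a CUDA device and recent enough
    # `nunchaku` / `accelerate` installations.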
    def validate_environment(self, *args, **kwargs):
        if not torch.cuda.is_available():
            raise RuntimeError("No GPU found. A GPU is needed for nunchaku quantization.")

        if not is_nunchaku_available():
            raise ImportError(
                "Loading a nunchaku quantized model requires the nunchaku library (follow https://nunchaku.tech/docs/nunchaku/installation/installation.html)."
            )
        if not is_nunchaku_version(">=", "0.3.1"):
            raise ImportError(
                "Loading a nunchaku quantized model requires `nunchaku>=0.3.1`. "
                "Please upgrade your installation by following https://nunchaku.tech/docs/nunchaku/installation/installation.html."
            )

        if not is_accelerate_available():
            raise ImportError(
                "Loading a nunchaku quantized model requires the accelerate library (`pip install accelerate`)."
            )

        # TODO: decide whether multi-GPU or CPU/disk-offload `device_map`s should be rejected here.
        # device_map = kwargs.get("device_map", None)
        # if isinstance(device_map, dict) and len(device_map.keys()) > 1:
        #     raise ValueError(
        #         "`device_map` for multi-GPU inference or CPU/disk offload is currently not supported with Diffusers and the nunchaku backend"
        #     )

    def check_if_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        # Import locally: nunchaku imports diffusers internally, so a module-level import would
        # create a circular import.
        from nunchaku.models.linear import SVDQW4A4Linear

        module, tensor_name = get_module_from_name(model, param_name)
        if self.pre_quantized and isinstance(module, SVDQW4A4Linear):
            return True

        return False

    def create_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        *args,
        **kwargs,
    ):
        """
        Create a quantized parameter: either load a pre-quantized tensor into an existing nunchaku
        layer, or convert the owning `nn.Linear` into an `SVDQW4A4Linear` on the fly.
        """
        from nunchaku.models.linear import SVDQW4A4Linear

        module, tensor_name = get_module_from_name(model, param_name)
        if tensor_name not in module._parameters and tensor_name not in module._buffers:
            raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")

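        # Pre-quantized checkpoints already contain nunchaku-packed tensors, so the value only
        # needs to be moved onto the target device.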
        if self.pre_quantized:
            if tensor_name in module._parameters:
                # `requires_grad=False`: the packed quantized tensors are not trainable and may
                # not be of a floating-point dtype.
                module._parameters[tensor_name] = torch.nn.Parameter(
                    param_value.to(device=target_device), requires_grad=False
                )
            if tensor_name in module._buffers:
                # Buffers must stay plain tensors, not `nn.Parameter`s.
                module._buffers[tensor_name] = param_value.to(device=target_device)

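        # A plain `nn.Linear` being quantized on the fly: load the value onto the target device,
        # then swap the module for its nunchaku counterpart.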
        elif isinstance(module, torch.nn.Linear):
            if tensor_name in module._parameters:
                module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device))
            if tensor_name in module._buffers:
                module._buffers[tensor_name] = param_value.to(device=target_device)

            # Replace the module on its parent rather than `setattr(model, param_name, ...)`,
            # which would not resolve a dotted parameter name.
            module_name = param_name.rpartition(".")[0]
            parent_name, _, child_name = module_name.rpartition(".")
            parent_module = model.get_submodule(parent_name) if parent_name else model
            setattr(parent_module, child_name, SVDQW4A4Linear.from_linear(module))

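    # Leave roughly 10% of each device's reported memory unused when accelerate computes the
    # device map.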
    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
        max_memory = {key: val * 0.90 for key, val in max_memory.items()}
        return max_memory

    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
        precision = self.quantization_config.precision
        expected_target_dtypes = [torch.int8]
        if is_fp8_available():
            expected_target_dtypes.append(torch.float8_e4m3fn)
        if target_dtype not in expected_target_dtypes:
            new_target_dtype = self.dtype_map[precision]

            logger.info(f"target_dtype {target_dtype} is replaced by {new_target_dtype} for `nunchaku` quantization")
            return new_target_dtype
        else:
            raise ValueError(f"Wrong `target_dtype` ({target_dtype}) provided.")

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            # Default to bfloat16, which `nunchaku` expects for the remaining non-quantized layers.
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to "
                "requirements of `nunchaku` to enable model loading in 4-bit. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass"
                " torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
            torch_dtype = torch.bfloat16
        return torch_dtype

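    # Swap eligible `nn.Linear` modules for nunchaku linear layers before the checkpoint weights
    # are loaded; modules listed in `modules_to_not_convert` or kept in fp32 are left untouched.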
    def _process_model_before_weight_loading(
        self,
        model: "ModelMixin",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        # TODO: deal with `device_map`
        self.modules_to_not_convert = self.quantization_config.modules_to_not_convert

        if not isinstance(self.modules_to_not_convert, list):
            self.modules_to_not_convert = [self.modules_to_not_convert]

        self.modules_to_not_convert.extend(keep_in_fp32_modules)

        model = replace_with_nunchaku_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
        )
        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    # @property
    # def is_serializable(self):
    #     return True