
Commit 7d7c74c

Merge branch 'main' into main
2 parents 3e53783 + 6a07ffe commit 7d7c74c


6 files changed: +638 -16 lines changed


.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -355,7 +355,7 @@ jobs:
           pypi_index: "https://download.pytorch.org/whl/cu128"
         - cuda_version: "12.9.1"
           torch_version: "2.8.0"
-          pypi_index: "https://download.pytorch.org/whl/test/cu129"
+          pypi_index: "https://download.pytorch.org/whl/cu129"


   # Linux L40S runners

bitsandbytes/nn/parametrize.py

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
from functools import partial
from typing import Any, Literal, Optional

import torch
import torch.nn as nn
import torch.nn.utils.parametrize as P

from .. import functional as F


class Bnb4bitParametrization(nn.Module):
    """
    A parametrization module that handles dequantization of a 4-bit quantized parameter.

    The parameter data is expected to be already quantized when this parametrization is applied.
    This module will dequantize the parameter data to its original floating-point representation
    when the forward method is called (i.e. when the parameter is accessed).

    Args:
        quant_state (`F.QuantState`):
            The quantization state containing the necessary information for dequantization.
    """

    def __init__(self, quant_state: F.QuantState):
        super().__init__()
        self.quant_state = quant_state

    @torch.no_grad()
    def forward(self, quantized_param: torch.Tensor) -> torch.Tensor:
        """
        Forward pass to dequantize the parameter.

        Args:
            quantized_param (`torch.Tensor`): The quantized parameter tensor (from .original)

        Returns:
            `torch.Tensor`: The dequantized parameter tensor in the original shape and dtype.
        """
        return F.dequantize_4bit(quantized_param, self.quant_state)


def replace_parameter_4bit_prequantized(
    module: nn.Module, param_name: str, qs_dict: dict[str, Any], device: torch.device
):
    if not hasattr(module, param_name):
        raise AttributeError(f"Module does not have parameter '{param_name}'")

    original_param = getattr(module, param_name)

    if not isinstance(original_param, nn.Parameter):
        raise TypeError(f"Parameter '{param_name}' is not an instance of nn.Parameter")

    quant_state = F.QuantState.from_dict(qs_dict, device=device)

    # Apply a parametrization to the module to handle dequantization.
    P.register_parametrization(module, param_name, Bnb4bitParametrization(quant_state), unsafe=True)

    # Next, register hooks.
    _register_parametrization_hooks(module, param_name)


def replace_parameter_4bit(
    module: nn.Module,
    param_name: str,
    compress_statistics: bool = False,
    quant_type: Literal["nf4", "fp4"] = "nf4",
    blocksize: Optional[int] = None,
):
    """
    Replace a module parameter with a 4-bit quantized version using parametrization.

    This function quantizes an existing parameter in a PyTorch module to 4-bit precision
    and sets up parametrization to handle automatic dequantization during forward passes.
    The original parameter is replaced with quantized data, and a parametrization layer
    is registered to manage the quantization state and dequantization process.

    Additionally, it registers a state dict post-hook to ensure that the quantization state
    is saved correctly when the model's state dict is saved.

    It is useful for MoE models or other scenarios where you want to quantize parameters
    outside of nn.Linear layers without changing the model's architecture.

    <Tip warning={true}>This feature is experimental and may change in future releases.</Tip>

    Args:
        module (`nn.Module`):
            The PyTorch module containing the parameter to be quantized.
        param_name (`str`):
            The name of the parameter within the module to quantize.
        compress_statistics (`bool`, *optional*, defaults to `False`):
            Whether to compress quantization statistics to reduce memory usage.
        quant_type (`Literal["nf4", "fp4"]`, *optional*, defaults to `"nf4"`):
            The quantization format to use.
        blocksize (`int`, *optional*, defaults to `None`):
            The block size for quantization. If None, uses the default block size.

    Raises:
        AttributeError: If the module does not have the specified parameter.
        TypeError: If the specified attribute is not an instance of nn.Parameter.
    """

    if not hasattr(module, param_name):
        raise AttributeError(f"Module does not have parameter '{param_name}'")

    original_param = getattr(module, param_name)

    if not isinstance(original_param, nn.Parameter):
        raise TypeError(f"Parameter '{param_name}' is not an instance of nn.Parameter")

    # Quantize the original parameter.
    quantized_data, quant_state = F.quantize_4bit(
        original_param.data,
        blocksize=blocksize,
        compress_statistics=compress_statistics,
        quant_type=quant_type,
    )

    # Replace the parameter with the quantized data.
    setattr(module, param_name, nn.Parameter(quantized_data, requires_grad=False))
    del original_param

    # Apply a parametrization to the module to handle dequantization.
    P.register_parametrization(module, param_name, Bnb4bitParametrization(quant_state), unsafe=True)

    # Next, register hooks.
    _register_parametrization_hooks(module, param_name)


def _disable_parametrization_cache(module: nn.Module, inputs: tuple[Any, ...], output: Any):
    P._cache_enabled -= 1
    if not P._cache_enabled:
        P._cache = {}


def _enable_parametrization_cache(module: nn.Module, inputs: tuple[Any, ...]):
    P._cache_enabled += 1


def _register_parametrization_hooks(module: nn.Module, param_name: str):
    # Register a state dict hook for saving. Note that this requires torch >= 2.5.0.
    if torch.__version__ >= (2, 5):
        module.register_state_dict_post_hook(
            partial(
                _parametrized_state_dict_post_hook,
                param_name=param_name,
            )
        )

    # Register hooks to enable caching for the dequantization parametrization.
    # This helps preserve time and memory when the same quantized parameter
    # is accessed multiple times in the forward computation.
    module.register_forward_pre_hook(_enable_parametrization_cache)
    module.register_forward_hook(_disable_parametrization_cache)


def _parametrized_state_dict_post_hook(
    module: nn.Module,
    state_dict: dict[str, Any],
    prefix: str,
    local_metadata: Any,
    *,
    param_name: str = "weight",
    **kwargs: dict[str, Any],
) -> None:
    """
    Hook to modify the state dict to include the quantization state.
    """

    original_key = f"{prefix}parametrizations.{param_name}.original"

    if original_key in state_dict:
        # Create a clean entry.
        # The `parametrizations.{param_name}.original` key will have the quantized data,
        # but we would like to keep it in the state_dict as `{param_name}`.
        clean_key = f"{prefix}{param_name}"
        state_dict[clean_key] = state_dict.pop(original_key)

        assert P.is_parametrized(module, param_name)

        # Find the parametrization, which should have the quantization state.
        parametrization: Bnb4bitParametrization = next(
            filter(lambda x: isinstance(x, Bnb4bitParametrization), module.parametrizations[param_name]), None
        )

        assert parametrization is not None, "Parametrization not found for the parameter."

        quant_state = parametrization.quant_state

        # Next, we need to store the quantization state.
        if quant_state is not None:
            for k, v in quant_state.as_dict(packed=True).items():
                state_dict[f"{prefix}{param_name}.{k}"] = v

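For orientation (not part of the commit): a minimal usage sketch of the new replace_parameter_4bit API. The Expert module, its gate_proj parameter, and the bitsandbytes.nn.parametrize import path are illustrative assumptions, not something this diff defines outside the file above.

import torch
import torch.nn as nn

from bitsandbytes.nn.parametrize import replace_parameter_4bit  # assumed import path

class Expert(nn.Module):
    # Toy module with a parameter that is not owned by an nn.Linear,
    # e.g. fused MoE expert weights (hypothetical example).
    def __init__(self, dim: int = 128):
        super().__init__()
        self.gate_proj = nn.Parameter(torch.randn(dim, dim, dtype=torch.float16))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Reading self.gate_proj goes through the parametrization,
        # which dequantizes the 4-bit payload on the fly.
        return x @ self.gate_proj

expert = Expert().to("cuda")

# Quantize the parameter in place; the packed 4-bit data now lives under
# expert.parametrizations.gate_proj.original.
replace_parameter_4bit(expert, "gate_proj", quant_type="nf4")

out = expert(torch.randn(2, 128, dtype=torch.float16, device="cuda"))

# On torch >= 2.5 the state dict post-hook keeps the quantized payload under the
# original name ("gate_proj") and stores the packed quantization-state entries next to it.
state_dict = expert.state_dict()

The forward pre/post hooks registered by _register_parametrization_hooks bump the same _cache_enabled counter that torch.nn.utils.parametrize.cached() manages, so a parameter read several times within one forward pass is only dequantized once.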
tests/conftest.py

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ def pytest_runtest_teardown(item, nextitem):
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+        torch.mps.empty_cache()


 @pytest.fixture(scope="session")

tests/test_functional.py

Lines changed: 21 additions & 15 deletions
@@ -1,9 +1,10 @@
 import math
+import platform
 import random
 import time

 import einops
-import numpy as np
+from packaging import version
 import pytest
 import torch

@@ -101,16 +102,16 @@ class Test8BitBlockwiseQuantizeFunctional:
     def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
         iters = 100

-        if device == "cpu":
+        if device != "cuda":
             iters = 10

-        # This test is slow on CPU, so avoid atypical use cases.
+        # This test is slow in our non-CUDA implementations, so avoid atypical use cases.
         if nested:
             pytest.skip("Not a typical use case.")
         if blocksize != 256:
-            pytest.skip("Only blocksize 256 is used in CPU/XPU")
+            pytest.skip("Only blocksize 256 is used in CPU/MPS/XPU")
         if dtype != torch.float32:
-            pytest.skip("Only float32 is used in CPU/XPU")
+            pytest.skip("Only float32 is used in CPU/MPS/XPU")

         diffs = []
         reldiffs = []

@@ -239,7 +240,7 @@ def test_fp8_quant(self, device):
         abserr = []
         relerr = []
-        for i in range(100):
+        for i in range(10):
             A1 = torch.randn(1024, 1024, device=device)
             C, SC = F.quantize_blockwise(A1, code=code)
             A2 = F.dequantize_blockwise(C, SC)

@@ -253,7 +254,7 @@ def test_fp8_quant(self, device):
         abserr = []
         relerr = []
-        for i in range(100):
+        for i in range(10):
             A1 = torch.rand(1024, 1024, device=device)
             C, SC = F.quantize_blockwise(A1, code=code)
             A2 = F.dequantize_blockwise(C, SC)

@@ -267,7 +268,7 @@ def test_fp8_quant(self, device):
         abserr = []
         relerr = []
-        for i in range(100):
+        for i in range(10):
             A1 = torch.randn(1024, 1024, device=device)
             C, SC = F.quantize_blockwise(A1)
             A2 = F.dequantize_blockwise(C, SC)

@@ -1169,8 +1170,12 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
             4096: 0.262457,
         }

-        assert err < error_dict[quant_type]["err"][blocksize] + 1e-3
-        assert relerr < error_dict[quant_type]["rel_err"][blocksize] + 1e-3
+        # Allow higher tolerance for fp32 on CPU with larger block sizes
+        reltol = 2.8e-3 if dtype == torch.float32 and blocksize >= 128 and device == "cpu" else 1e-3
+        errtol = 1.2e-3 if dtype == torch.float32 and blocksize >= 1024 and device == "cpu" else 1e-3
+
+        assert err < error_dict[quant_type]["err"][blocksize] + errtol
+        assert relerr < error_dict[quant_type]["rel_err"][blocksize] + reltol

     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])

@@ -1403,28 +1408,29 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
-    @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
     @pytest.mark.skipif(
         HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a",
         reason="this test is not supported on ROCm with gfx90a architecture yet",
     )
-    def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
+    def test_gemv_eye_4bit(self, device, storage_type, dtype):
         if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
             pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")

         if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
             pytest.skip("This configuration is not supported on HPU.")

-        dims = 10
-        torch.random.manual_seed(np.random.randint(0, 412424242))
+        if device == "cpu" and platform.system() == "Windows" and version.parse(torch.__version__).release == (2, 8, 0):
+            pytest.skip("Regression: CPU crash on Windows with torch 2.8.0")
+
+        dims = 4
         dims = get_test_dims(0, 8192, n=dims)
         dims = [dim + (64 - (dim % 64)) for dim in dims]
         # for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]:
         for dim in dims:
             A = torch.normal(0, 0.1, size=(1, 1, dim), dtype=dtype, device=device)
             B = torch.eye(dim, dtype=dtype, device=device)

-            qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant)
+            qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=False)
             C3 = torch.matmul(A, B.t())
             C2 = bnb.matmul_4bit(A, qB.t(), state)
             A.requires_grad = True

tests/test_optim.py

Lines changed: 11 additions & 0 deletions
@@ -172,6 +172,10 @@ def rm_path(path):
 @pytest.mark.parametrize("device", get_available_devices(no_cpu=True), ids=id_formatter("device"))
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No device")
 def test_optimizer32bit(dim1, dim2, gtype, optim_name, device):
+
+    if device not in ["cuda", "xpu"]:
+        pytest.skip("Optimizers are only supported on CUDA and XPU")
+
     if optim_name.startswith("paged_") and sys.platform == "win32":
         pytest.skip("Paged optimizers can have issues on Windows.")

@@ -253,6 +257,9 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name, device):
 @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No device")
 def test_global_config(dim1, dim2, gtype, device):
+    if device not in ["cuda", "xpu"]:
+        pytest.skip("Optimizers are only supported on CUDA and XPU")
+
     if dim1 == 1 and dim2 == 1:
         return
     p1 = torch.randn(dim1, dim2, device="cpu", dtype=gtype) * 0.1

@@ -310,6 +317,10 @@ def test_global_config(dim1, dim2, gtype, device):
 @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No device")
 def test_optimizer8bit(dim1, dim2, gtype, optim_name, device):
+
+    if device not in ["cuda", "xpu"]:
+        pytest.skip("8-bit optimizers are only supported on CUDA and XPU")
+
     torch.set_printoptions(precision=6)

     if dim1 == 1 and dim2 == 1:
