Commit 9d0f459

Fix nested quant
Parent: e9c79cf

File tree: 4 files changed, +30 −6 lines

bitsandbytes/_ops.py

Lines changed: 3 additions & 0 deletions
@@ -131,6 +131,7 @@ def _(
     )
 
 
+@register_fake("bitsandbytes::quantize_4bit")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -147,13 +148,15 @@ def _(
     )
 
 
+@register_fake("bitsandbytes::dequantize_blockwise")
 def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
     return torch.empty_like(A, dtype=dtype)
 
 
 torch.library.define("bitsandbytes::quantize_blockwise", "(Tensor A, Tensor code, int blocksize) -> (Tensor, Tensor)")
 
 
+@register_fake("bitsandbytes::quantize_blockwise")
 def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
     n = A.numel()
     blocks = -(n // -blocksize)
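For context: register_fake attaches a "fake" (meta) implementation to a custom operator, telling PyTorch only the output shapes, dtypes, and devices so that fake-tensor tracing (and torch.library.opcheck, used in the new test below) can run the op without invoking its real kernel. A minimal standalone sketch of the pattern, assuming PyTorch >= 2.4 (where torch.library.register_fake is available) and using a made-up "mylib" namespace rather than the library's own:

    import torch
    from typing import Tuple

    # Declare the operator schema, mirroring the bitsandbytes::quantize_blockwise
    # definition shown above (the "mylib" namespace is hypothetical).
    torch.library.define(
        "mylib::quantize_blockwise",
        "(Tensor A, Tensor code, int blocksize) -> (Tensor, Tensor)",
    )

    @torch.library.register_fake("mylib::quantize_blockwise")
    def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
        n = A.numel()
        # Ceiling division without floats: -(n // -blocksize) == ceil(n / blocksize).
        blocks = -(n // -blocksize)
        # No data is computed here; only metadata (shape/dtype/device) matters.
        out = torch.empty_like(A, dtype=torch.uint8)
        absmax = torch.empty(blocks, dtype=torch.float32, device=A.device)
        return out, absmax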

bitsandbytes/backends/cuda/ops.py

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
 
-        return out, absmax
+    return out, absmax
 
 
 @register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
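The removed and added lines differ only in indentation: the return out, absmax moves out one level, presumably so that the quantized tensor and its per-block statistics are returned at function scope for every supported dtype rather than from inside the preceding block.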

bitsandbytes/functional.py

Lines changed: 5 additions & 4 deletions
@@ -977,7 +977,7 @@ def dequantize_blockwise(
 
     return torch.ops.bitsandbytes.dequantize_blockwise(
         A,
-        quant_state.absmax,
+        absmax,
         quant_state.code.to(A.device),
         quant_state.blocksize,
         quant_state.dtype,
@@ -1142,8 +1142,9 @@ def quantize_4bit(
 
     if compress_statistics:
         offset = absmax.mean()
-        absmax -= offset
-        qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
+        # absmax -= offset
+        # qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
+        qabsmax, state2 = quantize_blockwise(absmax - offset, blocksize=256)
         del absmax
         state = QuantState(
             absmax=qabsmax,
@@ -1249,7 +1250,7 @@ def dequantize_4bit(
     out = torch.ops.bitsandbytes.dequantize_4bit(
         A,
         absmax,
-        blocksize,
+        quant_state.blocksize,
         quant_state.quant_type,
         quant_state.shape,
         quant_state.dtype,
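This is the fix the commit title points at: quantize_4bit previously shifted absmax in place (absmax -= offset) before nested-quantizing the statistics, and the new code performs the subtraction out of place. A plausible motivation, shown here as a standalone sketch with illustrative values (not the library's code), is that an in-place update is visible through every alias of the tensor, and mutating an operator's output in place can also interfere with fake-tensor tracing:

    import torch

    absmax = torch.tensor([1.0, 2.0, 3.0])
    saved = absmax                 # a second reference to the same storage
    offset = absmax.mean()         # tensor(2.)

    shifted = absmax - offset      # out of place: `saved` is untouched
    absmax -= offset               # in place: `saved` now sees the shifted values too

    print(shifted)                 # tensor([-1., 0., 1.])
    print(saved)                   # tensor([-1., 0., 1.]) -- mutated through the alias

The two one-line changes around it look like companions to the same nested-quantization path: dequantize_blockwise now passes the local absmax (presumably the already-decompressed statistics) rather than the still-quantized quant_state.absmax, and dequantize_4bit reads the blocksize from the quant_state instead of a local variable.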

tests/test_ops.py

Lines changed: 21 additions & 1 deletion
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-import bitsandbytes  # noqa: F401
+import bitsandbytes
 
 
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
@@ -63,3 +63,23 @@ def test_int8_mm_dequant(device):
     assert out.device == A.device
 
     torch.library.opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
+
+
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+def test_quantize_blockwise(device):
+    # if device == "cpu":
+    #     pytest.skip("CPU implementation is not available")
+    blocksize = 256
+
+    code = bitsandbytes.functional.create_dynamic_map().to(device)
+    A = torch.randn(1024, 1024, dtype=torch.float16, device=device)
+    out, absmax = torch.ops.bitsandbytes.quantize_blockwise(A, code, blocksize)
+
+    assert out.shape == A.shape
+    assert out.dtype == torch.uint8
+    assert out.device == A.device
+
+    assert absmax.device == A.device
+    assert absmax.dtype == torch.float32
+
+    torch.library.opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
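The new test follows the pattern of the existing ones: call the operator through torch.ops, assert on the metadata a fake implementation must reproduce (shape, dtype, and device of both out and absmax), then finish with torch.library.opcheck, which verifies, among other things, that the registered fake agrees with the real kernel; it therefore exercises the @register_fake registrations added in _ops.py above. The # noqa: F401 is dropped from the import because bitsandbytes.functional.create_dynamic_map is now referenced directly.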
