Commit 2b85100

Cleanup; rename int8_linear_dequant -> int8_scaled_mm
1 parent: 25368bc

6 files changed: 47 additions & 23 deletions


bitsandbytes/_ops.py

Lines changed: 3 additions & 3 deletions

@@ -17,12 +17,12 @@

 # Higher level op: int8 matmul + dequant + bias
 torch.library.define(
-    "bitsandbytes::int8_linear_dequant",
+    "bitsandbytes::int8_scaled_mm",
     "(Tensor A, Tensor B, Tensor row_stats, Tensor col_stats, Tensor? bias=None, ScalarType dtype=float16) -> Tensor",
 )


-@register_fake("bitsandbytes::int8_linear_dequant")
+@register_fake("bitsandbytes::int8_scaled_mm")
 def _(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -35,7 +35,7 @@ def _(
     return torch.empty(shapeC, device=A.device, dtype=dtype)


-@register_kernel("bitsandbytes::int8_linear_dequant", None)
+@register_kernel("bitsandbytes::int8_scaled_mm", None)
 def _(
     A: torch.Tensor,
     B: torch.Tensor,

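For context, a minimal call sketch of the renamed op, mirroring the shapes exercised in tests/test_ops.py further down. It assumes a bitsandbytes build that already contains this commit, so the op schema above and a backend kernel for the chosen device are registered; it is not code from the diff.

```python
import torch
import bitsandbytes  # noqa: F401  -- importing registers the bitsandbytes torch.library ops

# int8 operands plus float32 row/column statistics, matching the schema above
A = torch.randint(-128, 127, (10, 20), dtype=torch.int8)
B = torch.randint(-128, 127, (30, 20), dtype=torch.int8)
row_stats = torch.randn(10, dtype=torch.float32)
col_stats = torch.randn(30, dtype=torch.float32)

out = torch.ops.bitsandbytes.int8_scaled_mm(A, B, row_stats, col_stats, dtype=torch.float16)
print(out.shape, out.dtype)  # torch.Size([10, 30]) torch.float16
```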
bitsandbytes/autograd/_functions.py

Lines changed: 2 additions & 2 deletions

@@ -355,7 +355,7 @@ def forward(
         subA = None

         # 3. Int8 Matmul + Dequant + Bias
-        output = torch.ops.bitsandbytes.int8_linear_dequant(CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype)
+        output = torch.ops.bitsandbytes.int8_scaled_mm(CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype)

         # 4. Mixed-precision decomposition matmul
         if subA is not None and state.subB is not None:
@@ -405,7 +405,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
         if req_gradB:
             Cgrad, _, _, SCgradt, _ = F.int8_double_quant(grad_output.to(torch.float16))

-            grad_B = torch.ops.bitsandbytes.int8_linear_dequant(
+            grad_B = torch.ops.bitsandbytes.int8_scaled_mm(
                 Cgrad.t().contiguous(),
                 CAt.t(),
                 SCgradt,

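For orientation, the grad_B call above is the int8 path for the weight gradient of a linear layer computed as out = A @ B.t(). A small float sketch of the quantity it approximates, with illustrative names that are not taken from the diff:

```python
import torch

# Float reference for the weight gradient that the int8 grad_B path approximates.
A = torch.randn(4, 8)        # activations, shape (batch, in_features)
B = torch.randn(16, 8)       # weight, stored as (out_features, in_features)
out = A @ B.t()              # forward matmul, shape (4, 16)

grad_output = torch.randn_like(out)
grad_B_ref = grad_output.t() @ A   # shape (16, 8), same layout as B
assert grad_B_ref.shape == B.shape
```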
bitsandbytes/backends/cpu/ops.py

Lines changed: 23 additions & 0 deletions

@@ -28,6 +28,29 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: Optional[tor
     return result


+@register_kernel("bitsandbytes::int8_mm_dequant", "cpu")
+def _(
+    A: torch.Tensor,
+    row_stats: torch.Tensor,
+    col_stats: torch.Tensor,
+    dtype=torch.float16,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+    torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+    torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+    A_calc = A.view(-1, A.shape[-1])
+    row_stats = row_stats.reshape(-1).unsqueeze(-1)
+    col_stats = col_stats.reshape(-1).unsqueeze(0)
+
+    out = A_calc * (row_stats * col_stats) * 6.200124e-05
+    if bias is not None:
+        out += bias
+
+    return out.to(dtype)
+
+
 @register_kernel("bitsandbytes::quantize_blockwise", "cpu")
 def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)

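The new CPU kernel scales the int32 accumulator by row_stats * col_stats times 6.200124e-05, which is roughly 1 / (127 * 127). Assuming row_stats and col_stats are per-row and per-column absmax values from symmetric int8 quantization (an interpretation, not stated in the diff), the end-to-end math looks like this sketch:

```python
import torch

# Rough reference for absmax int8 quantization followed by the dequant in the new kernel.
x = torch.randn(4, 8)   # left operand (e.g. activations)
w = torch.randn(5, 8)   # right operand (rows of w become output columns)

row_absmax = x.abs().amax(dim=1)   # one scale per row of x
col_absmax = w.abs().amax(dim=1)   # one scale per output column
x_q = torch.round(x * 127.0 / row_absmax[:, None]).to(torch.int8)
w_q = torch.round(w * 127.0 / col_absmax[:, None]).to(torch.int8)

# Integer accumulation (the real kernels accumulate in int32), then rescale.
acc = (x_q.to(torch.int32)[:, None, :] * w_q.to(torch.int32)[None, :, :]).sum(-1)
approx = acc.float() * (row_absmax[:, None] * col_absmax[None, :]) / (127.0 * 127.0)

print((approx - x @ w.t()).abs().max())  # small quantization error
```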
bitsandbytes/backends/cuda/ops.py

Lines changed: 1 addition & 3 deletions

@@ -348,7 +348,7 @@ def _(


 @register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _dequantize_4bit_impl(
+def _(
     A: torch.Tensor,
     absmax: torch.Tensor,
     blocksize: int,
@@ -358,7 +358,6 @@ def _dequantize_4bit_impl(
     out: torch.Tensor,
 ) -> None:
     torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
-    torch._check(out.device == A.device, lambda: f"Expected out.device == {A.device}, got {out.device}")
     torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
     _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)

@@ -430,7 +429,6 @@ def _(
         out.shape == (*A.shape[:-1], shapeB[0]),
         lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
     )
-    torch._check(out.device == A.device, lambda: f"Expected out.device == {A.device}, got {out.device}")
     torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
     _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)

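Renaming the registered wrapper to `_` appears to be more than cosmetic: while the wrapper was itself named `_dequantize_4bit_impl`, it shadowed the module-level helper of the same name that its last line calls, so that call pointed back at the wrapper. A toy illustration of the pitfall (illustrative code, not from the repository):

```python
# Toy illustration of the name-shadowing pitfall addressed by the rename above.
def step(x):
    return x + 1

def step(x):          # redefining the same name shadows the first definition...
    return step(x)    # ...so this now calls itself and would raise RecursionError

# step(1)  # would recurse forever; giving the wrapper a different name avoids the collision
```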
bitsandbytes/functional.py

Lines changed: 14 additions & 5 deletions

@@ -1540,12 +1540,21 @@ def optimizer_update_8bit_blockwise(

     is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])

+    print(
+        f"{p.device} {g.device} {state1.device} {state2.device} {qmap1.device} {qmap2.device} {absmax1.device} {absmax2.device} \n\n"
+        f"{p.dtype} {g.dtype} {state1.dtype} {state2.dtype} {qmap1.dtype} {qmap2.dtype} {absmax1.dtype} {absmax2.dtype} \n\n"
+        f"{p.__class__} {g.__class__} {state1.__class__} {state2.__class__} {qmap1.__class__} {qmap2.__class__} {absmax1.__class__} {absmax2.__class__} \n\n"
+        f"{p.data_ptr()} {g.data_ptr()} {state1.data_ptr()} {state2.data_ptr()} {qmap1.data_ptr()} {qmap2.data_ptr()} {absmax1.data_ptr()} {absmax2.data_ptr()} \n\n"
+    )
+
+    print(p, g, state1, state2)
+
     with _cuda_device_of(g):
         optim_func(
-            get_ptr(p),
-            get_ptr(g),
-            get_ptr(state1),
-            get_ptr(state2),
+            get_ptr(p.to_local()),
+            get_ptr(g.to_local()),
+            get_ptr(state1.to_local()),
+            get_ptr(state2.to_local()),
             ct.c_float(beta1),
             ct.c_float(beta2),
             ct.c_float(beta3),
@@ -1560,7 +1569,7 @@ def optimizer_update_8bit_blockwise(
             ct.c_float(weight_decay),
             ct.c_float(gnorm_scale),
             ct.c_bool(skip_zeros),
-            ct.c_int32(g.numel()),
+            ct.c_int32(g.to_local().numel()),
         )

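The .to_local() calls above suggest these tensors can arrive as torch.distributed.tensor.DTensor instances, where only the local shard exposes a data pointer the C kernel can consume. That reading is an assumption, and the helper below is a hypothetical illustration of the pattern, not code from the commit:

```python
import torch

def _local_view(t: torch.Tensor) -> torch.Tensor:
    """Return the local shard for DTensor inputs, otherwise the tensor unchanged."""
    try:
        from torch.distributed.tensor import DTensor  # available in recent PyTorch versions
    except ImportError:
        return t
    return t.to_local() if isinstance(t, DTensor) else t
```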
tests/test_ops.py

Lines changed: 4 additions & 10 deletions

@@ -66,9 +66,6 @@ def test_int8_vectorwise_quant(self, threshold, device):

     @pytest.mark.parametrize("device", ["cpu", "cuda"])
     def test_int8_mm_dequant(self, device):
-        if device == "cpu":
-            pytest.skip("CPU implementation is not available")
-
         A = torch.randint(-128, 127, (256, 256), dtype=torch.int32, device=device)
         row_stats = torch.randn(256, dtype=torch.float32, device=device)
         col_stats = torch.randn(256, dtype=torch.float32, device=device)
@@ -83,22 +80,19 @@ def test_int8_mm_dequant(self, device):
     @pytest.mark.parametrize("device", ["cpu", "cuda"])
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("has_bias", TRUE_FALSE)
-    def test_int8_linear_dequant(self, device, dtype, has_bias):
-        if device == "cpu":
-            pytest.skip("CPU implementation is not available")
-
+    def test_int8_scaled_mm(self, device, dtype, has_bias):
         A = torch.randint(-128, 127, (10, 20), dtype=torch.int8, device=device)
         B = torch.randint(-128, 127, (30, 20), dtype=torch.int8, device=device)
         row_stats = torch.randn(10, dtype=torch.float32, device=device)
-        col_stats = torch.randn(20, dtype=torch.float32, device=device)
+        col_stats = torch.randn(30, dtype=torch.float32, device=device)
         bias = torch.randn(30, dtype=dtype, device=device) if has_bias else None
-        out = torch.ops.bitsandbytes.int8_linear_dequant(A, B, row_stats, col_stats, bias=bias, dtype=dtype)
+        out = torch.ops.bitsandbytes.int8_scaled_mm(A, B, row_stats, col_stats, bias=bias, dtype=dtype)

         assert out.shape == (10, 30)
         assert out.dtype == dtype
         assert out.device == A.device

-        torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_dequant, (A, B, row_stats, col_stats, bias, dtype))
+        torch.library.opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))


 class TestInt8BlockwiseQuantOps:

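To exercise just the tests touched here, a pytest keyword filter along these lines should work from the repository root; the -k expression is an assumption about how you want to slice the suite:

```python
import pytest

# Run the renamed int8_scaled_mm test and the now-enabled CPU int8_mm_dequant test.
pytest.main(["tests/test_ops.py", "-k", "int8_scaled_mm or int8_mm_dequant"])
```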