Commit fd74c06

cleanup
1 parent a61c0fa commit fd74c06

File tree

6 files changed: +6 -339 lines changed


bitsandbytes/__init__.py

Lines changed: 0 additions & 3 deletions

@@ -7,11 +7,8 @@
 from . import _ops, research, utils
 from .autograd._functions import (
     MatmulLtState,
-    bmm_cublas,
     matmul,
     matmul_4bit,
-    matmul_cublas,
-    mm_cublas,
 )
 from .backends.cpu import ops as cpu_ops
 from .backends.cuda import ops as cuda_ops  ## TODO: We would guard this for CUDA only
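The removed aliases (mm_cublas, bmm_cublas, matmul_cublas) all pointed at the deprecated MatMul8bit autograd function, whose deprecation notice directs users to MatMul8bitLt. A minimal migration sketch, not part of this commit, assuming fp16 CUDA tensors and the default quantization state; note that bnb.matmul follows linear-layer conventions, taking B in (out_features, in_features) layout:

import torch
import bitsandbytes as bnb

A = torch.randn(8, 64, dtype=torch.float16, device="cuda")   # (batch, in_features)
W = torch.randn(32, 64, dtype=torch.float16, device="cuda")  # (out_features, in_features)

# Before this commit: out = bnb.matmul_cublas(A, W.t())
# The MatMul8bitLt path computes A @ W.t() internally via int8 GEMM.
out = bnb.matmul(A, W)
assert out.shape == (8, 32)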

bitsandbytes/autograd/_functions.py

Lines changed: 0 additions & 115 deletions

@@ -106,121 +106,6 @@ def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) -
     return outputs.reshape(rows, cols).contiguous()


-@deprecated(
-    "MatMul8bit is deprecated and will be removed in a future release. Please use MatMul8bitLt instead.",
-    category=FutureWarning,
-)
-class MatMul8bit(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, A, B, out=None, quant_type="vector", precision=None):
-        if precision is None:
-            precision = [8, 8, 8]
-        if precision[0] != 8:
-            with torch.no_grad():
-                output = torch.matmul(A, B)
-        else:
-            if len(B.shape) == 2:
-                dim = 0
-            else:
-                dim = 1
-            qA, SA = F.vectorwise_quant(A, dim=-1, quant_type=quant_type)
-            qB, SB = F.vectorwise_quant(B, dim=dim, quant_type=quant_type)
-            iout = F.igemm(qA, qB)
-            output = F.vectorwise_mm_dequant(iout, SA, SB, A.dtype, quant_type)
-
-        if A.requires_grad or B.requires_grad:
-            ctx.save_for_backward(A, B)
-
-        ctx.quant_type = quant_type
-        ctx.precision = precision
-
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        A, B = ctx.saved_tensors
-        quant_type = ctx.quant_type
-        precision = ctx.precision
-        grad_A = grad_B = None
-
-        if B.requires_grad:
-            if len(A.shape) == 3:
-                dims = [0, 1]
-                # bsi -> ibs
-                permute_dim = [0, 2, 1]
-            else:
-                dims = [0]
-                # bs -> sb
-                permute_dim = [1, 0]
-
-            if precision[1] != 8:
-                with torch.no_grad():
-                    grad_B = torch.matmul(A.permute(permute_dim), grad_output)
-            else:
-                if len(B.shape) == 2 and len(A.shape) == 3:
-                    grad_output = grad_output.contiguous()
-                    if not grad_output.is_contiguous():
-                        grad_output.contiguous()
-                    qgrad_output, S1 = F.vectorwise_quant(
-                        grad_output.view(-1, grad_output.shape[2]),
-                        dim=0,
-                        quant_type=quant_type,
-                    )
-                    if not A.is_contiguous():
-                        A = A.contiguous()
-                    qA, S2 = F.vectorwise_quant(A.view(-1, A.shape[2]), dim=0, quant_type=quant_type)
-                    igrad_B = F.igemm(qA.t(), qgrad_output)
-                    grad_B = F.vectorwise_mm_dequant(igrad_B, S2.t(), S1, grad_output.dtype, quant_type)
-                else:
-                    qgrad_output, S1 = F.vectorwise_quant(grad_output, dim=dims, quant_type=quant_type)
-                    qA, S2 = F.vectorwise_quant(A, dim=dims, quant_type=quant_type)
-                    igrad_B = F.igemm(qA.permute(permute_dim), qgrad_output)
-                    grad_B = F.vectorwise_mm_dequant(
-                        igrad_B,
-                        S2.permute(permute_dim),
-                        S1,
-                        grad_output.dtype,
-                        quant_type,
-                    )
-
-        if A.requires_grad:
-            if len(grad_output.shape) == 3:
-                dims = [2]
-            else:
-                dims = [1]
-
-            if len(B.shape) == 3:
-                # bio -> boi
-                permute_dim = [0, 2, 1]
-                dim_B = dims
-            else:
-                # io -> oi
-                permute_dim = [1, 0]
-                dim_B = [1]
-
-            if precision[2] != 8:
-                with torch.no_grad():
-                    grad_A = torch.matmul(grad_output, B.permute(permute_dim))
-            else:
-                qgrad_output, S1 = F.vectorwise_quant(grad_output, dim=dims, quant_type=quant_type)
-                qB, S3 = F.vectorwise_quant(B, dim=dim_B, quant_type=quant_type)
-                igrad_A = F.igemm(qgrad_output, qB.permute(permute_dim))
-                grad_A = F.vectorwise_mm_dequant(
-                    igrad_A,
-                    S1,
-                    S3.permute(permute_dim),
-                    grad_output.dtype,
-                    quant_type,
-                )
-
-        return grad_A, grad_B, None, None, None
-
-
-mm_cublas = MatMul8bit.apply
-bmm_cublas = MatMul8bit.apply
-matmul_cublas = MatMul8bit.apply
-
-
 @deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""

bitsandbytes/functional.py

Lines changed: 5 additions & 14 deletions

@@ -1541,21 +1541,12 @@ def optimizer_update_8bit_blockwise(
 
     is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
 
-    print(
-        f"{p.device} {g.device} {state1.device} {state2.device} {qmap1.device} {qmap2.device} {absmax1.device} {absmax2.device} \n\n"
-        f"{p.dtype} {g.dtype} {state1.dtype} {state2.dtype} {qmap1.dtype} {qmap2.dtype} {absmax1.dtype} {absmax2.dtype} \n\n"
-        f"{p.__class__} {g.__class__} {state1.__class__} {state2.__class__} {qmap1.__class__} {qmap2.__class__} {absmax1.__class__} {absmax2.__class__} \n\n"
-        f"{p.data_ptr()} {g.data_ptr()} {state1.data_ptr()} {state2.data_ptr()} {qmap1.data_ptr()} {qmap2.data_ptr()} {absmax1.data_ptr()} {absmax2.data_ptr()} \n\n"
-    )
-
-    print(p, g, state1, state2)
-
     with _cuda_device_of(g):
         optim_func(
-            get_ptr(p.to_local()),
-            get_ptr(g.to_local()),
-            get_ptr(state1.to_local()),
-            get_ptr(state2.to_local()),
+            get_ptr(p),
+            get_ptr(g),
+            get_ptr(state1),
+            get_ptr(state2),
             ct.c_float(beta1),
             ct.c_float(beta2),
             ct.c_float(beta3),
@@ -1570,7 +1561,7 @@ def optimizer_update_8bit_blockwise(
             ct.c_float(weight_decay),
             ct.c_float(gnorm_scale),
             ct.c_bool(skip_zeros),
-            ct.c_int32(g.to_local().numel()),
+            ct.c_int32(g.numel()),
         )
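The dropped .to_local() calls were DTensor-style unwrapping, and the deleted prints dumped device, dtype, class, and pointer info for debugging; with both gone, get_ptr again receives the tensors as-is. A rough sketch of the ctypes marshalling pattern at this call site (get_ptr_sketch below is an illustrative stand-in, not the library's get_ptr):

import ctypes as ct
import torch

def get_ptr_sketch(t):
    # Essentially what bitsandbytes' get_ptr does: None maps to a null
    # pointer, otherwise the tensor's raw data pointer is wrapped for ctypes.
    return None if t is None else ct.c_void_p(t.data_ptr())

g = torch.zeros(1024, device="cuda")
# Scalar arguments are passed with explicit C types, as in the hunk above.
args = (get_ptr_sketch(g), ct.c_float(0.9), ct.c_int32(g.numel()))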

docs/source/installation.mdx

Lines changed: 1 addition & 1 deletion

@@ -174,7 +174,7 @@ export BNB_CUDA_VERSION=126
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-12.6
 ```
 
-3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
+3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 12.6) and a different bitsandbytes library is loaded.
 
 ## Multi-backend Support (Alpha Release)[[multi-backend]]
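A quick way to exercise the override described in that step (a sketch, assuming BNB_CUDA_VERSION is read at import time as the docs describe; it must be set before bitsandbytes first loads):

import os

os.environ["BNB_CUDA_VERSION"] = "126"  # set before the first import
import bitsandbytes  # should now pick up the CUDA 12.6 build of the native library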

tests/test_deprecated.py

Lines changed: 0 additions & 192 deletions

@@ -1,201 +1,9 @@
-from typing import Tuple
-
 import numpy as np
 import pytest
 from scipy.stats import norm
 import torch
 
-import bitsandbytes as bnb
 from bitsandbytes import functional as F
-from tests.helpers import (
-    BOOLEAN_TUPLES,
-    describe_dtype,
-    get_test_dims,
-    id_formatter,
-)
-
-
-@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", get_test_dims(32, 96, n=1), ids=id_formatter("dim2"))
-@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3"))
-@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4"))
-@pytest.mark.parametrize(
-    "funcs",
-    [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)],
-    ids=["func=bmm", "func=matmul"],
-)
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
-@pytest.mark.parametrize("req_grad", BOOLEAN_TUPLES, ids=id_formatter("req_grad"))
-@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose"))
-@pytest.mark.deprecated
-def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool], transpose: Tuple[bool, bool]):
-    if dim2 > 0:
-        dim2 = dim2 - (dim2 % 16)
-    dim3 = dim3 - (dim3 % 16)
-    dim4 = dim4 - (dim4 % 16)
-    for i in range(25):
-        # normal multiply
-        if funcs[0] in [torch.mm, torch.matmul]:
-            dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
-            dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
-            A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0])
-            B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1])
-            target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1])
-            torch.nn.init.xavier_uniform_(B)
-
-            if not transpose[0] and not transpose[1]:
-                out_torch = funcs[0](A, B)
-                out_bnb = funcs[1](A, B)
-            elif not transpose[0] and transpose[1]:
-                out_torch = funcs[0](A, B.t())
-                out_bnb = funcs[1](A, B.t())
-            elif transpose[0] and not transpose[1]:
-                out_torch = funcs[0](A.t(), B)
-                out_bnb = funcs[1](A.t(), B)
-            elif transpose[0] and transpose[1]:
-                out_torch = funcs[0](A.t(), B.t())
-                out_bnb = funcs[1](A.t(), B.t())
-
-            n = out_bnb.numel()
-            idx = torch.isclose(out_bnb, out_torch, atol=0.01, rtol=0.1)
-            assert (idx == 0).sum().item() < n * 0.0175
-            idx = torch.isclose(out_bnb, out_torch, atol=0.035, rtol=0.2)
-            assert (idx == 0).sum().item() < n * 0.001
-
-            if any(req_grad):
-                out_bnb.data.copy_(out_torch)
-                torch.cuda.synchronize()
-                loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
-                loss_bnb.backward()
-                gradA1 = A.grad
-                gradB1 = B.grad
-                A.grad = None
-                B.grad = None
-
-                loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean()
-                loss_torch.backward()
-                gradA2 = A.grad
-                gradB2 = B.grad
-                A.grad = None
-                B.grad = None
-
-                if req_grad[0]:
-                    torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1)
-                if req_grad[1]:
-                    n = gradB1.numel()
-                    idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3)
-                    assert (idx == 0).sum().item() < n * 0.1
-                    idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3)
-                    assert (idx == 0).sum().item() < n * 0.02
-                    torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3)
-
-        # batched matrix multiply
-        if funcs[0] in [torch.bmm, torch.matmul]:
-            A = torch.randn(
-                size=(dim1, dim2, dim3),
-                device="cuda",
-                requires_grad=req_grad[0],
-            )
-            B = torch.randn(
-                size=(dim1, dim3, dim4),
-                device="cuda",
-                requires_grad=req_grad[1],
-            )
-            target = torch.randn(
-                size=(dim1, dim2, dim4),
-                device="cuda",
-                requires_grad=req_grad[1],
-            )
-            torch.nn.init.xavier_uniform_(B)
-
-            out_torch = funcs[0](A, B)
-            out_bnb = funcs[1](A, B)
-
-            n = out_bnb.numel()
-            idx = torch.isclose(out_bnb, out_torch, atol=0.01, rtol=0.1)
-            assert (idx == 0).sum().item() < n * 0.01
-            torch.testing.assert_close(out_bnb, out_torch, atol=0.027, rtol=0.2)
-
-            if any(req_grad):
-                out_bnb.data.copy_(out_torch)
-                torch.cuda.synchronize()
-                loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
-                loss_bnb.backward()
-                gradA1 = A.grad
-                gradB1 = B.grad
-                A.grad = None
-                B.grad = None
-
-                loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean()
-                loss_torch.backward()
-                gradA2 = A.grad
-                gradB2 = B.grad
-                A.grad = None
-                B.grad = None
-
-                if req_grad[0]:
-                    torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1)
-                if req_grad[1]:
-                    n = gradB1.numel()
-                    idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3)
-                    assert (idx == 0).sum().item() < n * 0.1
-                    idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3)
-                    assert (idx == 0).sum().item() < n * 0.02
-
-        if funcs[0] in [torch.matmul]:
-            dim1 = dim1 - (dim1 % 16)
-            A = torch.randn(
-                size=(dim1, dim2, dim3),
-                device="cuda",
-                requires_grad=req_grad[0],
-            )
-            dimB = (dim4, dim3) if transpose[1] else (dim3, dim4)
-            B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1])
-            target = torch.randn(
-                size=(dim1, dim2, dim4),
-                device="cuda",
-                requires_grad=req_grad[1],
-            )
-            torch.nn.init.xavier_uniform_(B)
-
-            if transpose[1]:
-                out_torch = funcs[0](A, B.t())
-                out_bnb = funcs[1](A, B.t())
-            else:
-                out_torch = funcs[0](A, B)
-                out_bnb = funcs[1](A, B)
-
-            n = out_bnb.numel()
-            idx = torch.isclose(out_bnb, out_torch, atol=0.01, rtol=0.1)
-            assert (idx == 0).sum().item() < n * 0.0175
-            idx = torch.isclose(out_bnb, out_torch, atol=0.035, rtol=0.2)
-            assert (idx == 0).sum().item() < n * 0.001
-
-            if any(req_grad):
-                out_bnb.data.copy_(out_torch)
-                torch.cuda.synchronize()
-                loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
-                loss_bnb.backward()
-                gradA1 = A.grad
-                gradB1 = B.grad
-                A.grad = None
-                B.grad = None
-
-                loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean()
-                loss_torch.backward()
-                gradA2 = A.grad
-                gradB2 = B.grad
-                A.grad = None
-                B.grad = None
-
-                if req_grad[0]:
-                    torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1)
-                if req_grad[1]:
-                    n = gradB1.numel()
-                    idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3)
-                    assert (idx == 0).sum().item() < n * 0.1
-                    idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3)
-                    assert (idx == 0).sum().item() < n * 0.02
 
 
 @pytest.mark.deprecated
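The deleted test used a mismatch-budget pattern rather than a strict allclose: the quantized result is compared against the floating-point reference, and only a small fraction of elements may fall outside tolerance. A self-contained sketch of that pattern (the helper name is illustrative; the thresholds mirror the deleted test):

import torch

def assert_mostly_close(out, ref, atol, rtol, max_mismatch_frac):
    # Allow a small budget of out-of-tolerance elements instead of
    # requiring every element to be close.
    mismatched = (~torch.isclose(out, ref, atol=atol, rtol=rtol)).sum().item()
    assert mismatched < out.numel() * max_mismatch_frac

out = torch.randn(64, 64)
ref = out + 0.001 * torch.randn(64, 64)  # stand-in for quantization noise
assert_mostly_close(out, ref, atol=0.01, rtol=0.1, max_mismatch_frac=0.0175)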
