Commit 04482ff

Initial int8 op registration
1 parent d5df4c6 commit 04482ff

3 files changed: +67 −241 lines changed


bitsandbytes/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -3,7 +3,8 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from . import research, utils
+
+from . import _ops, research, utils
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
@@ -12,6 +13,8 @@
     matmul_cublas,
     mm_cublas,
 )
+from .backends.cpu import ops as cpu_ops
+from .backends.cuda import ops as cuda_ops  ## TODO: We would guard this for CUDA only
 from .nn import modules
 from .optim import adam
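
Note: importing `bitsandbytes._ops` declares the operator schemas and their fake (meta) implementations, while importing the backend modules registers concrete kernels as an import side effect; the `cuda_ops` import still carries a TODO to guard it for CUDA-only environments. Below is a minimal sketch of what such a backend module might contain — the contents are an assumption for illustration (the backend files are not part of this three-file diff), using PyTorch's `torch.library.impl` for the op that `_ops.py` defines:

    # Hypothetical sketch of a backend module such as bitsandbytes/backends/cpu/ops.py
    # (not shown in this diff). Importing it registers a concrete CPU kernel for the
    # operator that bitsandbytes/_ops.py only defines.
    from typing import Optional

    import torch


    @torch.library.impl("bitsandbytes::int8_linear_matmul", "cpu")
    def _int8_linear_matmul_cpu(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
        # Naive reference kernel: upcast to fp32, compute A @ B.T, cast to the accumulator dtype.
        result = torch.matmul(A.float(), B.float().t()).to(dtype)
        if out is not None:
            result = out.copy_(result)
        return result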

bitsandbytes/_ops.py

Lines changed: 60 additions & 97 deletions

@@ -1,12 +1,8 @@
-import ctypes as ct
 from math import prod
-from typing import Optional
+from typing import Optional, Tuple
 
 import torch
 
-from .cextension import lib
-from .functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr, is_on_gpu
-
 _IS_TORCH_GTE_24 = False
 
 if hasattr(torch.library, "register_fake"):
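
Note: the hunk ends at the `hasattr` guard, so the shim body is not shown. A plausible sketch of what it resolves to — an assumption, since `torch.library.register_fake`/`register_kernel` appear around PyTorch 2.4 while older releases expose `impl_abstract`/`impl`:

    # Assumed shape of the version shim (the guarded body is outside this hunk).
    import torch

    _IS_TORCH_GTE_24 = False

    if hasattr(torch.library, "register_fake"):
        _IS_TORCH_GTE_24 = True
        register_fake = torch.library.register_fake
        register_kernel = torch.library.register_kernel
    else:
        register_fake = torch.library.impl_abstract
        register_kernel = torch.library.impl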
@@ -27,11 +23,10 @@
 # return () instead of `None` for compatibility, see here: https://github.com/pytorch/pytorch/issues/125044
 torch.library.define(
     "bitsandbytes::int8_linear_matmul",
-    "(Tensor A, Tensor B, Tensor(a!)? out=None, ScalarType dtype=int32) -> Tensor(a!)",
+    "(Tensor A, Tensor B, Tensor? out=None, ScalarType dtype=int32) -> Tensor",
 )
 
 
-# Fake/abstract op
 @register_fake("bitsandbytes::int8_linear_matmul")
 def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
     shapeC = (*A.shape[:-1], B.shape[0])
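
Note: two things happen in this hunk. The schema drops the `Tensor(a!)` aliasing annotation, so `int8_linear_matmul` is registered as a purely functional op, and the `@register_fake` implementation gives the dispatcher a shape/dtype-only version for tracing. A small sketch of exercising that fake implementation on meta tensors, assuming `import bitsandbytes` performs the registrations as the `__init__.py` hunk above indicates:

    # Sketch: with the op defined and a fake impl registered, shape/dtype inference
    # runs without any real kernel (e.g. on "meta" tensors or under torch.compile).
    import torch
    import bitsandbytes  # noqa: F401  (importing performs the op registrations)

    A = torch.empty(16, 64, dtype=torch.int8, device="meta")  # activations
    B = torch.empty(32, 64, dtype=torch.int8, device="meta")  # weights (out_features, in_features)

    C = torch.ops.bitsandbytes.int8_linear_matmul(A, B)
    # Per the fake impl: shapeC = (*A.shape[:-1], B.shape[0]), default dtype int32.
    print(C.shape, C.dtype)  # expected: torch.Size([16, 32]) torch.int32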
@@ -40,103 +35,71 @@ def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtyp
     return out
 
 
-# CPU implementation
-@register_kernel("bitsandbytes::int8_linear_matmul", "cpu")
-def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
-    # Naive implementation: perform matmul in fp32
-    result = torch.matmul(A.float(), B.float().t()).to(torch.int32)
-    if out is not None:
-        result = out.copy_(result)
-    return result
+torch.library.define(
+    "bitsandbytes::int8_vectorwise_quant",
+    "(Tensor A, Scalar threshold=0.0) -> (Tensor, Tensor, Tensor?)",
+)
 
 
-# MPS impl
-@register_kernel("bitsandbytes::int8_linear_matmul", "mps")
-def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
-    pass
+@register_fake("bitsandbytes::int8_vectorwise_quant")
+def _(A: torch.Tensor, threshold=0.0):
+    out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+    row_stats = torch.empty(prod(A.shape[:-1]), device=A.device, dtype=torch.float32)
 
+    if threshold == 0.0:
+        return out_row, row_stats, None
 
-# XPU impl
-@register_kernel("bitsandbytes::int8_linear_matmul", "xpu")
-def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
-    pass
+    outlier_cols = torch.library.get_ctx().new_dynamic_size()
 
+    return out_row, row_stats, A.new_empty(outlier_cols, dtype=torch.int64)
 
-# Ascend NPU impl
-@register_kernel("bitsandbytes::int8_linear_matmul", "npu")
-def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
-    pass
 
+torch.library.define("bitsandbytes::int8_vectorwise_dequant", "(Tensor A, Tensor stats) -> Tensor")
 
-# CUDA/ROCm impl
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
-    A, B = B, A
-
-    shapeA = A.shape
-    shapeB = B.shape
-
-    assert A.dtype == torch.int8
-    assert B.dtype == torch.int8
-    assert A.ndim == 2, "Only two dimensional matrices are supported for argument B"
-    assert B.ndim in [2, 3], "Only two or three dimensional matrices are supported for argument A"
-    assert prod(shapeB) > 0, f"Input tensor dimensions need to be > 0: {shapeB}"
-    assert out is None or out.dtype == dtype
-
-    shapeC = (*shapeB[:-1], shapeA[0])
-
-    k, m = shapeA
-    n = prod(shapeB[:-1])
-    lda = shapeA[-1]  # Weights (outputs, inputs)
-    ldb = shapeB[-1]  # Activations (batch, tokens, inputs)
-    ldc = shapeC[-1]  # Output (batch, tokens, outputs)
-
-    assert (
-        lda == ldb
-    ), f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}"
-
-    # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
-    # We'll fall back to a slower fp32 calculation in this circumstance.
-    # Fortunately, this should not be very common.
-    if lda % 4 != 0:
-        result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
-        if out is not None:
-            result = out.copy_(result)
-        return result
 
-    if out is None:
-        out = torch.empty(shapeC, device=A.device, dtype=dtype)
-
-    is_on_gpu([A, B, out])
-
-    with _cuda_device_of(A):
-        ctx = CUBLAS_Context.get_instance().get_context(A.device)
-        ptrA = get_ptr(A)
-        ptrB = get_ptr(B)
-        ptrC = get_ptr(out)
-        ptrRowScale = None
-        m = ct.c_int32(m)
-        n = ct.c_int32(n)
-        k = ct.c_int32(k)
-        lda = ct.c_int32(lda)
-        ldb = ct.c_int32(ldb)
-        ldc = ct.c_int32(ldc)
-        stream = _get_tensor_stream(A)
-
-        if dtype == torch.int32:
-            has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-        else:
-            has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
-        if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
-            raise NotImplementedError("int8_linear_matmul not implemented!")
-
-        if has_error:
-            raise RuntimeError(
-                f"cublasLt ran into an error!\n"
-                f"\t{shapeA=}, {shapeB=}, {shapeC=}\n"
-                f"\t{(lda, ldb, ldc)=}\n"
-                f"\t{(m, n, k)=}"
-            )
+@register_fake("bitsandbytes::int8_vectorwise_dequant")
+def _(A: torch.Tensor, stats: torch.Tensor) -> torch.Tensor:
+    torch._check(A.dtype == torch.int8, "A must be int8")
+    return torch.empty_like(A, dtype=torch.float32)
 
-    return out
+
+torch.library.define(
+    "bitsandbytes::int8_mm_dequant",
+    "(Tensor A, Tensor row_stats, Tensor col_stats, Tensor? out, Tensor? bias) -> Tensor",
+)
+
+
+@register_fake("bitsandbytes::int8_mm_dequant")
+def _(
+    A: torch.Tensor,
+    row_stats: torch.Tensor,
+    col_stats: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    torch._check(A.dtype == torch.int32, "A must be int32")
+    return torch.empty_like(A, dtype=torch.float16)
+
+
+torch.library.define(
+    "bitsandbytes::int8_double_quant",
+    "(Tensor A, Tensor? col_stats, Tensor? row_stats, Tensor? out_col, Tensor? out_row, Scalar threshold=0.0) -> (Tensor, Tensor, Tensor, Tensor, Tensor?)",
+)
+
+
+@register_fake("bitsandbytes::int8_double_quant")
+def _(
+    A: torch.Tensor,
+    col_stats: Optional[torch.Tensor] = None,
+    row_stats: Optional[torch.Tensor] = None,
+    out_col: Optional[torch.Tensor] = None,
+    out_row: Optional[torch.Tensor] = None,
+    threshold=0.0,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    out_row = torch.empty_like(A, dtype=torch.int8)
+    out_col = torch.empty_like(A, dtype=torch.int8)
+    row_stats = torch.empty(prod(A.shape[:-1]), device=A.device, dtype=torch.float32)
+    col_stats = torch.empty(A.shape[-1], device=A.device, dtype=torch.float32)
+    outlier_n = torch.library.get_ctx().new_dynamic_size()
+    outlier_cols = A.new_empty(outlier_n, dtype=torch.int64)
+    return out_row, out_col, row_stats, col_stats, outlier_cols
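
Note: the new `int8_vectorwise_quant`, `int8_vectorwise_dequant`, `int8_mm_dequant`, and `int8_double_quant` entries only get schemas and fake implementations here; real kernels live in the backend modules. The data-dependent count of outlier columns is modeled with `torch.library.get_ctx().new_dynamic_size()` so symbolic tracing does not specialize on it. For orientation, a reference sketch of the row-wise absmax math these schemas describe — an assumption based on the LLM.int8() scheme, not the registered kernels:

    # Reference sketch of row-wise absmax int8 quantization / dequantization
    # (assumed LLM.int8() scaling; not the registered kernels).
    import torch


    def int8_vectorwise_quant_ref(A: torch.Tensor, threshold: float = 0.0):
        row_stats = A.float().abs().amax(dim=-1)  # per-row absmax
        out_row = torch.round(A.float() * (127.0 / row_stats.unsqueeze(-1))).to(torch.int8)
        outlier_cols = None
        if threshold > 0.0 and (A.abs() >= threshold).any():
            outlier_cols = torch.argwhere((A.abs() >= threshold).any(dim=0)).view(-1)
        return out_row, row_stats, outlier_cols


    def int8_vectorwise_dequant_ref(A: torch.Tensor, stats: torch.Tensor) -> torch.Tensor:
        # Undo the row-wise scaling: int8 codes back to float32.
        return A.float() * stats.unsqueeze(-1) / 127.0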

bitsandbytes/functional.py

Lines changed: 3 additions & 143 deletions

@@ -2291,88 +2291,7 @@ def int8_linear_matmul(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Ten
     Returns:
         `torch.Tensor`: The result of the operation.
     """
-
-    #
-    # To use the IMMA tensor core kernels without special Turing/Ampere layouts,
-    # cublasLt has some rules, namely: A must be transposed, B must not be transposed.
-    # The C++ API will calculate `C = A.T @ B` in with A, B, C in col-major.
-    # This will typically be used with row-major tensors to efficiently
-    # calculate the linear layer with `C = B @ A.T` without any transformations.
-    # We will swap A and B in the API invocation, so that we get `C = A @ B.T`.
-    #
-    # Quick explanation:
-    # With row-major A and B tensors, `C = A.T.T @ B.T = A @ B.T`.
-    # To get row-major output, `C.T = (A @ B.T).T = B @ A.T`.
-    #
-    A, B = B, A
-
-    shapeA = A.shape
-    shapeB = B.shape
-
-    assert A.dtype == torch.int8
-    assert B.dtype == torch.int8
-    assert A.ndim == 2, "Only two dimensional matrices are supported for argument B"
-    assert B.ndim in [2, 3], "Only two or three dimensional matrices are supported for argument A"
-    assert prod(shapeB) > 0, f"Input tensor dimensions need to be > 0: {shapeB}"
-    assert out is None or out.dtype == dtype
-
-    shapeC = (*shapeB[:-1], shapeA[0])
-
-    k, m = shapeA
-    n = prod(shapeB[:-1])
-    lda = shapeA[-1]  # Weights (outputs, inputs)
-    ldb = shapeB[-1]  # Activations (batch, tokens, inputs)
-    ldc = shapeC[-1]  # Output (batch, tokens, outputs)
-
-    assert (
-        lda == ldb
-    ), f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}"
-
-    # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
-    # We'll fall back to a slower fp32 calculation in this circumstance.
-    # Fortunately, this should not be very common.
-    if lda % 4 != 0:
-        result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
-        if out is not None:
-            result = out.copy_(result)
-        return result
-
-    if out is None:
-        out = torch.empty(shapeC, device=A.device, dtype=dtype)
-
-    is_on_gpu([A, B, out])
-
-    with _cuda_device_of(A):
-        ctx = CUBLAS_Context.get_instance().get_context(A.device)
-        ptrA = get_ptr(A)
-        ptrB = get_ptr(B)
-        ptrC = get_ptr(out)
-        ptrRowScale = None
-        m = ct.c_int32(m)
-        n = ct.c_int32(n)
-        k = ct.c_int32(k)
-        lda = ct.c_int32(lda)
-        ldb = ct.c_int32(ldb)
-        ldc = ct.c_int32(ldc)
-        stream = _get_tensor_stream(A)
-
-        if dtype == torch.int32:
-            has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-        else:
-            has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
-        if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
-            raise NotImplementedError("int8_linear_matmul not implemented!")
-
-        if has_error:
-            raise RuntimeError(
-                f"cublasLt ran into an error!\n"
-                f"\t{shapeA=}, {shapeB=}, {shapeC=}\n"
-                f"\t{(lda, ldb, ldc)=}\n"
-                f"\t{(m, n, k)=}"
-            )
-
-    return out
+    return torch.ops.bitsandbytes.int8_linear_matmul(A, B, out, dtype)
 
 
 def int8_mm_dequant(
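
Note: the wrapper keeps its signature and docstring; only the body changes, so existing callers now go through the dispatcher and pick up whichever device kernel is registered. A usage sketch, assuming a CUDA device with the CUDA backend registered:

    # Usage sketch: the public wrapper now forwards to
    # torch.ops.bitsandbytes.int8_linear_matmul and the dispatcher picks the kernel.
    import torch
    import bitsandbytes.functional as F

    A = torch.randint(-128, 128, (16, 64), dtype=torch.int8, device="cuda")  # activations
    B = torch.randint(-128, 128, (32, 64), dtype=torch.int8, device="cuda")  # weights

    C = F.int8_linear_matmul(A, B)  # C = A @ B.T with an int32 accumulator
    assert C.shape == (16, 32) and C.dtype == torch.int32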
@@ -2394,31 +2313,7 @@ def int8_mm_dequant(
     Returns:
         `torch.Tensor`: The dequantized result with an optional bias, with dtype `torch.float16`.
     """
-
-    assert A.dtype == torch.int32
-
-    if bias is not None:
-        assert bias.dtype == torch.float16
-
-    if out is None:
-        out = torch.empty_like(A, dtype=torch.float16)
-
-    ptrA = get_ptr(A)
-    ptrOut = get_ptr(out)
-    ptrRowStats = get_ptr(row_stats)
-    ptrColStats = get_ptr(col_stats)
-    ptrBias = get_ptr(bias)
-    numRows = ct.c_int32(prod(A.shape[:-1]))
-    numCols = ct.c_int32(A.shape[-1])
-
-    is_on_gpu([A, row_stats, col_stats, out, bias])
-
-    with _cuda_device_of(A):
-        lib.cdequant_mm_int32_fp16(
-            ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
-        )
-
-    return out
+    return torch.ops.bitsandbytes.int8_mm_dequant(A, row_stats, col_stats, out, bias)
 
 
 @deprecated("mm_dequant is deprecated. Please use int8_mm_dequant() instead.", category=FutureWarning)
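
Note: `int8_mm_dequant` likewise becomes a thin wrapper; the `cdequant_mm_int32_fp16` kernel it used to call directly is now reached through the registered op. Roughly, the dequantization rescales the int32 accumulator by the row and column scales over 127² and adds the optional bias — a hedged reference of that math, assumed from the LLM.int8() scheme rather than taken from this diff:

    # Hedged reference of the dequant math (assumed LLM.int8() scaling, fp16 output);
    # the real path dispatches to the registered bitsandbytes::int8_mm_dequant kernel.
    from typing import Optional

    import torch


    def int8_mm_dequant_ref(
        A: torch.Tensor,          # int32 accumulator from int8_linear_matmul
        row_stats: torch.Tensor,  # per-row absmax of the activations
        col_stats: torch.Tensor,  # per-row absmax of the weights (output columns)
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        out = A.float() * row_stats.view(-1, 1) * col_stats.view(1, -1) / (127.0 * 127.0)
        if bias is not None:
            out += bias.float()
        return out.to(torch.float16)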
@@ -2766,42 +2661,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):
         - `torch.Tensor` with dtype `torch.float32`: The quantization scales.
         - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
     """
-
-    assert A.dtype == torch.half
-    is_on_gpu([A])
-
-    rows = prod(A.shape[:-1])
-    cols = A.shape[-1]
-
-    row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
-    out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
-    outlier_cols = None
-
-    if threshold > 0.0:
-        # TODO we could improve perf of this
-        outliers = A.abs() >= threshold
-
-        if outliers.any():
-            outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
-
-    with _cuda_device_of(A):
-        lib.cint8_vector_quant(
-            get_ptr(A),
-            get_ptr(out_row),
-            get_ptr(row_stats),
-            ct.c_float(threshold),
-            ct.c_int32(rows),
-            ct.c_int32(cols),
-            _get_tensor_stream(A),
-        )
-
-    # Zero out values from outlier columns across all rows.
-    # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
-    if rows > 1 and outlier_cols is not None:
-        out_row[:, outlier_cols] = 0
-
-    return out_row, row_stats, outlier_cols
+    return torch.ops.bitsandbytes.int8_vectorwise_quant(A, threshold)
 
 
 @deprecated(
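
Note: putting the three wrappers together, the int8 inference path is: quantize activations and weights row-wise, run the int8 GEMM, then dequantize the int32 accumulator. An end-to-end sketch, assuming a CUDA device and fp16 inputs (as the removed `assert A.dtype == torch.half` suggests):

    # End-to-end sketch of the int8 matmul path exposed by these wrappers.
    import torch
    import bitsandbytes.functional as F

    x = torch.randn(16, 64, dtype=torch.float16, device="cuda")  # activations
    w = torch.randn(32, 64, dtype=torch.float16, device="cuda")  # weights (out_features, in_features)

    x_i8, x_stats, _ = F.int8_vectorwise_quant(x)  # row-wise absmax quantization
    w_i8, w_stats, _ = F.int8_vectorwise_quant(w)

    acc = F.int8_linear_matmul(x_i8, w_i8)        # int32 accumulator, shape (16, 32)
    y = F.int8_mm_dequant(acc, x_stats, w_stats)  # back to fp16
    assert y.shape == (16, 32) and y.dtype == torch.float16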
