import ctypes as ct
from math import prod
from typing import Optional, Tuple

import torch

from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr, is_on_gpu

from ..._ops import register_kernel
from ...cextension import lib


@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
def _(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
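    """Int8 matmul backed by cuBLASLt.

    Given int8 activations ``A`` with shape ``(..., k)`` and int8 weights ``B`` with
    shape ``(n, k)``, this produces an integer result with shape ``(..., n)``, i.e.
    effectively ``A @ B.t()``.

    A minimal usage sketch (an illustration based on the op registration above, not
    an official example):

        A = torch.randint(-128, 127, (4, 32), dtype=torch.int8, device="cuda")   # activations
        B = torch.randint(-128, 127, (16, 32), dtype=torch.int8, device="cuda")  # weights (out, in)
        C = torch.ops.bitsandbytes.int8_linear_matmul(A, B)                      # int32, shape (4, 16)
    """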
    A, B = B, A
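    # The operands are swapped: from here on ``A`` holds the caller's weight matrix
    # (the original ``B``) and ``B`` holds the caller's activations (the original ``A``),
    # which is why the assert messages below refer to the opposite argument names.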

    shapeA = A.shape
    shapeB = B.shape

    assert A.dtype == torch.int8
    assert B.dtype == torch.int8
    assert A.ndim == 2, "Only two dimensional matrices are supported for argument B"
    assert B.ndim in [2, 3], "Only two or three dimensional matrices are supported for argument A"
    assert prod(shapeB) > 0, f"Input tensor dimensions need to be > 0: {shapeB}"
    assert out is None or out.dtype == dtype

    shapeC = (*shapeB[:-1], shapeA[0])

    k, m = shapeA
    n = prod(shapeB[:-1])
    lda = shapeA[-1]  # Weights (outputs, inputs)
    ldb = shapeB[-1]  # Activations (batch, tokens, inputs)
    ldc = shapeC[-1]  # Output (batch, tokens, outputs)

    assert (
        lda == ldb
    ), f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}"

    # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
    # We'll fall back to a slower fp32 calculation in this circumstance.
    # Fortunately, this should not be very common.
    if lda % 4 != 0:
        result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
        if out is not None:
            result = out.copy_(result)
        return result

    if out is None:
        out = torch.empty(shapeC, device=A.device, dtype=dtype)

    is_on_gpu([A, B, out])

    with _cuda_device_of(A):
        ctx = CUBLAS_Context.get_instance().get_context(A.device)
        ptrA = get_ptr(A)
        ptrB = get_ptr(B)
        ptrC = get_ptr(out)
        ptrRowScale = None
        m = ct.c_int32(m)
        n = ct.c_int32(n)
        k = ct.c_int32(k)
        lda = ct.c_int32(lda)
        ldb = ct.c_int32(ldb)
        ldc = ct.c_int32(ldc)
        stream = _get_tensor_stream(A)

        if dtype == torch.int32:
            has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
        else:
            has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)

    if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
        raise NotImplementedError("int8_linear_matmul not implemented!")

    if has_error:
        raise RuntimeError(
            f"cublasLt ran into an error!\n"
            f"\t{shapeA=}, {shapeB=}, {shapeC=}\n"
            f"\t{(lda, ldb, ldc)=}\n"
            f"\t{(m, n, k)=}"
        )

    return out


@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
def _(
    A: torch.Tensor,
    row_stats: torch.Tensor,
    col_stats: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
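    """Dequantize the int32 result of an int8 matmul back to fp16.

    ``row_stats`` holds the per-row absmax of the original activations and ``col_stats``
    the per-output-column absmax of the original weights; the kernel scales roughly by
    ``row_stats * col_stats / (127 * 127)`` and adds the optional fp16 ``bias``.
    """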
    assert A.dtype == torch.int32

    if bias is not None:
        assert bias.dtype == torch.float16

    if out is None:
        out = torch.empty_like(A, dtype=torch.float16)

    ptrA = get_ptr(A)
    ptrOut = get_ptr(out)
    ptrRowStats = get_ptr(row_stats)
    ptrColStats = get_ptr(col_stats)
    ptrBias = get_ptr(bias)
    numRows = ct.c_int32(prod(A.shape[:-1]))
    numCols = ct.c_int32(A.shape[-1])

    is_on_gpu([A, row_stats, col_stats, out, bias])

    with _cuda_device_of(A):
        lib.cdequant_mm_int32_fp16(
            ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
        )

    return out


@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
def _(A: torch.Tensor, threshold=0.0):
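    """Row-wise (per-token) absmax quantization of an fp16 tensor to int8.

    Returns the int8 tensor, the fp32 per-row absmax stats, and, when ``threshold > 0``,
    the indices of columns containing outliers (``|x| >= threshold``); those columns are
    zeroed in the int8 output so they can be handled separately in higher precision.
    """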
    assert A.dtype == torch.half
    is_on_gpu([A])

    rows = prod(A.shape[:-1])
    cols = A.shape[-1]

    row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
    out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)

    outlier_cols = None

    if threshold > 0.0:
        # TODO we could improve perf of this
        outliers = A.abs() >= threshold

        if outliers.any():
            outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)

    with _cuda_device_of(A):
        lib.cint8_vector_quant(
            get_ptr(A),
            get_ptr(out_row),
            get_ptr(row_stats),
            ct.c_float(threshold),
            ct.c_int32(rows),
            ct.c_int32(cols),
            _get_tensor_stream(A),
        )

    # Zero out values from outlier columns across all rows.
    # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
    if rows > 1 and outlier_cols is not None:
        out_row[:, outlier_cols] = 0

    return out_row, row_stats, outlier_cols


@register_kernel("bitsandbytes::int8_double_quant", "cuda")
def _(
    A: torch.Tensor,
    col_stats: Optional[torch.Tensor] = None,
    row_stats: Optional[torch.Tensor] = None,
    out_col: Optional[torch.Tensor] = None,
    out_row: Optional[torch.Tensor] = None,
    threshold=0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
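    """Quantize ``A`` both row-wise (CUDA kernel) and column-wise (PyTorch fallback).

    Returns ``(quant_row, quant_col, row_stats, col_stats, outlier_cols)``. When
    ``threshold > 0``, outliers are excluded from the column-wise statistics.
    """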
    # TODO: Optimize/write CUDA kernel for this?

    # Use CUDA kernel for rowwise and COO tensor
    quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant(A, threshold=threshold)

    # PyTorch impl for colwise
    col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
    if threshold > 0.0 and outlier_mask is not None:
        A = A.masked_fill(outlier_mask, 0.0)
    quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)

    if out_row is not None:
        quant_row = out_row.copy_(quant_row)
    if out_col is not None:
        quant_col = out_col.copy_(quant_col)

    return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols


def _get_col_absmax(
    A: torch.Tensor,
    threshold=0.0,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
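    """Per-column absmax of ``A`` (flattened to 2D), optionally ignoring outliers.

    Returns the fp32 column stats and, when ``threshold > 0``, the boolean mask of
    entries with ``|x| >= threshold`` that were excluded from the stats.
    """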
    assert A.is_floating_point()

    outlier_mask = None

    absA = A.abs().view(-1, A.shape[-1])

    if threshold > 0.0:
        # Filter outliers from stats when enabled
        outlier_mask = absA >= threshold
        absA.masked_fill_(outlier_mask, 0.0)

    # shape [cols]; unsqueeze(0) gives [1,cols]
    col_stats = absA.amax(dim=0, keepdim=False).float()

    return col_stats, outlier_mask
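

# A rough sketch of how these kernels compose into an int8 matmul with dequantization
# (an illustrative assumption based on the ops registered above, not an official recipe):
#
#   x = torch.randn(4, 32, dtype=torch.float16, device="cuda")    # activations
#   w = torch.randn(16, 32, dtype=torch.float16, device="cuda")   # weights (out, in)
#   x_q, x_stats, _ = torch.ops.bitsandbytes.int8_vectorwise_quant(x)
#   w_q, w_stats, _ = torch.ops.bitsandbytes.int8_vectorwise_quant(w)
#   c_i32 = torch.ops.bitsandbytes.int8_linear_matmul(x_q, w_q)
#   y = torch.ops.bitsandbytes.int8_mm_dequant(c_i32, x_stats, w_stats)  # ~ x @ w.t() in fp16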