Commit bae4ff0

TimDettmers and claude committed
Implement k-bit quantization infrastructure with placeholder kernels
Add quantize_blockwise_kbit and dequantize_blockwise_kbit functions that support k-bit quantization where k is a template parameter (2-8 bits). The implementation follows the existing bitsandbytes architecture:

- Python API with k parameter in functional.py
- PyTorch operation registration in _ops.py
- C interface with template demangling in pythonInterface.cpp
- C++ template dispatch in ops.cu
- CUDA placeholder kernels that return 1.0 for all elements
- Full CUDA backend support, CPU throws NotImplementedError
- Comprehensive test suite in test_kbit_quant.py

The infrastructure is complete and tested. Placeholder kernels can be replaced with actual k-bit quantization logic.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
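A minimal usage sketch of the new Python API, assuming a CUDA device and the functions added in this commit; with the placeholder kernels the dequantized values are not yet meaningful, so the shapes, dtypes, and QuantState fields are the point here.

import torch
import bitsandbytes.functional as F

A = torch.randn(4096, device="cuda", dtype=torch.float16)

# Quantize to 3 bits in blocks of 256 elements; returns uint8 codes plus a QuantState.
q, state = F.quantize_blockwise_kbit(A, k=3, blocksize=256)

# Round-trip back to fp16; the QuantState carries k, code, blocksize, and absmax.
A_dq = F.dequantize_blockwise_kbit(q, k=3, quant_state=state)
print(q.dtype, A_dq.dtype, state.k)  # torch.uint8 torch.float16 3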
1 parent e54dc12 commit bae4ff0


12 files changed, +1377 -0 lines changed


bitsandbytes/_ops.py

Lines changed: 46 additions & 0 deletions
@@ -273,6 +273,52 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
     return out, absmax
 
 
+torch.library.define("bitsandbytes::quantize_blockwise_kbit", "(Tensor A, int k, Tensor code, int blocksize) -> (Tensor, Tensor)")
+
+
+@register_fake("bitsandbytes::quantize_blockwise_kbit")
+def _(A: torch.Tensor, k: int, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+    torch._check_is_size(blocksize)
+    torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    n = A.numel()
+    blocks = -(n // -blocksize)
+    absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+    out = torch.empty_like(A, dtype=torch.uint8)
+    return out, absmax
+
+
+torch.library.define(
+    "bitsandbytes::dequantize_blockwise_kbit",
+    "(Tensor A, int k, Tensor absmax, Tensor code, int blocksize, ScalarType dtype) -> Tensor",
+)
+
+
+@register_fake("bitsandbytes::dequantize_blockwise_kbit")
+def _(A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+    return torch.empty_like(A, dtype=dtype)
+
+
+torch.library.define(
+    "bitsandbytes::dequantize_blockwise_kbit.out",
+    "(Tensor A, int k, Tensor absmax, Tensor code, int blocksize, ScalarType dtype, Tensor! out) -> ()",
+)
+
+
+@register_fake("bitsandbytes::dequantize_blockwise_kbit.out")
+def _(
+    A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+):
+    torch._check_is_size(blocksize)
+    torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+    torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+    torch._check(out.device == A.device, lambda: f"Expected out.device == {A.device}, got {out.device}")
+    torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+
+
 torch.library.define(
     "bitsandbytes::gemv_4bit",
     "(Tensor A, Tensor B, int[] shapeB, Tensor absmax, Tensor code, int blocksize) -> Tensor",
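The fake ("meta") registrations above only describe output shapes and dtypes for tracing; the only arithmetic they encode is the ceiling-division block count. A quick illustration with hypothetical sizes:

# blocks = -(n // -blocksize) is ceiling division: a 4097-element tensor with
# blocksize 4096 needs two absmax entries, one per (possibly partial) block.
n, blocksize = 4097, 4096
blocks = -(n // -blocksize)
assert blocks == 2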

bitsandbytes/backends/cpu/ops.py

Lines changed: 23 additions & 0 deletions
@@ -67,6 +67,29 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
     return out, absmax
 
 
+@register_kernel("bitsandbytes::quantize_blockwise_kbit", "cpu")
+def _(A: torch.Tensor, k: int, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+    raise NotImplementedError("K-bit quantization is not implemented for CPU backend")
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise_kbit", "cpu")
+def _(A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+    raise NotImplementedError("K-bit dequantization is not implemented for CPU backend")
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise_kbit.out", "cpu")
+def _(
+    A: torch.Tensor,
+    k: int,
+    absmax: torch.Tensor,
+    code: torch.Tensor,
+    blocksize: int,
+    dtype: torch.dtype,
+    out: torch.Tensor,
+) -> None:
+    raise NotImplementedError("K-bit dequantization is not implemented for CPU backend")
+
+
 @register_kernel("bitsandbytes::dequantize_blockwise", "cpu")
 def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
     torch._check_is_size(blocksize)
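What the CPU stubs above mean for callers, sketched under the assumption that importing bitsandbytes registers the ops and that dispatch selects the "cpu" kernel for CPU tensors:

import torch
import bitsandbytes  # importing registers the bitsandbytes::* ops

A = torch.randn(128)                 # CPU tensor, float32
code = torch.linspace(-1.0, 1.0, 8)  # illustrative 3-bit code table (hypothetical)
try:
    torch.ops.bitsandbytes.quantize_blockwise_kbit(A, 3, code, 64)
except NotImplementedError as err:
    print(err)  # "K-bit quantization is not implemented for CPU backend"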

bitsandbytes/backends/cuda/ops.py

Lines changed: 96 additions & 0 deletions
@@ -245,6 +245,102 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
     return out, absmax
 
 
+@register_kernel("bitsandbytes::quantize_blockwise_kbit", "cuda")
+def _(A: torch.Tensor, k: int, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+    torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+    torch._check(A.device.type == "cuda", lambda: "Input tensor must be on CUDA device")
+    torch._check(code.device.type == "cuda", lambda: "Code tensor must be on CUDA device")
+    torch._check(code.dtype == torch.float32, lambda: "Code must be float32")
+    torch._check(A.is_contiguous(), lambda: "A must be contiguous")
+    torch._check(code.is_contiguous(), lambda: "Code must be contiguous")
+
+    n = A.numel()
+    blocks = -(n // -blocksize)
+    absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+    out = torch.zeros_like(A, dtype=torch.uint8)
+
+    with torch.cuda.device_of(A):
+        args = (
+            get_ptr(code),
+            get_ptr(A),
+            get_ptr(absmax),
+            get_ptr(out),
+            ct.c_int32(blocksize),
+            ct.c_int(A.numel()),
+        )
+
+        # Call the appropriate k-bit function based on dtype and k value
+        if A.dtype == torch.float16:
+            getattr(lib, f"cquantize_blockwise_fp16_k{k}")(*args)
+        elif A.dtype == torch.bfloat16:
+            getattr(lib, f"cquantize_blockwise_bf16_k{k}")(*args)
+        elif A.dtype == torch.float32:
+            getattr(lib, f"cquantize_blockwise_fp32_k{k}")(*args)
+        else:
+            raise ValueError(f"K-bit quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+    return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise_kbit", "cuda")
+def _(A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+    torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    out = torch.empty_like(A, dtype=dtype)
+    _dequantize_blockwise_kbit_impl(A, k, absmax, code, blocksize, dtype, out=out)
+    return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise_kbit.out", "cuda")
+def _(
+    A: torch.Tensor,
+    k: int,
+    absmax: torch.Tensor,
+    code: torch.Tensor,
+    blocksize: int,
+    dtype: torch.dtype,
+    out: torch.Tensor,
+) -> None:
+    torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+    torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+    _dequantize_blockwise_kbit_impl(A, k, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_kbit_impl(
+    A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+    torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+    torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+    torch._check(
+        dtype in [torch.float16, torch.bfloat16, torch.float32],
+        lambda: f"K-bit dequantization only supports 16/32-bit floats, but got {dtype}",
+    )
+    torch._check(absmax.is_contiguous(), lambda: "Absmax must be contiguous")
+    torch._check(code.is_contiguous(), lambda: "Code must be contiguous")
+
+    with torch.cuda.device_of(A):
+        args = (
+            get_ptr(code),
+            get_ptr(A),
+            get_ptr(absmax),
+            get_ptr(out),
+            ct.c_int32(blocksize),
+            ct.c_int(A.numel()),
+            _get_tensor_stream(A),
+        )
+
+        # Call the appropriate k-bit function based on dtype and k value
+        if dtype == torch.float16:
+            getattr(lib, f"cdequantize_blockwise_fp16_k{k}")(*args)
+        elif dtype == torch.bfloat16:
+            getattr(lib, f"cdequantize_blockwise_bf16_k{k}")(*args)
+        elif dtype == torch.float32:
+            getattr(lib, f"cdequantize_blockwise_fp32_k{k}")(*args)
+        else:
+            raise ValueError(f"K-bit dequantization only supports 16/32-bit floats, but got {dtype}")
+
+
 @register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
 def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
     out = torch.empty_like(A, dtype=dtype)
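The CUDA kernels dispatch by name to per-dtype, per-k C entry points. A small sketch that enumerates the symbols the getattr calls above expect to find; importing lib from bitsandbytes.cextension is an assumption about where the native library handle lives:

from bitsandbytes.cextension import lib

expected = [
    f"c{op}_blockwise_{dt}_k{k}"
    for op in ("quantize", "dequantize")
    for dt in ("fp16", "bf16", "fp32")
    for k in range(2, 9)
]
missing = [name for name in expected if not hasattr(lib, name)]
print(f"{len(expected) - len(missing)}/{len(expected)} k-bit symbols exported")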

bitsandbytes/functional.py

Lines changed: 163 additions & 0 deletions
@@ -407,6 +407,7 @@ class QuantState:
         "nested_blocksize",
         "nested_dtype",
         "nested_offset",
+        "k",
     ]
 
     def __init__(

@@ -419,6 +420,7 @@ def __init__(
         dtype=None,
         offset=None,
         state2=None,
+        k=None,
     ):
         self.absmax = absmax
         self.shape = shape

@@ -428,6 +430,7 @@
         self.quant_type = quant_type
         self.offset = offset
         self.state2 = state2
+        self.k = k
         self.nested = state2 is not None
 
     def __getitem__(self, idx):

@@ -637,6 +640,81 @@ def quantize_blockwise(
     return out, quant_state
 
 
+def quantize_blockwise_kbit(
+    A: torch.Tensor,
+    k: int,
+    code: Optional[torch.Tensor] = None,
+    absmax: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    blocksize=4096,
+    nested=False,
+) -> tuple[torch.Tensor, QuantState]:
+    """Quantize a tensor in blocks using k-bit quantization.
+
+    The input tensor is quantized by dividing it into blocks of `blocksize` values.
+    The absolute maximum value within these blocks is calculated for scaling
+    the k-bit quantization.
+
+    Args:
+        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+        k (`int`): The number of bits for quantization (2-8).
+        code (`torch.Tensor`, *optional*):
+            A mapping describing the k-bit data type. If not provided, a linear map is created.
+        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+        blocksize (`int`, *optional*):
+            The size of the blocks. Defaults to 4096.
+            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+        nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
+
+    Raises:
+        ValueError: Raised when the input data type or k value is not supported.
+
+    Returns:
+        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
+        - `torch.Tensor`: The quantized tensor.
+        - [`QuantState`]: The state object used to undo the quantization.
+    """
+    if k < 2 or k > 8:
+        raise ValueError(f"k must be between 2 and 8, got {k}")
+
+    if code is None:
+        # Create a linear k-bit quantization map
+        code = create_linear_map(signed=True, total_bits=k).to(A.device)
+
+    _out, _absmax = torch.ops.bitsandbytes.quantize_blockwise_kbit.default(
+        A,
+        k,
+        code.to(A.device),
+        blocksize,
+    )
+
+    if nested:
+        offset = _absmax.mean()
+        _absmax -= offset
+        qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
+        quant_state = QuantState(
+            absmax=qabsmax,
+            code=code.to(A.device, copy=True),
+            blocksize=blocksize,
+            dtype=A.dtype,
+            offset=offset,
+            state2=state2,
+            k=k,
+        )
+    else:
+        quant_state = QuantState(absmax=_absmax, code=code.to(A.device, copy=True), blocksize=blocksize, dtype=A.dtype, k=k)
+
+    # TODO(matthewdouglas): Deprecate out kwarg
+    out = out.copy_(_out) if out is not None else _out
+
+    # TODO(matthewdouglas): Deprecate absmax kwarg
+    if absmax is not None:
+        quant_state.absmax = absmax.copy_(quant_state.absmax)
+
+    return out, quant_state
+
+
 def dequantize_blockwise(
     A: torch.Tensor,
     quant_state: Optional[QuantState] = None,

@@ -714,6 +792,91 @@ def dequantize_blockwise(
     )
 
 
+def dequantize_blockwise_kbit(
+    A: torch.Tensor,
+    k: int,
+    quant_state: Optional[QuantState] = None,
+    absmax: Optional[torch.Tensor] = None,
+    code: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    blocksize: int = 4096,
+    nested=False,
+) -> torch.Tensor:
+    """Dequantize a tensor in blocks using k-bit dequantization.
+
+    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
+    The absolute maximum value within these blocks is used for scaling
+    the k-bit dequantization.
+
+    Args:
+        A (`torch.Tensor`): The quantized input tensor.
+        k (`int`): The number of bits used for quantization (2-8).
+        quant_state ([`QuantState`], *optional*):
+            The quantization state as returned by [`quantize_blockwise_kbit`].
+            Required if `absmax` is not provided.
+        absmax (`torch.Tensor`, *optional*):
+            A tensor containing the scaling values.
+            Required if `quant_state` is not provided and ignored otherwise.
+        code (`torch.Tensor`, *optional*):
+            A mapping describing the k-bit data type. If not provided, a linear map is created.
+            Ignored when `quant_state` is provided.
+        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+        blocksize (`int`, *optional*):
+            The size of the blocks. Defaults to 4096.
+            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+            Ignored when `quant_state` is provided.
+
+    Raises:
+        ValueError: Raised when the input data type or k value is not supported.
+
+    Returns:
+        `torch.Tensor`:
+            The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
+    """
+    if k < 2 or k > 8:
+        raise ValueError(f"k must be between 2 and 8, got {k}")
+
+    assert quant_state is not None or absmax is not None
+    if code is None and quant_state is None:
+        # Create a linear k-bit quantization map
+        code = create_linear_map(signed=True, total_bits=k).to(A.device)
+
+    if quant_state is None:
+        quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32, k=k)
+
+    absmax = quant_state.absmax
+    if quant_state.nested:
+        absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+        absmax += quant_state.offset
+        if absmax.dtype != torch.float32:
+            absmax = absmax.float()
+
+    # Prefer the k stored in quant_state when available
+    if getattr(quant_state, "k", None) is not None:
+        k = quant_state.k
+
+    if out is not None:
+        torch.ops.bitsandbytes.dequantize_blockwise_kbit.out(
+            A,
+            k,
+            absmax,
+            quant_state.code.to(A.device),
+            quant_state.blocksize,
+            quant_state.dtype,
+            out=out,
+        )
+        return out
+
+    return torch.ops.bitsandbytes.dequantize_blockwise_kbit.default(
+        A,
+        k,
+        absmax,
+        quant_state.code.to(A.device),
+        quant_state.blocksize,
+        quant_state.dtype,
+    )
+
+
 def get_4bit_type(typename, device=None, blocksize=64):
     if device is None:
         device = "cuda"
