Commit 6f6c5e1

Merge branch 'main' into main

2 parents 39861c2 + 63f538a

29 files changed: +97 −559 lines

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -103,7 +103,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.8.0", "2.9.0"]
       include:
         - os: ubuntu-22.04
           arch: x86_64
```
.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.2
+    rev: v0.14.3
     hooks:
       - id: ruff
         args:
```

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 ## System Requirements
 bitsandbytes has the following minimum requirements for all platforms:

-* Python 3.9+
+* Python 3.10+
 * [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
   * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
```
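Raising the Python floor to 3.10 is what enables several of the changes below: PEP 604 `X | Y` union syntax in annotations and the keyword form of `importlib.metadata.entry_points`. A minimal sketch of the union-syntax change, with hypothetical function names for illustration:

```python
from typing import Optional, Union

# Pre-3.10 spelling (what this commit removes):
def move_old(device: Optional[Union[int, str]] = None) -> None: ...

# PEP 604 spelling, valid at runtime from Python 3.10 (what this commit adopts):
def move_new(device: Optional[int | str] = None) -> None: ...
```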

benchmarking/matmul_benchmark.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
     B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
     torch.nn.init.xavier_uniform_(B)

-    B_fp4, state = F.quantize_fp4(B)
-    B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
+    _B_fp4, _state = F.quantize_fp4(B)
+    _B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)

     B_nf4, state_nf4 = F.quantize_nf4(B)
     B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
         f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

-    CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
-    CB, SCB, _ = F.int8_vectorwise_quant(B)
+    CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
+    CB, _SCB, _ = F.int8_vectorwise_quant(B)
     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
```
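The renames follow the common lint convention (enforced by the ruff rules this repo runs) that bindings which are assigned but never read get a leading underscore. For context, a small sketch of how `F.int8_vectorwise_quant` is unpacked here, assuming a CUDA device; the shapes and threshold are illustrative:

```python
import torch
import bitsandbytes.functional as F

A = torch.randn(16, 64, dtype=torch.float16, device="cuda")

# Returns (int8 tensor, per-row absmax stats, outlier column indices).
CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=6.0)
print(CA.dtype, CA.shape)  # torch.int8, same shape as A
```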

bitsandbytes/__init__.py

Lines changed: 2 additions & 5 deletions

```diff
@@ -54,10 +54,7 @@ def _import_backends():
     """
     from importlib.metadata import entry_points

-    if sys.version_info < (3, 10):
-        extensions = entry_points().get("bitsandbytes.backends", [])
-    else:
-        extensions = entry_points(group="bitsandbytes.backends")
+    extensions = entry_points(group="bitsandbytes.backends")

     for ext in extensions:
         try:
@@ -75,4 +72,4 @@ def _import_backends():
     "optim.optimizer.MockArgs": False,
 }

-__version__ = "0.48.3.dev0"
+__version__ = "0.49.0.dev0"
```
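With Python 3.9 support dropped, the version branch around `importlib.metadata.entry_points` is unnecessary: the `group` keyword has been available since 3.10. A minimal sketch of consuming the same entry-point group, mirroring what `_import_backends` does:

```python
from importlib.metadata import entry_points

# Each installed package that registered under this group is discovered here.
for ext in entry_points(group="bitsandbytes.backends"):
    try:
        ext.load()  # import the registered backend module/object
    except Exception as err:
        print(f"Could not load backend {ext.name}: {err}")
```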

bitsandbytes/autograd/__init__.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -1 +0,0 @@
-from ._functions import get_inverse_transform_indices, undo_layout
```

bitsandbytes/autograd/_functions.py

Lines changed: 2 additions & 60 deletions

```diff
@@ -1,11 +1,10 @@
 from dataclasses import dataclass
 from math import prod
-from typing import Callable, Optional
+from typing import Optional
 import warnings
 from warnings import warn

 import torch
-from typing_extensions import deprecated

 import bitsandbytes.functional as F

@@ -49,66 +48,9 @@ def get_current_outlier_idx(self):
         return torch.Tensor(list(self.outliers)).to(torch.int64)


-@deprecated(
-    "This function is deprecated and will be removed in a future release.",
-    category=FutureWarning,
-)
-def get_inverse_transform_indices(
-    transform_tile: Callable[[torch.Tensor], torch.Tensor],
-    tile_size: tuple[int, int],
-):
-    """
-    Compute a permutation of indices that invert the specified (tiled) matrix transformation
-
-    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
-    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
-    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
-    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
-    :returns: indices
-    """
-    d1, d2 = tile_size
-    assert 0 < d1 * d2 < 2**64
-    tile_indices = torch.arange(d1 * d2, dtype=torch.int64).view(d1, d2)
-    # encode each position in tile as a tuple of <= 8 unique bytes
-    permuted_tile_indices = torch.zeros_like(tile_indices)
-    for i in range(8):
-        # select i-th byte, apply transformation and trace where each index ended up
-        ith_dim_indices = torch.div(tile_indices, 256**i, rounding_mode="trunc") % 256
-        sample_tile_i = (ith_dim_indices - 128).to(torch.int8).contiguous()
-        assert torch.all(sample_tile_i.int() + 128 == ith_dim_indices), "int overflow"
-        permuted_tile_i = transform_tile(sample_tile_i)
-        ith_permuted_indices = permuted_tile_i.to(tile_indices.dtype) + 128
-        permuted_tile_indices += ith_permuted_indices * (256**i)
-        if d1 * d2 < 256**i:
-            break  # if all indices fit in i bytes, stop early
-    return permuted_tile_indices
-
-
 _is_compiling = torch.compiler.is_compiling


-@deprecated(
-    "This function is deprecated and will be removed in a future release.",
-    category=FutureWarning,
-)
-def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) -> torch.Tensor:
-    """
-    Undo a tiled permutation such as turing or ampere layout
-
-    :param permuted_tensor: torch tensor in a permuted layout
-    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
-    :return: contiguous row-major tensor
-    """
-    (rows, cols), (tile_rows, tile_cols) = permuted_tensor.shape, tile_indices.shape
-    assert rows % tile_rows == cols % tile_cols == 0, "tensor must contain a whole number of tiles"
-    tensor = permuted_tensor.reshape(-1, tile_indices.numel()).t()
-    outputs = torch.empty_like(tensor)  # note: not using .index_copy because it was slower on cuda
-    outputs[tile_indices.flatten()] = tensor
-    outputs = outputs.reshape(tile_rows, tile_cols, cols // tile_cols, rows // tile_rows)
-    outputs = outputs.permute(3, 0, 2, 1)  # (rows // tile_rows, tile_rows), (cols // tile_cols, tile_cols)
-    return outputs.reshape(rows, cols).contiguous()
-
-
 @dataclass
 class MatmulLtState:
     _tile_indices: Optional[torch.Tensor] = None  # TODO: remove
@@ -257,7 +199,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

         req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
-        CAt, subA, A = ctx.tensors
+        CAt, subA, _A = ctx.tensors
         SCAt, idx = ctx.tensor_states
         state: MatmulLtState = ctx.state
         grad_A = grad_B = grad_bias = None
```
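Both removed helpers had carried `typing_extensions.deprecated` markers emitting `FutureWarning`; this commit finishes the removal, which is also why the `deprecated` import goes away. A minimal sketch of that deprecation pattern, using a hypothetical `old_helper`:

```python
from typing_extensions import deprecated


@deprecated(
    "This function is deprecated and will be removed in a future release.",
    category=FutureWarning,
)
def old_helper() -> None:
    """Calling this emits a FutureWarning; type checkers also flag call sites."""


old_helper()  # FutureWarning: This function is deprecated ...
```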

bitsandbytes/backends/utils.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -4,9 +4,10 @@
 import torch

 try:
-    import triton  # noqa: F401
     import triton.language as tl  # noqa: F401

+    import triton  # noqa: F401
+
     triton_available = True
 except ImportError:
     triton_available = False
```

bitsandbytes/functional.py

Lines changed: 3 additions & 99 deletions

```diff
@@ -6,7 +6,7 @@
 import ctypes as ct
 import itertools
 from math import prod
-from typing import Any, Optional, Union
+from typing import Any, Optional

 import numpy as np
 import torch
@@ -1413,7 +1413,7 @@ def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile:
         raise ValueError(f"Gradient type {grad.dtype} not supported!")

     current_gnorm = torch.sqrt(gnorm_vec[step % 100])
-    vals, idx = torch.sort(gnorm_vec)
+    vals, _ = torch.sort(gnorm_vec)
     clip_value = torch.sqrt(vals[percentile])
     gnorm_scale = 1.0

@@ -1795,102 +1795,6 @@ def int8_mm_dequant(
     return result


-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def get_colrow_absmax(
-    A: torch.Tensor,
-    row_stats: Optional[torch.Tensor] = None,
-    col_stats: Optional[torch.Tensor] = None,
-    nnz_block_ptr: Optional[torch.Tensor] = None,
-    threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-    """Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.
-
-    The row-wise and column-wise absmax values are determined.
-
-    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
-
-    <Tip>
-    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
-    The column-wise quantization scales are not typically needed in inference scenarios.
-    </Tip>
-
-    Args:
-        A (`torch.Tensor` with dtype `torch.float16`): Input tensor.
-        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped.
-        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped.
-        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
-        threshold (`float`, *optional*):
-            An optional threshold for sparse decomposition of outlier features.
-            No outliers are held back when 0.0. Defaults to 0.0.
-
-    Returns:
-        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.
-        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
-        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
-        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the input tensor.
-    """
-    assert A.is_floating_point()
-
-    outlier_mask = None
-
-    if row_stats is None or col_stats is None:
-        absA = A.abs().view(-1, A.shape[-1])
-
-        if threshold > 0.0:
-            # Filter outliers from stats when enabled
-            outlier_mask = absA >= threshold
-            absA.masked_fill_(outlier_mask, 0.0)
-
-        if row_stats is None:
-            # shape [rows]; unsqueeze(-1) gives [rows,1]
-            # We have a CUDA kernel for row max, but not yet for cols.
-            row_stats = get_row_absmax(A, threshold)
-
-        if col_stats is None:
-            # shape [cols]; unsqueeze(0) gives [1,cols]
-            col_stats = absA.amax(dim=0, keepdim=False).float()
-
-    return row_stats, col_stats, outlier_mask
-
-
-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def get_row_absmax(A: torch.Tensor, threshold=0.0):
-    """Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.
-
-    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
-
-    Args:
-        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
-        threshold (`float`, *optional*):
-            An optional threshold for sparse decomposition of outlier features.
-            No outliers are held back when 0.0. Defaults to 0.0.
-
-    Returns:
-        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
-    """
-
-    assert A.dtype == torch.float16
-
-    rows = prod(A.shape[:-1])
-    cols = A.shape[-1]
-
-    row_stats = torch.empty((rows,), dtype=torch.float32, device=A.device)
-
-    is_on_gpu([A])
-
-    with _cuda_device_of(A):
-        lib.cget_row_stats(
-            get_ptr(A),
-            get_ptr(row_stats),
-            ct.c_float(threshold),
-            ct.c_int32(rows),
-            ct.c_int32(cols),
-            _get_tensor_stream(A),
-        )
-
-    return row_stats
-
-
 class COOSparseTensor:
     def __init__(
         self, rows: int, cols: int, nnz: int, rowidx: torch.Tensor, colidx: torch.Tensor, values: torch.Tensor
@@ -2059,7 +1963,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):


 def spmm_coo(
-    cooA: Union[COOSparseTensor, torch.Tensor],
+    cooA: COOSparseTensor | torch.Tensor,
     B: torch.Tensor,
     out: Optional[torch.Tensor] = None,
 ):
```
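Beyond the removals, the `percentile_clipping` change is a small readability fix: `torch.sort` returns a `(values, indices)` pair and the index half was never used. A quick sketch of the pattern, with illustrative values:

```python
import torch

gnorm_vec = torch.tensor([3.0, 1.0, 4.0, 1.0, 5.0])

vals, _ = torch.sort(gnorm_vec)   # indices intentionally discarded
clip_value = torch.sqrt(vals[3])  # e.g. an illustrative percentile index
print(vals, clip_value)
```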

bitsandbytes/nn/modules.py

Lines changed: 13 additions & 13 deletions

```diff
@@ -310,28 +310,28 @@ def _quantize(self, device):
     def cpu(self):
         return self.to(device="cpu")

-    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

-    def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

     @overload
     def to(
         self: T,
-        device: Optional[Union[int, device]] = ...,
-        dtype: Optional[Union[dtype, str]] = ...,
+        device: Optional[int | device] = ...,
+        dtype: Optional[dtype | str] = ...,
         non_blocking: bool = ...,
     ) -> T: ...

     @overload
-    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
+    def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

     @overload
     def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

     def to(self, *args, **kwargs):
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

         if device is not None and device.type != "meta" and not self.bnb_quantized:
             return self._quantize(device)
@@ -644,10 +644,10 @@ def _quantize(self, device):
     def cpu(self):
         return self.to(device="cpu")

-    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

-    def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

     def __deepcopy__(self, memo):
@@ -665,19 +665,19 @@ def __deepcopy__(self, memo):
     @overload
     def to(
         self: T,
-        device: Optional[Union[int, device]] = ...,
-        dtype: Optional[Union[dtype, str]] = ...,
+        device: Optional[int | device] = ...,
+        dtype: Optional[dtype | str] = ...,
         non_blocking: bool = ...,
     ) -> T: ...

     @overload
-    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
+    def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

     @overload
     def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

     def to(self, *args, **kwargs):
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

         is_quantized = self.data.dtype == torch.int8

@@ -1048,7 +1048,7 @@ def to(self, *args, **kwargs):
         # Call the parent to() method to handle standard parameter/buffer movement
         result = super().to(*args, **kwargs)

-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)

         # Handle state tensors if needed.
         if device is not None:
```
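The `cuda`/`xpu`/`to` overrides exist so that moving a not-yet-quantized parameter onto an accelerator triggers quantization via `_quantize`. A minimal usage sketch, assuming a CUDA device is available; layer sizes are illustrative:

```python
import torch
import bitsandbytes as bnb

layer = bnb.nn.Linear4bit(128, 256, compute_dtype=torch.float16)
layer = layer.to("cuda")  # overridden to(): weights quantize on this first device move

x = torch.randn(1, 128, dtype=torch.float16, device="cuda")
y = layer(x)  # forward runs against the 4-bit weights
```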
