
Commit 45dcd4d

Drop Python 3.9 support (#1795)
1 parent fad47f2 commit 45dcd4d


17 files changed: +54 additions, -51 deletions


.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.2
+    rev: v0.14.3
     hooks:
       - id: ruff
         args:

README.md

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 ## System Requirements
 bitsandbytes has the following minimum requirements for all platforms:
 
-* Python 3.9+
+* Python 3.10+
 * [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
 * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
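
The requirement bump above documents the runtime changes below. As a minimal sketch (not part of this commit), a package that wants to fail fast on an unsupported interpreter can guard at import time; the authoritative constraint normally lives in requires-python in the packaging metadata:

import sys

# Minimal sketch, assuming a fail-fast import guard is wanted; the packaging
# metadata (requires-python) is what installers actually enforce.
if sys.version_info < (3, 10):
    raise RuntimeError("bitsandbytes requires Python 3.10 or newer")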

benchmarking/matmul_benchmark.py

Lines changed: 4 additions & 4 deletions
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
     B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
     torch.nn.init.xavier_uniform_(B)
 
-    B_fp4, state = F.quantize_fp4(B)
-    B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
+    _B_fp4, _state = F.quantize_fp4(B)
+    _B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)
 
     B_nf4, state_nf4 = F.quantize_nf4(B)
     B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
         f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
-    CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
-    CB, SCB, _ = F.int8_vectorwise_quant(B)
+    CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
+    CB, _SCB, _ = F.int8_vectorwise_quant(B)
     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
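
The renames above follow the convention that ruff (and most linters) treat a leading underscore as marking an intentionally unused binding, so the quantization calls still run for the benchmark while the discarded results stop tripping the unused-variable rule. A minimal sketch of the pattern, with an illustrative quantize helper that is not from this commit:

# Illustrative helper: returns quantized codes plus a scale. The caller below
# only needs the codes, so the scale is bound to an underscore-prefixed name,
# which linters recognize as deliberately unused.
def quantize(values: list[float]) -> tuple[list[int], float]:
    scale = max(abs(v) for v in values) or 1.0
    return [round(v / scale * 127) for v in values], scale

codes, _scale = quantize([0.5, -1.0, 0.25])  # _scale is unused on purpose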

bitsandbytes/__init__.py

Lines changed: 1 addition & 4 deletions
@@ -54,10 +54,7 @@ def _import_backends():
     """
     from importlib.metadata import entry_points
 
-    if sys.version_info < (3, 10):
-        extensions = entry_points().get("bitsandbytes.backends", [])
-    else:
-        extensions = entry_points(group="bitsandbytes.backends")
+    extensions = entry_points(group="bitsandbytes.backends")
 
     for ext in extensions:
         try:
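
This hunk is the one that actually depends on dropping Python 3.9: importlib.metadata.entry_points() only accepts selection keywords such as group= from Python 3.10 onward, while on 3.9 it returned a plain dict keyed by group name, hence the deleted fallback. A minimal sketch of the 3.10+ form:

from importlib.metadata import entry_points

# On Python 3.10+, entry_points(group=...) returns just the entry points
# registered under that group (an empty collection if none are installed).
for ep in entry_points(group="bitsandbytes.backends"):
    backend = ep.load()  # imports the object the entry point refers to
    print(f"discovered backend {ep.name}: {backend!r}")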

bitsandbytes/autograd/_functions.py

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
+from collections.abc import Callable
 from dataclasses import dataclass
 from math import prod
-from typing import Callable, Optional
+from typing import Optional
 import warnings
 from warnings import warn
 
@@ -257,7 +258,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
 
         req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
-        CAt, subA, A = ctx.tensors
+        CAt, subA, _A = ctx.tensors
         SCAt, idx = ctx.tensor_states
         state: MatmulLtState = ctx.state
         grad_A = grad_B = grad_bias = None
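
Importing Callable from collections.abc rather than typing follows PEP 585: since Python 3.9 the collections.abc classes are subscriptable in annotations, and the typing aliases are deprecated. A minimal sketch with an illustrative function:

from collections.abc import Callable

# PEP 585 makes collections.abc.Callable usable directly in annotations,
# so the deprecated typing.Callable alias is no longer needed.
def apply_twice(fn: Callable[[int], int], x: int) -> int:
    return fn(fn(x))

print(apply_twice(lambda n: n + 1, 0))  # 2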

bitsandbytes/backends/utils.py

Lines changed: 2 additions & 1 deletion
@@ -4,9 +4,10 @@
 import torch
 
 try:
-    import triton  # noqa: F401
     import triton.language as tl  # noqa: F401
 
+    import triton  # noqa: F401
+
     triton_available = True
 except ImportError:
     triton_available = False
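
The reorder above only satisfies the import sorter; the surrounding try/except is the usual probe for an optional dependency. A minimal sketch of how such a flag tends to be consumed downstream (pick_kernel is illustrative, not from this commit):

try:
    import triton  # noqa: F401

    triton_available = True
except ImportError:
    triton_available = False

def pick_kernel() -> str:
    # Fall back to the eager path when the optional dependency is absent.
    return "triton" if triton_available else "pytorch"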

bitsandbytes/functional.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 import ctypes as ct
 import itertools
 from math import prod
-from typing import Any, Optional, Union
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -1413,7 +1413,7 @@ def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile:
        raise ValueError(f"Gradient type {grad.dtype} not supported!")
 
    current_gnorm = torch.sqrt(gnorm_vec[step % 100])
-    vals, idx = torch.sort(gnorm_vec)
+    vals, _ = torch.sort(gnorm_vec)
    clip_value = torch.sqrt(vals[percentile])
    gnorm_scale = 1.0
 
@@ -2059,7 +2059,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):
 
 
 def spmm_coo(
-    cooA: Union[COOSparseTensor, torch.Tensor],
+    cooA: COOSparseTensor | torch.Tensor,
     B: torch.Tensor,
     out: Optional[torch.Tensor] = None,
 ):
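
The spmm_coo signature now uses PEP 604 union syntax (COOSparseTensor | torch.Tensor), which is only valid at runtime from Python 3.10 onward unless from __future__ import annotations is in effect, so this spelling is another change the 3.9 drop enables. A minimal sketch with illustrative types:

# Illustrative function, not from this commit: on 3.10+, X | Y replaces
# typing.Union[X, Y] and X | None replaces typing.Optional[X].
def describe(value: int | float, label: str | None = None) -> str:
    return f"{label or 'value'} = {value}"

print(describe(3.14, "pi"))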

bitsandbytes/nn/modules.py

Lines changed: 13 additions & 13 deletions
@@ -310,28 +310,28 @@ def _quantize(self, device):
     def cpu(self):
         return self.to(device="cpu")
 
-    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)
 
-    def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)
 
     @overload
     def to(
         self: T,
-        device: Optional[Union[int, device]] = ...,
-        dtype: Optional[Union[dtype, str]] = ...,
+        device: Optional[int | device] = ...,
+        dtype: Optional[dtype | str] = ...,
         non_blocking: bool = ...,
     ) -> T: ...
 
     @overload
-    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
+    def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...
 
     @overload
     def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
 
     def to(self, *args, **kwargs):
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)
 
         if device is not None and device.type != "meta" and not self.bnb_quantized:
             return self._quantize(device)
@@ -644,10 +644,10 @@ def _quantize(self, device):
     def cpu(self):
         return self.to(device="cpu")
 
-    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)
 
-    def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+    def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
         return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)
 
     def __deepcopy__(self, memo):
@@ -665,19 +665,19 @@ def __deepcopy__(self, memo):
     @overload
     def to(
         self: T,
-        device: Optional[Union[int, device]] = ...,
-        dtype: Optional[Union[dtype, str]] = ...,
+        device: Optional[int | device] = ...,
+        dtype: Optional[dtype | str] = ...,
         non_blocking: bool = ...,
     ) -> T: ...
 
     @overload
-    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
+    def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...
 
     @overload
     def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
 
     def to(self, *args, **kwargs):
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)
 
         is_quantized = self.data.dtype == torch.int8
 
@@ -1048,7 +1048,7 @@ def to(self, *args, **kwargs):
         # Call the parent to() method to handle standard parameter/buffer movement
         result = super().to(*args, **kwargs)
 
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+        device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)
 
         # Handle state tensors if needed.
         if device is not None:
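
Note the new annotations deliberately mix styles: Optional[int | device | str] keeps typing.Optional around a PEP 604 union rather than rewriting it as int | device | str | None. On Python 3.10+ the two spellings are equivalent, as in this minimal sketch (illustrative functions, not from the commit):

from typing import Optional

# Equivalent on Python 3.10+: Optional[X | Y] and X | Y | None name the
# same type; the commit only converts the inner Union.
def move(device: Optional[int | str] = None) -> str:
    return f"moving to {device if device is not None else 'default'}"

def move_alt(device: int | str | None = None) -> str:
    return move(device)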

bitsandbytes/optim/optimizer.py

Lines changed: 2 additions & 2 deletions
@@ -507,7 +507,7 @@ def update_step(self, group, p, gindex, pindex):
         step = state["step"]
 
         if config["percentile_clipping"] < 100:
-            current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
+            _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
                 grad,
                 state["gnorm_vec"],
                 step,
@@ -725,7 +725,7 @@ def update_step(self, group, p, gindex, pindex):
         step = state["step"]
 
         if config["percentile_clipping"] < 100:
-            current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
+            _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
                 grad,
                 state["gnorm_vec"],
                 step,
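
Here only gnorm_scale feeds the optimizer update; the current norm and clip value that F.percentile_clipping also returns are now explicitly marked unused. A minimal sketch of the underlying idea, illustrative rather than the library's implementation:

import torch

# Illustrative percentile clipping: compare the current gradient norm against
# a low percentile of recent norms and return only the scale factor.
def gnorm_scale_for(grad: torch.Tensor, history: list[float], pct: int = 5) -> float:
    gnorm = grad.norm().item()
    history.append(gnorm)
    clip = sorted(history)[min(pct, len(history) - 1)]
    return min(1.0, clip / gnorm) if gnorm > 0 else 1.0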

bitsandbytes/research/autograd/_functions.py

Lines changed: 3 additions & 3 deletions
@@ -307,8 +307,8 @@ def backward(ctx, grad_output):
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
 
         req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
-        CAt, subA, A = ctx.tensors
-        SCAt, idx = ctx.tensor_states
+        _CAt, _subA, A = ctx.tensors
+        _SCAt, _idx = ctx.tensor_states
         state = ctx.state
         grad_A = grad_B = grad_bias = None
 
@@ -320,7 +320,7 @@ def backward(ctx, grad_output):
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
-        Cgrad, Cgradt, SCgrad, SCgradt, outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))
+        _Cgrad, _Cgradt, _SCgrad, _SCgradt, _outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))
 
         if req_gradB:
             # print('back A shape', A.shape)

0 commit comments
