bitsandbytes-foundation
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/python-package.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarking/int8/int8_benchmark.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarking/int8/int8_benchmark.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarking/matmul_benchmark.py‎
Lines changed: 10 additions & 8 deletions b/‎benchmarking/matmul_benchmark.py‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎bitsandbytes/_ops.py‎
Lines changed: 5 additions & 4 deletions b/‎bitsandbytes/_ops.py‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎bitsandbytes/autograd/_functions.py‎
Lines changed: 2 additions & 2 deletions b/‎bitsandbytes/autograd/_functions.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎bitsandbytes/backends/cpu/ops.py‎
Lines changed: 3 additions & 3 deletions b/‎bitsandbytes/backends/cpu/ops.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 7 additions & 9 deletions b/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎bitsandbytes/cuda_specs.py‎
Lines changed: 5 additions & 5 deletions b/‎bitsandbytes/cuda_specs.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎bitsandbytes/diagnostics/cuda.py‎
Lines changed: 2 additions & 2 deletions b/‎bitsandbytes/diagnostics/cuda.py‎
Lines changed: 2 additions & 2 deletions
@@ -111,7 +111,7 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         # The specific Python version is irrelevant in this context as we are only packaging non-C extension
-        # code. This ensures compatibility across Python versions, including Python 3.8, as compatibility is
+        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
         # dictated by the packaged code itself, not the Python version used for packaging.
         python-version: ["3.10"]
         arch: [x86_64, aarch64]
 
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.11.2
     hooks:
       - id: ruff
         args:
 
@@ -65,4 +65,4 @@
 print("=" * 40)
 print(f"Example:\n{tokenizer.decode(generated_ids[0])}")
 print("=" * 40)
-print(f"Speed: {num/(time.time() - time_1)}token/s")
+print(f"Speed: {num / (time.time() - time_1)}token/s")
@@ -66,7 +66,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print(
-        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s",
+        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s",
     )
 
     # torch.cuda.synchronize()
@@ -88,22 +88,24 @@ def test_bench_matmul(batch, seq, model, hidden):
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4)
     torch.cuda.synchronize()
-    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s")
 
     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4_c.t(), quant_state=state_nf4_c)
     torch.cuda.synchronize()
-    print(f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(
+        f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
+    )
 
     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul(A, B)
     torch.cuda.synchronize()
     print(
-        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
     torch.cuda.synchronize()
@@ -112,7 +114,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         bnb.matmul(A, B, threshold=6.0)
     torch.cuda.synchronize()
     print(
-        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
     CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
@@ -124,7 +126,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         out32 = F.int8_linear_matmul(CA, CB)
     torch.cuda.synchronize()
     print(
-        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
     # C32A, SA = F.transform(CA, "col32")
@@ -183,7 +185,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linear8bit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
     linearMixedBit(A)
@@ -193,7 +195,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linearMixedBit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
     # linear8bit_train(A)
 
@@ -1,5 +1,6 @@
+from collections.abc import Sequence
 from math import prod
-from typing import Optional, Sequence, Tuple
+from typing import Optional
 
 import torch
 
@@ -131,7 +132,7 @@ def _(
 def _(
     A: torch.Tensor,
     threshold=0.0,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     out_row = torch.empty_like(A, dtype=torch.int8)
     out_col = torch.empty_like(A, dtype=torch.int8)
     row_stats = torch.empty(prod(A.shape[:-1]), device=A.device, dtype=torch.float32)
@@ -191,7 +192,7 @@ def _(
 @register_fake("bitsandbytes::quantize_4bit")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
 
     n = A.numel()
@@ -235,7 +236,7 @@ def _(
 
 
 @register_fake("bitsandbytes::quantize_blockwise")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     n = A.numel()
     blocks = -(n // -blocksize)
 
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from math import prod
-from typing import Callable, Optional, Tuple
+from typing import Callable, Optional
 import warnings
 from warnings import warn
 
@@ -55,7 +55,7 @@ def get_current_outlier_idx(self):
 )
 def get_inverse_transform_indices(
     transform_tile: Callable[[torch.Tensor], torch.Tensor],
-    tile_size: Tuple[int, int],
+    tile_size: tuple[int, int],
 ):
     """
     Compute a permutation of indices that invert the specified (tiled) matrix transformation
 
@@ -1,5 +1,5 @@
 import ctypes as ct
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 
@@ -47,7 +47,7 @@ def _(
 
 
 @register_kernel("bitsandbytes::quantize_blockwise", "cpu")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     torch._check(A.dtype == torch.float32, lambda: f"A must be float32 on cpu, got {A.dtype}")
 
@@ -116,7 +116,7 @@ def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int,
 @register_kernel("bitsandbytes::quantize_4bit", "cpu")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")
 
 
@@ -1,6 +1,7 @@
+from collections.abc import Sequence
 import ctypes as ct
 from math import prod
-from typing import Optional, Sequence, Tuple
+from typing import Optional
 
 import torch
 
@@ -78,10 +79,7 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
             raise NotImplementedError("int8_linear_matmul not implemented!")
         else:
             raise RuntimeError(
-                f"cublasLt ran into an error!\n"
-                f"\t{shapeA=}, {shapeB=}, {shapeC=}\n"
-                f"\t{(lda, ldb, ldc)=}\n"
-                f"\t{(m, n, k)=}"
+                f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
             )
 
     return out
@@ -169,7 +167,7 @@ def _(A: torch.Tensor, threshold=0.0):
 def _(
     A: torch.Tensor,
     threshold=0.0,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     # Use CUDA kernel for rowwise and COO tensor
     quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
         A,
@@ -188,7 +186,7 @@ def _(
 def _get_col_absmax(
     A: torch.Tensor,
     threshold=0.0,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     torch._check(A.is_floating_point())
 
     outlier_mask = None
@@ -207,7 +205,7 @@ def _get_col_absmax(
 
 
 @register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
     torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
@@ -292,7 +290,7 @@ def _dequantize_blockwise_impl(
 @register_kernel("bitsandbytes::quantize_4bit", "cuda")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
     torch._check(quant_type in ["fp4", "nf4"])
     torch._check(
 
@@ -1,27 +1,27 @@
 import dataclasses
 from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import Optional
 
 import torch
 
 
 @dataclasses.dataclass(frozen=True)
 class CUDASpecs:
-    highest_compute_capability: Tuple[int, int]
+    highest_compute_capability: tuple[int, int]
     cuda_version_string: str
-    cuda_version_tuple: Tuple[int, int]
+    cuda_version_tuple: tuple[int, int]
 
     @property
     def has_imma(self) -> bool:
         return torch.version.hip or self.highest_compute_capability >= (7, 5)
 
 
-def get_compute_capabilities() -> List[Tuple[int, int]]:
+def get_compute_capabilities() -> list[tuple[int, int]]:
     return sorted(torch.cuda.get_device_capability(torch.cuda.device(i)) for i in range(torch.cuda.device_count()))
 
 
 @lru_cache(None)
-def get_cuda_version_tuple() -> Tuple[int, int]:
+def get_cuda_version_tuple() -> tuple[int, int]:
     if torch.version.cuda:
         return map(int, torch.version.cuda.split(".")[0:2])
     elif torch.version.hip:
 
@@ -1,7 +1,7 @@
+from collections.abc import Iterable, Iterator
 import logging
 import os
 from pathlib import Path
-from typing import Dict, Iterable, Iterator
 
 import torch
 
@@ -76,7 +76,7 @@ def is_relevant_candidate_env_var(env_var: str, value: str) -> bool:
     )
 
 
-def get_potentially_lib_path_containing_env_vars() -> Dict[str, str]:
+def get_potentially_lib_path_containing_env_vars() -> dict[str, str]:
     return {env_var: value for env_var, value in os.environ.items() if is_relevant_candidate_env_var(env_var, value)}