Merged
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.2
rev: v0.14.3
hooks:
- id: ruff
args:
Expand Down
2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
## System Requirements
bitsandbytes has the following minimum requirements for all platforms:

* Python 3.9+
* Python 3.10+
* [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
* _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._

Expand Down
8 changes: 4 additions & 4 deletions benchmarking/matmul_benchmark.py
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
torch.nn.init.xavier_uniform_(B)

B_fp4, state = F.quantize_fp4(B)
B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
_B_fp4, _state = F.quantize_fp4(B)
_B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)

B_nf4, state_nf4 = F.quantize_nf4(B)
B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
CB, SCB, _ = F.int8_vectorwise_quant(B)
CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
CB, _SCB, _ = F.int8_vectorwise_quant(B)
torch.cuda.synchronize()
t0 = time.time()
for i in range(iters):
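Throughout this PR, unpacked values that are never read are renamed with a leading underscore so ruff's unused-variable checks stay satisfied without noqa comments. A minimal sketch of the pattern, using a hypothetical quantize_stub in place of F.quantize_fp4:

```python
import torch

def quantize_stub(t):
    # hypothetical stand-in for F.quantize_fp4: returns (quantized tensor, state)
    return t.to(torch.int8), {"absmax": t.abs().max()}

B = torch.randn(4, 4)
# The benchmark only needs the side effect of timing the call, so the returned
# values are bound to underscore-prefixed names instead of B_fp4 / state.
_B_fp4, _state = quantize_stub(B)
```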
5 changes: 1 addition & 4 deletions bitsandbytes/__init__.py
@@ -54,10 +54,7 @@ def _import_backends():
"""
from importlib.metadata import entry_points

if sys.version_info < (3, 10):
extensions = entry_points().get("bitsandbytes.backends", [])
else:
extensions = entry_points(group="bitsandbytes.backends")
extensions = entry_points(group="bitsandbytes.backends")

for ext in extensions:
try:
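With Python 3.9 support dropped, the dict-style entry_points() fallback is no longer needed. A minimal sketch of the 3.10+ keyword-selection API that _import_backends now relies on; the error handling here is illustrative, not the exact logic in bitsandbytes/__init__.py:

```python
from importlib.metadata import entry_points

# On Python 3.10+, entry points can be selected by group directly.
for ext in entry_points(group="bitsandbytes.backends"):
    try:
        backend = ext.load()  # import whatever object the entry point references
        print(f"loaded backend {ext.name!r}")
    except Exception as exc:
        print(f"could not load backend {ext.name!r}: {exc}")
```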
5 changes: 3 additions & 2 deletions bitsandbytes/autograd/_functions.py
@@ -1,6 +1,7 @@
from collections.abc import Callable
from dataclasses import dataclass
from math import prod
from typing import Callable, Optional
from typing import Optional
import warnings
from warnings import warn

@@ -257,7 +258,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
CAt, subA, A = ctx.tensors
CAt, subA, _A = ctx.tensors
SCAt, idx = ctx.tensor_states
state: MatmulLtState = ctx.state
grad_A = grad_B = grad_bias = None
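The typing import is tightened here: Callable now comes from collections.abc (the typing alias has been deprecated since Python 3.9), while Optional stays because UP045 is ignored in this repo's ruff config. A small sketch of the resulting import style, with a hypothetical helper purely for illustration:

```python
from collections.abc import Callable
from typing import Optional

def apply_fn(fn: Callable[[int], int], x: Optional[int] = None) -> int:
    # hypothetical helper showing only the annotation style
    return fn(x if x is not None else 0)

print(apply_fn(lambda v: v * 2, 21))  # 42
```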
3 changes: 2 additions & 1 deletion bitsandbytes/backends/utils.py
@@ -4,9 +4,10 @@
import torch

try:
import triton # noqa: F401
import triton.language as tl # noqa: F401

import triton # noqa: F401

triton_available = True
except ImportError:
triton_available = False
6 changes: 3 additions & 3 deletions bitsandbytes/functional.py
@@ -6,7 +6,7 @@
import ctypes as ct
import itertools
from math import prod
from typing import Any, Optional, Union
from typing import Any, Optional

import numpy as np
import torch
@@ -1413,7 +1413,7 @@ def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile:
raise ValueError(f"Gradient type {grad.dtype} not supported!")

current_gnorm = torch.sqrt(gnorm_vec[step % 100])
vals, idx = torch.sort(gnorm_vec)
vals, _ = torch.sort(gnorm_vec)
clip_value = torch.sqrt(vals[percentile])
gnorm_scale = 1.0

@@ -2059,7 +2059,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):


def spmm_coo(
cooA: Union[COOSparseTensor, torch.Tensor],
cooA: COOSparseTensor | torch.Tensor,
B: torch.Tensor,
out: Optional[torch.Tensor] = None,
):
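The Union import can go because PEP 604 unions (X | Y) are valid at runtime on Python 3.10+, both in annotations and with isinstance. A quick sketch, assuming only the standard behavior of the | operator on types:

```python
import torch

TensorOrInt = torch.Tensor | int  # a types.UnionType instance, no typing.Union needed

def first_element(x: TensorOrInt) -> int:
    if isinstance(x, torch.Tensor):  # | unions are accepted by isinstance on 3.10+
        return int(x.flatten()[0])
    return x

print(first_element(torch.arange(3)))  # 0
print(first_element(7))                # 7
```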
26 changes: 13 additions & 13 deletions bitsandbytes/nn/modules.py
@@ -310,28 +310,28 @@ def _quantize(self, device):
def cpu(self):
return self.to(device="cpu")

def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

@overload
def to(
self: T,
device: Optional[Union[int, device]] = ...,
dtype: Optional[Union[dtype, str]] = ...,
device: Optional[int | device] = ...,
dtype: Optional[dtype | str] = ...,
non_blocking: bool = ...,
) -> T: ...

@overload
def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

@overload
def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

if device is not None and device.type != "meta" and not self.bnb_quantized:
return self._quantize(device)
@@ -644,10 +644,10 @@ def _quantize(self, device):
def cpu(self):
return self.to(device="cpu")

def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

def __deepcopy__(self, memo):
@@ -665,19 +665,19 @@ def __deepcopy__(self, memo):
@overload
def to(
self: T,
device: Optional[Union[int, device]] = ...,
dtype: Optional[Union[dtype, str]] = ...,
device: Optional[int | device] = ...,
dtype: Optional[dtype | str] = ...,
non_blocking: bool = ...,
) -> T: ...

@overload
def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

@overload
def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

is_quantized = self.data.dtype == torch.int8

@@ -1048,7 +1048,7 @@ def to(self, *args, **kwargs):
# Call the parent to() method to handle standard parameter/buffer movement
result = super().to(*args, **kwargs)

device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)

# Handle state tensors if needed.
if device is not None:
4 changes: 2 additions & 2 deletions bitsandbytes/optim/optimizer.py
@@ -507,7 +507,7 @@ def update_step(self, group, p, gindex, pindex):
step = state["step"]

if config["percentile_clipping"] < 100:
current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
_current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
grad,
state["gnorm_vec"],
step,
@@ -725,7 +725,7 @@ def update_step(self, group, p, gindex, pindex):
step = state["step"]

if config["percentile_clipping"] < 100:
current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
_current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
grad,
state["gnorm_vec"],
step,
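In both optimizer paths only the scale factor is consumed, so the other two return values are discarded into underscore names. A minimal usage sketch mirroring the call pattern in test_optim.py; it assumes a CUDA device like that test, and the shapes and percentile value are illustrative:

```python
import torch
import bitsandbytes.functional as F

grad = torch.randn(64, 64, device="cuda")
gnorm_vec = torch.zeros(100, device="cuda")  # rolling buffer of squared gradient norms

for step in range(1, 11):
    _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(grad, gnorm_vec, step, 5)
    grad = grad * gnorm_scale  # rescale before the optimizer update
```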
6 changes: 3 additions & 3 deletions bitsandbytes/research/autograd/_functions.py
@@ -307,8 +307,8 @@ def backward(ctx, grad_output):
return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
CAt, subA, A = ctx.tensors
SCAt, idx = ctx.tensor_states
_CAt, _subA, A = ctx.tensors
_SCAt, _idx = ctx.tensor_states
state = ctx.state
grad_A = grad_B = grad_bias = None

@@ -320,7 +320,7 @@ def backward(ctx, grad_output):
if len(grad_output.shape) == 3:
grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()

Cgrad, Cgradt, SCgrad, SCgradt, outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))
_Cgrad, _Cgradt, _SCgrad, _SCgradt, _outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))

if req_gradB:
# print('back A shape', A.shape)
2 changes: 1 addition & 1 deletion bitsandbytes/utils.py
@@ -91,7 +91,7 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False)
zstd = (std - stdm) / stdstd

if topk is not None:
val, idx = torch.topk(std.abs(), k=topk, dim=0)
_, idx = torch.topk(std.abs(), k=topk, dim=0)
else:
idx = torch.where(zstd > zscore)[0]

2 changes: 1 addition & 1 deletion docs/source/installation.mdx
@@ -25,7 +25,7 @@ additional platforms such as AMD ROCm.

These are the minimum requirements for `bitsandbytes` across all platforms. Please be aware that some compute platforms may impose more strict requirements.

* Python >= 3.9
* Python >= 3.10
* PyTorch >= 2.3

## NVIDIA CUDA[[cuda]]
18 changes: 11 additions & 7 deletions pyproject.toml
@@ -11,7 +11,7 @@ maintainers = [
{name="Titus von Köller", email="[email protected]"},
{name="Matthew Douglas", email="[email protected]"}
]
requires-python = ">=3.9"
requires-python = ">=3.10"
readme = "README.md"
license = "MIT"
license-files = ["LICENSE"]
@@ -35,11 +35,11 @@ classifiers = [
"Operating System :: Microsoft :: Windows",
"Programming Language :: C++",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]
dependencies = [
@@ -60,7 +60,7 @@ docs = ["hf-doc-builder==0.5.0"]
dev = [
"bitsandbytes[test]",
"build>=1.0.0,<2",
"ruff==0.11.2",
"ruff~=0.14.3",
"pre-commit>=3.5.0,<4",
"wheel>=0.42,<1"
]
@@ -108,7 +108,7 @@ src = [
"tests",
"benchmarking"
]
target-version = "py39"
target-version = "py310"
line-length = 119

[tool.ruff.lint]
@@ -125,13 +125,14 @@ select = [
ignore = [
"B007", # Loop control variable not used within the loop body (TODO: enable)
"B028", # Warning without stacklevel (TODO: enable)
"B905", # zip without explicit `strict=` kwarg
"E501", # Suppress line-too-long warnings: trust yapf's judgement on this one.
"E701", # Multiple statements on one line (TODO: enable)
"E712", # Allow using if x == False, as it's not always equivalent to if x.
"E731", # Do not use lambda
"RUF012", # Mutable class attribute annotations
"RUF034", # Useless if-else (TODO: enable)
"ISC001", # single-line-implicit-string-concatenation incompatible with formatter
"RUF012",# Mutable class attribute annotations
"RUF034",# Useless if-else (TODO: enable)
"UP045", # Use `X | None` instead of `Optional[X]`
]

[tool.ruff.lint.extend-per-file-ignores]
@@ -145,6 +146,9 @@ ignore = [
"F841",
"UP030",
]
"bitsandbytes/**/triton/**/*.py" = [
"I001", # import order
]

[tool.ruff.lint.isort]
combine-as-imports = true
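Bumping target-version to py310 lets ruff's pyupgrade rules assume 3.10-only syntax across the codebase. A short sketch of the constructs that setting covers; since B905 stays ignored above, strict= on zip remains optional:

```python
def describe(x: int | None) -> str:  # PEP 604 in signatures, no __future__ import needed
    match x:                          # structural pattern matching is 3.10+
        case None:
            return "missing"
        case 0:
            return "zero"
        case _:
            return "nonzero"

pairs = list(zip([1, 2], ["a", "b"], strict=True))  # zip(strict=...) is also 3.10+
print(describe(None), pairs)
```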
2 changes: 1 addition & 1 deletion tests/test_deprecated.py
@@ -52,7 +52,7 @@ def test_percentile_clipping(gtype):
else:
gnorm_vec1[step % 100] = gnorm2

vals, idx = torch.sort(gnorm_vec1)
vals, _ = torch.sort(gnorm_vec1)
clip1 = vals[percentile]

torch.testing.assert_close(gnorm_vec1, torch.sqrt(gnorm_vec2))
8 changes: 4 additions & 4 deletions tests/test_functional.py
@@ -312,7 +312,7 @@ def test_fp8_quant(self, device):
def test_bench_dequantization(self):
a = torch.rand(1024, 1024, device="cuda").half()
code = F.create_fp8_map(True, 3, 0, 4).cuda()
qa, SA = F.quantize_blockwise(a, code=code)
qa, _SA = F.quantize_blockwise(a, code=code)
print(qa.max())

max_theoretical_mu = 1024 * 1024 * 2 / 1024**3 / 672 * 1000 * 1000
@@ -321,7 +321,7 @@ def test_bench_dequantization(self):
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
qa, SA = F.quantize_blockwise(a)
qa, _SA = F.quantize_blockwise(a)
torch.cuda.synchronize()
# print((time.time()-t0)/1e6)

@@ -1004,7 +1004,7 @@ def test_spmm_coo_dequant(self, dim1, dim2, dtype):
torch.nn.init.xavier_uniform_(B)
Bt = B.t().contiguous()

CB, CBt, statsB, statsBt, coo_tensor = F.int8_double_quant(B)
_CB, CBt, _statsB, statsBt, _coo_tensor = F.int8_double_quant(B)

rowidx = torch.randint(0, A.shape[-1], size=(15,))

@@ -1023,7 +1023,7 @@ def test_spmm_coo_dequant(self, dim1, dim2, dtype):

values, counts = torch.unique(cooA.rowidx, return_counts=True)
offset = counts.cumsum(0).int()
max_count, max_idx = torch.sort(counts, descending=True)
max_count, _ = torch.sort(counts, descending=True)
print(torch.median(max_count.float()))

torch.testing.assert_close(out2, out3, rtol=0.05, atol=0.001)
2 changes: 1 addition & 1 deletion tests/test_optim.py
@@ -496,7 +496,7 @@ def test_adam_percentile_clipping(requires_cuda, dim1, dim2, gtype, optim_bits):
g2 = g1.clone()
p2.grad = g2

current_gnorm, clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5)
_current_gnorm, _clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5)
g1 = (g1.float() * gnorm_scale).to(gtype)
p1.grad = g1

4 changes: 2 additions & 2 deletions tests/test_parametrize.py
@@ -246,14 +246,14 @@ def test_error_conditions():
replace_parameter_4bit(module, "nonexistent")

# Test TypeError for non-Parameter attribute
with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn.Parameter"):
with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn\\.Parameter"):
replace_parameter_4bit(module, "not_param")

# Test same errors for prequantized version
with pytest.raises(AttributeError, match="Module does not have parameter 'nonexistent'"):
replace_parameter_4bit_prequantized(module, "nonexistent", {}, torch.device("cpu"))

with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn.Parameter"):
with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn\\.Parameter"):
replace_parameter_4bit_prequantized(module, "not_param", {}, torch.device("cpu"))


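The match strings gain escaped dots because pytest.raises treats match= as a regular expression (checked with re.search), so a bare "." matches any character and could let a wrong message pass. A small sketch of the two equivalent fixes, using a throwaway function for illustration:

```python
import re
import pytest

def boom():
    raise TypeError("Parameter 'not_param' is not an instance of nn.Parameter")

# Escape the dot directly in the pattern...
with pytest.raises(TypeError, match=r"is not an instance of nn\.Parameter"):
    boom()

# ...or let re.escape handle it for the whole literal string.
with pytest.raises(TypeError, match=re.escape("is not an instance of nn.Parameter")):
    boom()
```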