
Commit bdab075

rebase main branch

Signed-off-by: jiqing-feng <[email protected]>
Merge commit, 2 parents: 622c0ab + 39dd847

File tree: 12 files changed, +25 -35 lines


bitsandbytes/autograd/_functions.py

Lines changed: 3 additions & 3 deletions
@@ -422,9 +422,9 @@ def matmul(
     if threshold > 0.0:
         state.threshold = threshold
     # MatMul8bitLt is slower because no fast kernel for quant/dequant 8bit in CPU/XPU
-    if state.is_training and A.device.type in ("cpu", "xpu"):
-        return MatMul8bitFp.apply(A, B, out, bias, state)
-
+    if state.is_training:
+        if A.device.type in ("cpu", "xpu"):
+            return MatMul8bitFp.apply(A, B, out, bias, state)
     return MatMul8bitLt.apply(A, B, out, bias, state)
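
Note: the restructured condition is behaviour-preserving. Training on CPU or XPU still takes the MatMul8bitFp fallback (per the comment, there is no fast 8-bit quant/dequant kernel on those devices), while CUDA and all inference calls keep going through MatMul8bitLt. A minimal, self-contained sketch of that routing (plain Python for illustration, not the bitsandbytes API; the helper name pick_matmul_path is hypothetical):

def pick_matmul_path(is_training: bool, device_type: str) -> str:
    # Hypothetical helper mirroring the dispatch in matmul() above.
    if is_training:
        if device_type in ("cpu", "xpu"):
            # No fast 8-bit quant/dequant kernel on CPU/XPU -> fp fallback.
            return "MatMul8bitFp"
    return "MatMul8bitLt"

assert pick_matmul_path(True, "xpu") == "MatMul8bitFp"
assert pick_matmul_path(False, "xpu") == "MatMul8bitLt"
assert pick_matmul_path(True, "cuda") == "MatMul8bitLt"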

bitsandbytes/backends/utils.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
     import triton.language as tl  # noqa: F401
 
     triton_available = True
-except ImportError as e:
+except ImportError:
     triton_available = False

bitsandbytes/backends/xpu/ops.py

Lines changed: 11 additions & 0 deletions
@@ -2,6 +2,7 @@
 import ctypes as ct
 import logging
 
+from packaging import version
 import torch
 
 from bitsandbytes.functional import _get_tensor_stream, get_ptr
@@ -12,6 +13,16 @@
 
 logger = logging.getLogger(__name__)
 
+# _int_mm is available in torch starting from 2.9 version
+if version.parse(torch.__version__).release >= version.parse("2.9").release:
+
+    @register_kernel("bitsandbytes::int8_linear_matmul", "xpu")
+    def _(A: torch.Tensor, B: torch.Tensor):
+        return torch._int_mm(
+            A.reshape(-1, A.shape[-1]),
+            B.t(),
+        ).reshape(*A.shape[:-1], B.shape[0])
+
 
 def _dequantize_4bit_impl(
     A: torch.Tensor,
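
Per the comment above, the kernel is only registered when the installed torch is at least 2.9, where torch._int_mm can be used for this op on XPU. The shape contract it implements: A is (..., K) int8, B is (N, K) int8, and the result is (..., N) int32 (A is flattened to 2-D, multiplied against B.t(), then the leading dimensions are restored). A rough reference sketch, assuming those shapes and emulating torch._int_mm with a plain int32 matmul so it runs on CPU with any recent torch (int8_linear_matmul_ref is a made-up name, not a bitsandbytes function):

import torch

def int8_linear_matmul_ref(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    # Flatten leading dims, matmul against B.t() with int32 accumulation,
    # then restore the leading dims -- same reshape pattern as the kernel above.
    A2d = A.reshape(-1, A.shape[-1])                    # (M, K), M = prod of leading dims
    out = A2d.to(torch.int32) @ B.t().to(torch.int32)   # (M, N)
    return out.reshape(*A.shape[:-1], B.shape[0])       # (..., N)

A = torch.randint(-128, 128, (2, 4, 8), dtype=torch.int8)  # activations (..., K)
B = torch.randint(-128, 128, (16, 8), dtype=torch.int8)    # weight (N, K)
print(int8_linear_matmul_ref(A, B).shape)                   # torch.Size([2, 4, 16])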

bitsandbytes/functional.py

Lines changed: 2 additions & 13 deletions
@@ -242,7 +242,6 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8)
     assert e + p == total_bits - has_sign
     # the exponent is biased to 2^(e-1) -1 == 0
     evalues = []
-    pvalues = []
     for i, val in enumerate(range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign), 1)):
         evalues.append(2**val)
@@ -1357,8 +1356,6 @@ def optimizer_update_8bit_blockwise(
     gnorm_scale: float = 1.0,
     skip_zeros=False,
 ) -> None:
-    optim_func = None
-
     is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
 
     torch.ops.bitsandbytes.optimizer_update_8bit_blockwise(
@@ -2089,7 +2086,7 @@ def spmm_coo(
     assert cooA.values.numel() == nnz
     assert cooA.cols == B.shape[0]
 
-    transposed_B = False if B.is_contiguous() else True
+    transposed_B = not B.is_contiguous()
 
     ldb = B.stride()[(1 if transposed_B else 0)]
     ldc = B.shape[1]
@@ -2138,12 +2135,7 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
     assert cooA.values.numel() == nnz
     assert cooA.cols == B.shape[0], f"{cooA.cols} vs {B.shape}"
 
-    transposed_B = False if B.is_contiguous() else True
-
-    ldb = B.stride()[(1 if transposed_B else 0)]
-    ldc = B.shape[1]
-
-    values, counts = torch.unique(cooA.rowidx, return_counts=True)
+    _, counts = torch.unique(cooA.rowidx, return_counts=True)
     offset = counts.cumsum(0).int()
     max_count, max_idx = torch.sort(counts, descending=True)
     max_idx = max_idx.int()
@@ -2163,11 +2155,8 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
     cnnz_rows = ct.c_int32(counts.numel())
     cnnz = ct.c_int32(cooA.nnz)
     crowsA = ct.c_int32(cooA.rows)
-    ccolsA = ct.c_int32(cooA.cols)
     crowsB = ct.c_int32(B.shape[1])
     ccolsB = ct.c_int32(B.shape[1])
-    cldb = ct.c_int32(ldb)
-    cldc = ct.c_int32(ldc)
 
     with _cuda_device_of(B):
         is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out, dequant_stats])
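
In the create_fp8_map hunk above, the deleted pvalues list was never used; the surviving loop only builds the exponent table 2**val for val in the range [-2**(exponent_bits - has_sign), 2**(exponent_bits - has_sign)). A tiny illustration of just that loop, with small assumed parameters (exponent_bits=3, signed, not the defaults in the diff):

exponent_bits, has_sign = 3, 1  # assumed toy values for illustration
evalues = [2**val for val in range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign))]
print(evalues)  # [0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8]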

bitsandbytes/nn/modules.py

Lines changed: 2 additions & 2 deletions
@@ -476,7 +476,7 @@ def __init__(
         )
         # self.persistent_buffers = [] # TODO consider as way to save quant state
         self.compute_dtype = compute_dtype
-        self.compute_type_is_set = False if compute_dtype is None else True
+        self.compute_type_is_set = compute_dtype is not None
         self.quant_state = None
         self.quant_storage = quant_storage
@@ -1117,4 +1117,4 @@ def forward(self, x):
         if self.weight.CB is not None:
             self.init_8bit_state()
 
-        out = bnb.matmul_mixed(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias
+        return bnb.matmul_mixed(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias

bitsandbytes/optim/lars.py

Lines changed: 0 additions & 3 deletions
@@ -231,9 +231,6 @@ def step(self, closure=None):
                 loss = closure()
 
         for group in self.param_groups:
-            params_with_grad = []
-            d_p_list = []
-            momentum_buffer_list = []
             weight_decay = group["weight_decay"]
             momentum = group["momentum"]
             dampening = group["dampening"]

bitsandbytes/optim/optimizer.py

Lines changed: 0 additions & 2 deletions
@@ -272,8 +272,6 @@ def step(self, closure=None):
             with torch.enable_grad():
                 loss = closure()
 
-        overflows = []
-
         if not self.initialized:
             self.check_overrides()
             self.to_gpu()  # needed for fairseq pure fp16 training

bitsandbytes/research/autograd/_functions.py

Lines changed: 1 addition & 1 deletion
@@ -235,7 +235,7 @@ def forward(ctx, A, B, out=None, bias=None, state: Optional[MatmulLtState] = Non
         # 2. Quantize B
         if state.has_fp16_weights:
             # print('B shape', B.shape)
-            has_grad = True if (getattr(B, "grad", None) is not None) else False
+            has_grad = getattr(B, "grad", None) is not None
             is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1)
             if is_transposed:
                 B = B.contiguous()

bitsandbytes/utils.py

Lines changed: 0 additions & 5 deletions
@@ -84,11 +84,6 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False)
     if rdm:
         return torch.randint(0, weight.shape[1], size=(topk,), device=weight.device).long()
 
-    m = weight.mean(reduction_dim)
-    mm = m.mean()
-    mstd = m.std()
-    zm = (m - mm) / mstd
-
     std = weight.std(reduction_dim)
     stdm = std.mean()
     stdstd = std.std()

install_cuda.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def main():
 
     # Install CUDA version(s)
     if version == "all":
-        for ver in cuda_versions.keys():
+        for ver in cuda_versions:
             install_cuda(ver, base_path, download_path)
     elif version in cuda_versions:
         install_cuda(version, base_path, download_path)
