
Commit f2029c6 (merge of parents c5e1894 and 3f9f6f3)

rebase

Signed-off-by: jiqing-feng <[email protected]>

33 files changed: +354 −681 lines

.github/workflows/python-package.yml (3 additions, 0 deletions)

@@ -171,6 +171,9 @@ jobs:
       retention-days: 7
 
   build-wheels:
+    env:
+      # Skip rebuilding the CPU library when building the wheels.
+      BNB_SKIP_CMAKE: 1
     needs:
       - build-cpu
       - build-cuda
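
Note: BNB_SKIP_CMAKE is consumed at pip-install time so the wheel jobs reuse the native libraries built earlier in the workflow. A minimal sketch of how a build script might honor such a flag; the helper below is illustrative, not the project's actual setup code:

import os
import subprocess

def maybe_build_native_library() -> None:
    # Hypothetical helper: skip the CMake step entirely when CI has already
    # built the shared library and attached it as an artifact.
    if os.environ.get("BNB_SKIP_CMAKE", "0") == "1":
        print("BNB_SKIP_CMAKE=1: reusing prebuilt native library")
        return
    subprocess.check_call(["cmake", "-B", "build", "-S", "."])
    subprocess.check_call(["cmake", "--build", "build", "--config", "Release"])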

.github/workflows/tests.yml (11 additions, 6 deletions)

@@ -10,6 +10,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  # Skip rebuilding the CPU library when installing the wheels.
+  # We build the libraries in separate jobs and upload as artifacts.
+  BNB_SKIP_CMAKE: 1
+
 jobs:
 
   build-cpu:
@@ -103,7 +108,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.8.0", "2.9.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -146,7 +151,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
@@ -188,7 +193,7 @@ jobs:
      - name: Install dependencies
        run: |
          pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
          pip install pytest-cov
 
      - name: Show installed packages
@@ -263,7 +268,7 @@ jobs:
 
      - name: Install dependencies
        run: |
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
          pip install pytest-cov
 
      - name: Show installed packages
@@ -321,7 +326,7 @@ jobs:
 
      - name: Install dependencies
        run: |
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
          pip install pytest-cov
 
      - name: Show installed packages
@@ -438,7 +443,7 @@ jobs:
      - name: Install dependencies
        run: |
          pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
          pip install pytest-cov
      - name: Show installed packages
        run: pip list

CMakeLists.txt (37 additions, 20 deletions)

@@ -85,6 +85,7 @@ endif()
 if (BUILD_CPU)
     set(CMAKE_CXX_STANDARD 17)
     set(CMAKE_CXX_STANDARD_REQUIRED ON)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
     find_package(OpenMP)
 endif()
 
@@ -270,30 +271,46 @@ target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc include)
 
 if (BUILD_CPU)
-    target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
-    include(CheckCXXCompilerFlag)
-    check_cxx_compiler_flag(-mavx512f HAS_AVX512F)
-    check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16)
-    check_cxx_compiler_flag(-mavx512dq HAS_AVX512DQ)
-    check_cxx_compiler_flag(-mavx512bw HAS_AVX512BW)
-    check_cxx_compiler_flag(-mavx512vl HAS_AVX512VL)
-    if(HAS_AVX512F)
-        target_compile_options(bitsandbytes PRIVATE -mavx512f)
+    if (OpenMP_CXX_FOUND)
+        target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
+        add_definitions(-DHAS_OPENMP)
     endif()
-    if(HAS_AVX512BF16)
-        target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
-    endif()
-    if(HAS_AVX512DQ)
-        target_compile_options(bitsandbytes PRIVATE -mavx512dq)
-    endif()
-    if(HAS_AVX512BW)
-        target_compile_options(bitsandbytes PRIVATE -mavx512bw)
-    endif()
-    if(HAS_AVX512VL)
-        target_compile_options(bitsandbytes PRIVATE -mavx512vl)
+
+    if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))
+        include(CheckCXXCompilerFlag)
+        check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
+        check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
+        check_cxx_compiler_flag(-mavx512dq HAS_AVX512DQ)
+        check_cxx_compiler_flag(-mavx512bw HAS_AVX512BW)
+        check_cxx_compiler_flag(-mavx512vl HAS_AVX512VL)
+        if (HAS_AVX512F_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512f)
+        endif()
+        if (HAS_AVX512BF16_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
+        endif()
+        if(HAS_AVX512DQ)
+            target_compile_options(bitsandbytes PRIVATE -mavx512dq)
+        endif()
+        if(HAS_AVX512BW)
+            target_compile_options(bitsandbytes PRIVATE -mavx512bw)
+        endif()
+        if(HAS_AVX512VL)
+            target_compile_options(bitsandbytes PRIVATE -mavx512vl)
+        endif()
+        target_compile_options(
+            bitsandbytes PRIVATE
+            -mprefer-vector-width=256
+            -mfma
+            -mavx2
+            -mlzcnt
+            -mbmi
+            -mbmi2
+        )
     endif()
 endif()
 
+
 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
     target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse)
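
Note: the AVX512 flag probes are now gated on the host architecture, so non-x86 and MSVC builds no longer test or apply irrelevant compiler flags, while x86-64 builds always get the AVX2/FMA/BMI baseline. A hedged runtime sketch of the same distinction, assuming torch's CPU-capability report is a fair proxy for what the host supports (this probe is illustrative and not how bitsandbytes selects kernels):

import platform

import torch

# Mirrors the CMake gate above: AVX flags only matter on x86-64 hosts.
arch = platform.machine().lower()
if arch in ("x86_64", "amd64"):
    print("x86-64 host; torch CPU capability:", torch.backends.cpu.get_cpu_capability())
else:
    print(f"non-x86 host ({arch}); the AVX flags above are not applied")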

bitsandbytes/autograd/__init__.py (0 additions, 1 deletion)

@@ -1 +0,0 @@
-from ._functions import get_inverse_transform_indices, undo_layout

bitsandbytes/autograd/_functions.py (1 addition, 60 deletions)

@@ -1,12 +1,10 @@
-from collections.abc import Callable
 from dataclasses import dataclass
 from math import prod
 from typing import Optional
 import warnings
 from warnings import warn
 
 import torch
-from typing_extensions import deprecated
 
 import bitsandbytes.functional as F
 
@@ -50,66 +48,9 @@ def get_current_outlier_idx(self):
         return torch.Tensor(list(self.outliers)).to(torch.int64)
 
 
-@deprecated(
-    "This function is deprecated and will be removed in a future release.",
-    category=FutureWarning,
-)
-def get_inverse_transform_indices(
-    transform_tile: Callable[[torch.Tensor], torch.Tensor],
-    tile_size: tuple[int, int],
-):
-    """
-    Compute a permutation of indices that invert the specified (tiled) matrix transformation
-
-    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
-    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
-    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
-    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
-    :returns: indices
-    """
-    d1, d2 = tile_size
-    assert 0 < d1 * d2 < 2**64
-    tile_indices = torch.arange(d1 * d2, dtype=torch.int64).view(d1, d2)
-    # encode each position in tile as a tuple of <= 8 unique bytes
-    permuted_tile_indices = torch.zeros_like(tile_indices)
-    for i in range(8):
-        # select i-th byte, apply transformation and trace where each index ended up
-        ith_dim_indices = torch.div(tile_indices, 256**i, rounding_mode="trunc") % 256
-        sample_tile_i = (ith_dim_indices - 128).to(torch.int8).contiguous()
-        assert torch.all(sample_tile_i.int() + 128 == ith_dim_indices), "int overflow"
-        permuted_tile_i = transform_tile(sample_tile_i)
-        ith_permuted_indices = permuted_tile_i.to(tile_indices.dtype) + 128
-        permuted_tile_indices += ith_permuted_indices * (256**i)
-        if d1 * d2 < 256**i:
-            break  # if all indices fit in i bytes, stop early
-    return permuted_tile_indices
-
-
 _is_compiling = torch.compiler.is_compiling
 
 
-@deprecated(
-    "This function is deprecated and will be removed in a future release.",
-    category=FutureWarning,
-)
-def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) -> torch.Tensor:
-    """
-    Undo a tiled permutation such as turing or ampere layout
-
-    :param permuted_tensor: torch tensor in a permuted layout
-    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
-    :return: contiguous row-major tensor
-    """
-    (rows, cols), (tile_rows, tile_cols) = permuted_tensor.shape, tile_indices.shape
-    assert rows % tile_rows == cols % tile_cols == 0, "tensor must contain a whole number of tiles"
-    tensor = permuted_tensor.reshape(-1, tile_indices.numel()).t()
-    outputs = torch.empty_like(tensor)  # note: not using .index_copy because it was slower on cuda
-    outputs[tile_indices.flatten()] = tensor
-    outputs = outputs.reshape(tile_rows, tile_cols, cols // tile_cols, rows // tile_rows)
-    outputs = outputs.permute(3, 0, 2, 1)  # (rows // tile_rows, tile_rows), (cols // tile_cols, tile_cols)
-    return outputs.reshape(rows, cols).contiguous()
-
-
 @dataclass
 class MatmulLtState:
     _tile_indices: Optional[torch.Tensor] = None  # TODO: remove
@@ -433,7 +374,7 @@ def matmul_4bit(
     bias: Optional[torch.Tensor] = None,
 ):
     assert quant_state is not None
-    # Change dtype to bfloat16 on CPU
+    # Change dtype to input dtype on CPU
     if A.device.type == "cpu":
         quant_state.dtype = A.dtype
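
Note: the comment fix above tracks what the code actually does: on CPU, `quant_state.dtype` follows the input tensor rather than being pinned to bfloat16. A tiny sketch of the rule; the helper name is hypothetical, for illustration only:

import torch

def cpu_compute_dtype(A: torch.Tensor, default: torch.dtype = torch.bfloat16) -> torch.dtype:
    # Mirrors `quant_state.dtype = A.dtype` for CPU inputs.
    return A.dtype if A.device.type == "cpu" else default

assert cpu_compute_dtype(torch.zeros(2, dtype=torch.float16)) == torch.float16
assert cpu_compute_dtype(torch.zeros(2, dtype=torch.float32)) == torch.float32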

bitsandbytes/backends/cpu/ops.py (28 additions, 13 deletions)

@@ -1,6 +1,7 @@
 from collections.abc import Sequence
 import ctypes as ct
 import logging
+from math import prod
 
 import torch
 
@@ -132,6 +133,13 @@ def _(
         dtype in [torch.bfloat16, torch.float16, torch.float32],
         lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
     )
+
+    # Odd shape is not supported by this kernel; fallback to generic implementation
+    if shape[-1] % 2 != 0:
+        from ..default.ops import _dequantize_4bit_impl
+
+        return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
     # Enable non uint8 dtype
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
@@ -140,35 +148,42 @@ def _(
     if absmax.dtype != torch.float32:
         absmax = absmax.float()
 
-    A = A.reshape(shape[0], shape[1] // 2)
+    if len(shape) == 1:
+        shape = (1, shape[0])
+
+    m = prod(shape[:-1])
+    n = shape[-1]
+
+    A = A.reshape(m, n // 2)
     out = torch.empty(shape, dtype=dtype, device=A.device)
+
     if quant_type == "fp4":
         if dtype == torch.float32:
             lib.cdequantize_blockwise_cpu_fp4_fp32(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.bfloat16:
             lib.cdequantize_blockwise_cpu_fp4_bf16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.float16:
             lib.cdequantize_blockwise_cpu_fp4_fp16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
            )
     elif quant_type == "nf4":
         if dtype == torch.float32:
@@ -177,26 +192,26 @@ def _(
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.bfloat16:
             lib.cdequantize_blockwise_cpu_nf4_bf16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
            )
         elif dtype == torch.float16:
             lib.cdequantize_blockwise_cpu_nf4_fp16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
            )
     else:
         raise ValueError
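
Note: the kernel previously assumed a 2-D `shape`; this change generalizes it to arbitrary rank by flattening all leading dimensions into `m` and keeping the innermost dimension as `n`. The logic, extracted as a self-contained sketch:

from math import prod

def to_kernel_dims(shape: tuple[int, ...]) -> tuple[int, int]:
    # The 2-D CPU kernel receives (m, n): n is the innermost dimension,
    # m absorbs all leading ones; a 1-D input is promoted to a single row.
    if len(shape) == 1:
        shape = (1, shape[0])
    return prod(shape[:-1]), shape[-1]

assert to_kernel_dims((128,)) == (1, 128)
assert to_kernel_dims((4, 3, 64)) == (12, 64)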

bitsandbytes/backends/cuda/ops.py (5 additions, 5 deletions)

@@ -8,7 +8,7 @@
 from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
 
 from ..._ops import register_kernel
-from ...cextension import HIP_ENVIRONMENT, lib
+from ...cextension import ROCM_WARP_SIZE_64, lib
 
 
 @register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -211,7 +211,7 @@ def _get_col_absmax(
 def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
 
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
@@ -269,7 +269,7 @@ def _(
 def _dequantize_blockwise_impl(
     A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
 ) -> None:
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
@@ -303,7 +303,7 @@ def _dequantize_blockwise_impl(
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
@@ -385,7 +385,7 @@ def _dequantize_4bit_impl(
     dtype: torch.dtype,
     out: torch.Tensor,
 ) -> None:
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
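
Note: the import swap narrows the blocksize restriction from all HIP builds (HIP_ENVIRONMENT) to only the devices the flag name suggests: ROCm GPUs with a 64-wide wavefront. A hedged sketch of the resulting rule; the interpretation is assumed from the flag name, and the device probe below is illustrative rather than the library's cextension logic:

import torch

def supported_blocksizes(warp_size: int) -> list[int]:
    # On wave64 devices the 64-element blocksize kernels are unavailable.
    sizes = [4096, 2048, 1024, 512, 256, 128, 64]
    return sizes[:-1] if warp_size == 64 else sizes

if torch.cuda.is_available():
    warp = getattr(torch.cuda.get_device_properties(0), "warp_size", 32)
    print("supported 4-bit blocksizes:", supported_blocksizes(warp))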
