bitsandbytes-foundation
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/python-package.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/workflows/tests.yml‎
Lines changed: 10 additions & 5 deletions b/‎.github/workflows/tests.yml‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 36 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎bitsandbytes/autograd/_functions.py‎
Lines changed: 3 additions & 0 deletions b/‎bitsandbytes/autograd/_functions.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎bitsandbytes/backends/cpu/ops.py‎
Lines changed: 121 additions & 3 deletions b/‎bitsandbytes/backends/cpu/ops.py‎
Lines changed: 121 additions & 3 deletions
diff --git a/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 5 additions & 5 deletions b/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎bitsandbytes/backends/default/ops.py‎
Lines changed: 20 additions & 9 deletions b/‎bitsandbytes/backends/default/ops.py‎
Lines changed: 20 additions & 9 deletions
diff --git a/‎bitsandbytes/cextension.py‎
Lines changed: 8 additions & 1 deletion b/‎bitsandbytes/cextension.py‎
Lines changed: 8 additions & 1 deletion
@@ -171,6 +171,9 @@ jobs:
           retention-days: 7
 
   build-wheels:
+    env:
+      # Skip rebuilding the CPU library when building the wheels.
+      BNB_SKIP_CMAKE: 1
     needs:
       - build-cpu
       - build-cuda
 
@@ -10,6 +10,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  # Skip rebuilding the CPU library when installing the wheels.
+  # We build the libraries in separate jobs and upload as artifacts.
+  BNB_SKIP_CMAKE: 1
+
 jobs:
 
   build-cpu:
@@ -146,7 +151,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
@@ -188,7 +193,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       - name: Show installed packages
@@ -263,7 +268,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       - name: Show installed packages
@@ -321,7 +326,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       - name: Show installed packages
@@ -438,7 +443,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
       - name: Show installed packages
         run: pip list
 
@@ -93,9 +93,17 @@ else()
     set(BUILD_MPS OFF)
     set(BUILD_XPU OFF)
     set(BUILD_NPU OFF)
+    set(BUILD_CPU ON)
 endif()
 
 
+if (BUILD_CPU)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
+    find_package(OpenMP)
+endif()
+
 if(BUILD_CUDA)
     # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
     # Workaround: use --allow-unsupported-compiler
@@ -311,6 +319,34 @@ add_library(bitsandbytes SHARED ${SRC_FILES})
 target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc include)
 
+if (BUILD_CPU)
+    if (OpenMP_CXX_FOUND)
+        target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
+        add_definitions(-DHAS_OPENMP)
+    endif()
+
+    if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))
+        include(CheckCXXCompilerFlag)
+        check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
+        check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
+        if (HAS_AVX512F_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512f)
+        endif()
+        if (HAS_AVX512BF16_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
+        endif()
+        target_compile_options(
+            bitsandbytes PRIVATE
+            -mprefer-vector-width=256
+            -mfma
+            -mavx2
+            -mlzcnt
+            -mbmi
+            -mbmi2
+        )
+    endif()
+endif()
+
 
 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
@@ -374,6 +374,9 @@ def matmul_4bit(
     bias: Optional[torch.Tensor] = None,
 ):
     assert quant_state is not None
+    # Change dtype to bfloat16 on CPU
+    if A.device.type == "cpu":
+        quant_state.dtype = A.dtype
 
     if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu" and A.device.type != "npu":
         if A.shape[-1] % quant_state.blocksize != 0:
 
@@ -1,5 +1,7 @@
+from collections.abc import Sequence
 import ctypes as ct
 import logging
+from math import prod
 
 import torch
 
@@ -76,10 +78,8 @@ def _(
         torch._check_is_size(blocksize)
         torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
 
-        # Only FP32 has c++ kernrl
+        out = torch.empty_like(A, dtype=dtype)
         if dtype == torch.float32:
-            out = torch.empty_like(A, dtype=dtype)
-
             lib.cdequantize_blockwise_cpu_fp32(
                 get_ptr(code),
                 get_ptr(A),
@@ -88,6 +88,24 @@ def _(
                 ct.c_longlong(blocksize),
                 ct.c_longlong(A.numel()),
             )
+        elif dtype == torch.bfloat16:
+            lib.cdequantize_blockwise_cpu_bf16(
+                get_ptr(code),
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(A.numel()),
+            )
+        elif dtype == torch.float16:
+            lib.cdequantize_blockwise_cpu_fp16(
+                get_ptr(code),
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(A.numel()),
+            )
         else:
             out = code[A.reshape(-1).int()]
             blocks = out.shape[-1] // blocksize
@@ -99,3 +117,103 @@ def _(
             out = out.reshape(A.shape)
 
         return out
+
+    @register_kernel("bitsandbytes::dequantize_4bit", "cpu")
+    def _(
+        A: torch.Tensor,
+        absmax: torch.Tensor,
+        blocksize: int,
+        quant_type: str,
+        shape: Sequence[int],
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        torch._check_is_size(blocksize)
+        torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+        torch._check(
+            dtype in [torch.bfloat16, torch.float16, torch.float32],
+            lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+        )
+
+        # Odd shape is not supported by this kernel; fallback to generic implementation
+        if shape[-1] % 2 != 0:
+            from ..default.ops import _dequantize_4bit_impl
+
+            return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
+        # Enable non uint8 dtype
+        if A.dtype != torch.uint8:
+            A = A.view(torch.uint8)
+
+        # TODO: support half precision absmax
+        if absmax.dtype != torch.float32:
+            absmax = absmax.float()
+
+        if len(shape) == 1:
+            shape = (1, shape[0])
+
+        m = prod(shape[:-1])
+        n = shape[-1]
+
+        A = A.reshape(m, n // 2)
+        out = torch.empty(shape, dtype=dtype, device=A.device)
+
+        if quant_type == "fp4":
+            if dtype == torch.float32:
+                lib.cdequantize_blockwise_cpu_fp4_fp32(
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_longlong(blocksize),
+                    ct.c_longlong(m),
+                    ct.c_longlong(n),
+                )
+            elif dtype == torch.bfloat16:
+                lib.cdequantize_blockwise_cpu_fp4_bf16(
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_longlong(blocksize),
+                    ct.c_longlong(m),
+                    ct.c_longlong(n),
+                )
+            elif dtype == torch.float16:
+                lib.cdequantize_blockwise_cpu_fp4_fp16(
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_longlong(blocksize),
+                    ct.c_longlong(m),
+                    ct.c_longlong(n),
+                )
+        elif quant_type == "nf4":
+            if dtype == torch.float32:
+                lib.cdequantize_blockwise_cpu_nf4_fp32(
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_longlong(blocksize),
+                    ct.c_longlong(m),
+                    ct.c_longlong(n),
+                )
+            elif dtype == torch.bfloat16:
+                lib.cdequantize_blockwise_cpu_nf4_bf16(
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_longlong(blocksize),
+                    ct.c_longlong(m),
+                    ct.c_longlong(n),
+                )
+            elif dtype == torch.float16:
+                lib.cdequantize_blockwise_cpu_nf4_fp16(
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_longlong(blocksize),
+                    ct.c_longlong(m),
+                    ct.c_longlong(n),
+                )
+        else:
+            raise ValueError
+
+        return out
@@ -8,7 +8,7 @@
 from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
 
 from ..._ops import register_kernel
-from ...cextension import HIP_ENVIRONMENT, lib
+from ...cextension import ROCM_WARP_SIZE_64, lib
 
 
 @register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -211,7 +211,7 @@ def _get_col_absmax(
 def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
 
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
@@ -269,7 +269,7 @@ def _(
 def _dequantize_blockwise_impl(
     A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
 ) -> None:
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
@@ -303,7 +303,7 @@ def _dequantize_blockwise_impl(
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
@@ -385,7 +385,7 @@ def _dequantize_4bit_impl(
     dtype: torch.dtype,
     out: torch.Tensor,
 ) -> None:
-    if HIP_ENVIRONMENT:
+    if ROCM_WARP_SIZE_64:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
     else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
 
@@ -232,22 +232,14 @@ def _(
     return packed, absmax.float()
 
 
-@register_kernel("bitsandbytes::dequantize_4bit", "default")
-def _(
+def _dequantize_4bit_impl(
     A: torch.Tensor,
     absmax: torch.Tensor,
     blocksize: int,
     quant_type: str,
     shape: Sequence[int],
     dtype: torch.dtype,
 ) -> torch.Tensor:
-    torch._check_is_size(blocksize)
-    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
-    torch._check(
-        dtype in [torch.bfloat16, torch.float16, torch.float32],
-        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
-    )
-
     # Enable non uint8 dtype
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
@@ -283,6 +275,25 @@ def _(
     return out
 
 
+@register_kernel("bitsandbytes::dequantize_4bit", "default")
+def _(
+    A: torch.Tensor,
+    absmax: torch.Tensor,
+    blocksize: int,
+    quant_type: str,
+    shape: Sequence[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+    torch._check(
+        dtype in [torch.bfloat16, torch.float16, torch.float32],
+        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+    )
+
+    return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
+
 @register_kernel("bitsandbytes::gemv_4bit", "default")
 def _(
     A: torch.Tensor,
 
@@ -10,7 +10,13 @@
 import torch
 
 from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
-from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_cuda_version_tuple, get_rocm_gpu_arch
+from bitsandbytes.cuda_specs import (
+    CUDASpecs,
+    get_cuda_specs,
+    get_cuda_version_tuple,
+    get_rocm_gpu_arch,
+    get_rocm_warpsize,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -307,6 +313,7 @@ def get_native_library() -> BNBNativeLibrary:
 
 
 ROCM_GPU_ARCH = get_rocm_gpu_arch()
+ROCM_WARP_SIZE_64 = True if get_rocm_warpsize() == 64 else False
 
 HIP_ENVIRONMENT = False
 BNB_BACKEND = "CPU"