
Commit 9e589a2

Cpu C++ kernel (#1789)
* add template to support more dtypes
* update cmake list
* fix typo
* fix compile cpu
* make different dtype works
* use bf16 on CPU
* fix state2 dtype
* remove torch
* rm torch
* enable float to bf16
* rm dequantizeBlockwise4bitCpu
* fix check
* enable dequant 4bit kernel
* fix typo
* fix typo
* fix dequantize
* fix
* fix
* test
* fix
* fix
* fix
* fix
* fix
* change input param
* fix typo
* fix input param
* spliut 8bit and 4bit
* fix typo
* fix typo
* fix input params
* fix input params
* fix
* fix typo
* enable dequant4bit
* fix
* fix
* fix reverse
* fix dequant 4bit fallback path
* fix fp4 dequant
* rm _Float16
* fix cmake check
* fix lint
* fix datatypr
* fix include
* fix typo
* fix include
* add runtime check for avx512
* enable windows cpu build
* fix format
* Fix some tests
* Use larger shape for test

---------

Signed-off-by: jiqing-feng <[email protected]>
Co-authored-by: Matthew Douglas <[email protected]>
1 parent 63f538a commit 9e589a2
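
This commit adds templated C++ CPU kernels for blockwise 8-bit and 4-bit dequantization (fp32/bf16/fp16 outputs, NF4 and FP4), builds them with OpenMP and AVX2/AVX-512 flags when available, and routes the existing Python ops to them. A minimal sketch of the code path it accelerates, assuming the public bitsandbytes.functional API; shapes and blocksize are illustrative:

import torch
import bitsandbytes.functional as F

# Quantize an fp32 weight to NF4 on the CPU, then dequantize it back.
# The dequantize step is what now dispatches to the C++ CPU kernel.
w = torch.randn(1024, 1024, dtype=torch.float32)            # CPU tensor
packed, quant_state = F.quantize_4bit(w, blocksize=64, quant_type="nf4")
w_restored = F.dequantize_4bit(packed, quant_state)         # fp32 output on CPU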

File tree

14 files changed: +649, -44 lines changed


CMakeLists.txt

Lines changed: 36 additions & 0 deletions
@@ -78,9 +78,17 @@ else()
     set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
     set(BUILD_XPU OFF)
+    set(BUILD_CPU ON)
 endif()


+if (BUILD_CPU)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
+    find_package(OpenMP)
+endif()
+
 if(BUILD_CUDA)
     # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
     # Workaround: use --allow-unsupported-compiler

@@ -262,6 +270,34 @@ add_library(bitsandbytes SHARED ${SRC_FILES})
 target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc include)

+if (BUILD_CPU)
+    if (OpenMP_CXX_FOUND)
+        target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
+        add_definitions(-DHAS_OPENMP)
+    endif()
+
+    if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))
+        include(CheckCXXCompilerFlag)
+        check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
+        check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
+        if (HAS_AVX512F_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512f)
+        endif()
+        if (HAS_AVX512BF16_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
+        endif()
+        target_compile_options(
+            bitsandbytes PRIVATE
+            -mprefer-vector-width=256
+            -mfma
+            -mavx2
+            -mlzcnt
+            -mbmi
+            -mbmi2
+        )
+    endif()
+endif()
+

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

bitsandbytes/autograd/_functions.py

Lines changed: 3 additions & 0 deletions
@@ -374,6 +374,9 @@ def matmul_4bit(
     bias: Optional[torch.Tensor] = None,
 ):
     assert quant_state is not None
+    # Change dtype to bfloat16 on CPU
+    if A.device.type == "cpu":
+        quant_state.dtype = A.dtype

     if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
         if A.shape[-1] % quant_state.blocksize != 0:
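
The three added lines make the quantization state follow the activation dtype on CPU, so the weight is dequantized directly in the activation's precision (e.g. bfloat16) rather than its original dtype. A rough sketch of the affected call path; passing the transposed packed weight mirrors what Linear4bit does, but treat the exact layout as an assumption:

import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

w = torch.randn(256, 512, dtype=torch.float32)         # [out, in] weight on CPU
w_packed, qs = F.quantize_4bit(w, blocksize=64, quant_type="nf4")

x = torch.randn(4, 512, dtype=torch.bfloat16)          # bf16 activation on CPU
# With this change, matmul_4bit sets qs.dtype = x.dtype on CPU, so the weight
# is dequantized straight to bf16 before the matmul.
y = bnb.matmul_4bit(x, w_packed.t(), quant_state=qs)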

bitsandbytes/backends/cpu/ops.py

Lines changed: 121 additions & 3 deletions
@@ -1,5 +1,7 @@
+from collections.abc import Sequence
 import ctypes as ct
 import logging
+from math import prod

 import torch

@@ -76,10 +78,8 @@ def _(
     torch._check_is_size(blocksize)
     torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")

-    # Only FP32 has c++ kernrl
+    out = torch.empty_like(A, dtype=dtype)
     if dtype == torch.float32:
-        out = torch.empty_like(A, dtype=dtype)
-
         lib.cdequantize_blockwise_cpu_fp32(
             get_ptr(code),
             get_ptr(A),

@@ -88,6 +88,24 @@ def _(
             ct.c_longlong(blocksize),
             ct.c_longlong(A.numel()),
         )
+    elif dtype == torch.bfloat16:
+        lib.cdequantize_blockwise_cpu_bf16(
+            get_ptr(code),
+            get_ptr(A),
+            get_ptr(absmax),
+            get_ptr(out),
+            ct.c_longlong(blocksize),
+            ct.c_longlong(A.numel()),
+        )
+    elif dtype == torch.float16:
+        lib.cdequantize_blockwise_cpu_fp16(
+            get_ptr(code),
+            get_ptr(A),
+            get_ptr(absmax),
+            get_ptr(out),
+            ct.c_longlong(blocksize),
+            ct.c_longlong(A.numel()),
+        )
     else:
         out = code[A.reshape(-1).int()]
         blocks = out.shape[-1] // blocksize

@@ -99,3 +117,103 @@ def _(
         out = out.reshape(A.shape)

     return out
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cpu")
+def _(
+    A: torch.Tensor,
+    absmax: torch.Tensor,
+    blocksize: int,
+    quant_type: str,
+    shape: Sequence[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+    torch._check(
+        dtype in [torch.bfloat16, torch.float16, torch.float32],
+        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+    )
+
+    # Odd shape is not supported by this kernel; fallback to generic implementation
+    if shape[-1] % 2 != 0:
+        from ..default.ops import _dequantize_4bit_impl
+
+        return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
+    # Enable non uint8 dtype
+    if A.dtype != torch.uint8:
+        A = A.view(torch.uint8)
+
+    # TODO: support half precision absmax
+    if absmax.dtype != torch.float32:
+        absmax = absmax.float()
+
+    if len(shape) == 1:
+        shape = (1, shape[0])
+
+    m = prod(shape[:-1])
+    n = shape[-1]
+
+    A = A.reshape(m, n // 2)
+    out = torch.empty(shape, dtype=dtype, device=A.device)
+
+    if quant_type == "fp4":
+        if dtype == torch.float32:
+            lib.cdequantize_blockwise_cpu_fp4_fp32(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
+            )
+        elif dtype == torch.bfloat16:
+            lib.cdequantize_blockwise_cpu_fp4_bf16(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
+            )
+        elif dtype == torch.float16:
+            lib.cdequantize_blockwise_cpu_fp4_fp16(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
+            )
+    elif quant_type == "nf4":
+        if dtype == torch.float32:
+            lib.cdequantize_blockwise_cpu_nf4_fp32(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
+            )
+        elif dtype == torch.bfloat16:
+            lib.cdequantize_blockwise_cpu_nf4_bf16(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
+            )
+        elif dtype == torch.float16:
+            lib.cdequantize_blockwise_cpu_nf4_fp16(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_longlong(blocksize),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
+            )
+    else:
+        raise ValueError
+
+    return out
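
With these kernels registered, the blockwise 8-bit path gains native bf16/fp16 branches and the 4-bit path gets a dedicated CPU kernel that views the packed tensor as (m, n // 2) and falls back to the generic implementation for odd last dimensions. A small sketch exercising the 8-bit path with a bfloat16 tensor on CPU (sizes and blocksize are illustrative):

import torch
import bitsandbytes.functional as F

# Blockwise 8-bit round trip on CPU. Because the tensor is bf16, dequantization
# should now take the cdequantize_blockwise_cpu_bf16 branch rather than the
# pure-PyTorch fallback.
x = torch.randn(8, 4096, dtype=torch.bfloat16)
q, state = F.quantize_blockwise(x, blocksize=256)
x_hat = F.dequantize_blockwise(q, state)
print(x_hat.dtype)  # torch.bfloat16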

bitsandbytes/backends/default/ops.py

Lines changed: 20 additions & 9 deletions
@@ -232,22 +232,14 @@ def _(
     return packed, absmax.float()


-@register_kernel("bitsandbytes::dequantize_4bit", "default")
-def _(
+def _dequantize_4bit_impl(
     A: torch.Tensor,
     absmax: torch.Tensor,
     blocksize: int,
     quant_type: str,
     shape: Sequence[int],
     dtype: torch.dtype,
 ) -> torch.Tensor:
-    torch._check_is_size(blocksize)
-    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
-    torch._check(
-        dtype in [torch.bfloat16, torch.float16, torch.float32],
-        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
-    )
-
     # Enable non uint8 dtype
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)

@@ -283,6 +275,25 @@ def _(
     return out


+@register_kernel("bitsandbytes::dequantize_4bit", "default")
+def _(
+    A: torch.Tensor,
+    absmax: torch.Tensor,
+    blocksize: int,
+    quant_type: str,
+    shape: Sequence[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+    torch._check(
+        dtype in [torch.bfloat16, torch.float16, torch.float32],
+        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+    )
+
+    return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
+
 @register_kernel("bitsandbytes::gemv_4bit", "default")
 def _(
     A: torch.Tensor,
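
This refactor keeps the torch._check validation in the registered wrapper and leaves _dequantize_4bit_impl check-free, so other backends can delegate to it cheaply, which is exactly what the new CPU kernel's odd-shape fallback does. A minimal sketch of that reuse from another module; the absolute import path is the assumed equivalent of the relative ..default.ops import used in this commit:

from bitsandbytes.backends.default.ops import _dequantize_4bit_impl

def dequantize_4bit_fallback(A, absmax, blocksize, quant_type, shape, dtype):
    # Delegate to the shared pure-PyTorch implementation (no extra validation).
    return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)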

csrc/common.h

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,12 @@

 using namespace BinSearch;

+typedef enum DataType_t {
+    General8bit = 0,
+    FP4 = 1,
+    NF4 = 2,
+} DataType_t;
+
 struct quantize_block_args {
     BinAlgo<Scalar, float, Direct2>* bin_searcher;
     float* code;
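
The new DataType_t enum gives the C++ kernels a single tag for the three supported quantization formats. For reference, an illustrative Python-side mirror of those values (not code from this commit); "fp4" and "nf4" match the quant_type strings used by the ops layer, while General8bit covers the blockwise 8-bit path:

# Illustrative mapping only; mirrors the DataType_t values in csrc/common.h.
DATA_TYPE = {
    "general8bit": 0,  # General8bit
    "fp4": 1,          # FP4
    "nf4": 2,          # NF4
}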
