Commits (changes shown from 70 of 79 commits)
6be1412
add template to support more dtypes
jiqing-feng Oct 28, 2025
252ac0f
update cmake list
jiqing-feng Oct 28, 2025
f98c9e5
fix typo
jiqing-feng Oct 28, 2025
902bf35
fix compile cpu
jiqing-feng Oct 28, 2025
fef8459
make different dtype works
jiqing-feng Oct 29, 2025
55cbaa0
use bf16 on CPU
jiqing-feng Oct 29, 2025
bbef95b
fix state2 dtype
jiqing-feng Oct 29, 2025
e842513
remove torch
jiqing-feng Oct 30, 2025
d4473fa
rm torch
jiqing-feng Oct 30, 2025
dea8dd6
enable float to bf16
jiqing-feng Oct 30, 2025
e9bb4fe
rm dequantizeBlockwise4bitCpu
jiqing-feng Oct 30, 2025
cdc8d5e
fix check
jiqing-feng Oct 30, 2025
baacfac
enable dequant 4bit kernel
jiqing-feng Oct 30, 2025
eec3521
fix typo
jiqing-feng Oct 30, 2025
d7cc1c5
fix typo
jiqing-feng Oct 30, 2025
124b754
fix dequantize
jiqing-feng Oct 30, 2025
0f918c7
fix
jiqing-feng Oct 30, 2025
e1a8b20
fix
jiqing-feng Oct 30, 2025
eab45c8
test
jiqing-feng Oct 30, 2025
d9f5dd8
fix
jiqing-feng Oct 30, 2025
070f8a0
fix
jiqing-feng Oct 30, 2025
a84addf
fix
jiqing-feng Oct 30, 2025
c4bb660
fix
jiqing-feng Oct 30, 2025
4ba13fd
fix
jiqing-feng Oct 30, 2025
c0d05ec
change input param
jiqing-feng Oct 31, 2025
62a16a6
fix typo
jiqing-feng Oct 31, 2025
d9ad828
fix input param
jiqing-feng Oct 31, 2025
09ed6cb
spliut 8bit and 4bit
jiqing-feng Oct 31, 2025
a3f7b61
fix typo
jiqing-feng Oct 31, 2025
4708470
fix typo
jiqing-feng Oct 31, 2025
1dfe9f7
fix input params
jiqing-feng Oct 31, 2025
00289c4
fix input params
jiqing-feng Oct 31, 2025
a2578ba
fix
jiqing-feng Oct 31, 2025
72033dc
fix typo
jiqing-feng Oct 31, 2025
1c20ae8
enable dequant4bit
jiqing-feng Oct 31, 2025
7552fe2
fix
jiqing-feng Oct 31, 2025
8b32a39
fix
jiqing-feng Oct 31, 2025
8f1cc36
fix reverse
jiqing-feng Oct 31, 2025
49d242a
fix dequant 4bit fallback path
jiqing-feng Nov 3, 2025
4a9a6dc
fix fp4 dequant
jiqing-feng Nov 3, 2025
6bcd19e
Merge branch 'main' into cpu_kernel
jiqing-feng Nov 4, 2025
d7e981d
rm _Float16
jiqing-feng Nov 5, 2025
48739b0
tmp codes
jiqing-feng Nov 6, 2025
f784be8
enable gemv
jiqing-feng Nov 7, 2025
92192c9
change to 4bit dequant
jiqing-feng Nov 7, 2025
bd02e71
fix def
jiqing-feng Nov 7, 2025
8520069
fix type
jiqing-feng Nov 7, 2025
e921cbb
fix absmax dtype
jiqing-feng Nov 7, 2025
9b5d97a
fix type
jiqing-feng Nov 7, 2025
fd6cff1
fix compile and type
jiqing-feng Nov 7, 2025
46d6e47
enable gemv
jiqing-feng Nov 7, 2025
3271c30
fix shape
jiqing-feng Nov 7, 2025
176a2b6
fix lib name
jiqing-feng Nov 7, 2025
196984a
debug
jiqing-feng Nov 7, 2025
7652115
update
jiqing-feng Nov 11, 2025
ea0e649
enable gemv 4bit bf16
jiqing-feng Nov 12, 2025
9277d24
enable avx512 check
jiqing-feng Nov 13, 2025
4fb315b
fix check
jiqing-feng Nov 13, 2025
81f1984
fix endif
jiqing-feng Nov 13, 2025
0f78bad
fix format
jiqing-feng Nov 13, 2025
fcb8456
fix format
jiqing-feng Nov 13, 2025
c5e1894
fix def
jiqing-feng Nov 13, 2025
f2029c6
rebase
jiqing-feng Nov 14, 2025
df1d669
fix position
jiqing-feng Nov 14, 2025
bb3ac8d
fix format
jiqing-feng Nov 14, 2025
26b5685
rm duplicated func
jiqing-feng Nov 14, 2025
445725b
Merge branch 'main' into cpu_fused_kernel
jiqing-feng Nov 17, 2025
580010c
rm useless code comments
jiqing-feng Nov 17, 2025
57b89bf
fix out shape
jiqing-feng Nov 19, 2025
302a5fe
Merge branch 'main' into cpu_fused_kernel
jiqing-feng Nov 19, 2025
de5fb9c
fix comments
jiqing-feng Nov 20, 2025
6858a90
add reverse format
jiqing-feng Nov 20, 2025
3b3d609
check avx512bf15
jiqing-feng Nov 20, 2025
fbb911b
fix has_avx512bf16
jiqing-feng Nov 20, 2025
3179b42
fix tests
jiqing-feng Nov 20, 2025
0c88d43
fix absmax shhape
jiqing-feng Nov 20, 2025
feb8ad2
fix compile
jiqing-feng Nov 20, 2025
c6b714d
fix tests
jiqing-feng Nov 20, 2025
5497111
fix test_gemv
jiqing-feng Nov 20, 2025
12 changes: 12 additions & 0 deletions CMakeLists.txt
@@ -280,12 +280,24 @@ if (BUILD_CPU)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
check_cxx_compiler_flag(-mavx512dq HAS_AVX512DQ)
check_cxx_compiler_flag(-mavx512bw HAS_AVX512BW)
check_cxx_compiler_flag(-mavx512vl HAS_AVX512VL)
if (HAS_AVX512F_FLAG)
target_compile_options(bitsandbytes PRIVATE -mavx512f)
endif()
if (HAS_AVX512BF16_FLAG)
target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
endif()
if(HAS_AVX512DQ)
target_compile_options(bitsandbytes PRIVATE -mavx512dq)
endif()
if(HAS_AVX512BW)
target_compile_options(bitsandbytes PRIVATE -mavx512bw)
endif()
if(HAS_AVX512VL)
target_compile_options(bitsandbytes PRIVATE -mavx512vl)
endif()
target_compile_options(
bitsandbytes PRIVATE
-mprefer-vector-width=256
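These flags only affect code generation when the compiler accepts them; whether the host CPU can actually execute AVX-512 code is a separate runtime question. As a rough companion check (a hypothetical helper, not part of this PR, and Linux-only), one can parse /proc/cpuinfo to see which of the subsets enabled above the machine advertises:

```python
from pathlib import Path


def host_avx512_features() -> set[str]:
    # Note: /proc/cpuinfo reports the BF16 extension as "avx512_bf16".
    wanted = {"avx512f", "avx512_bf16", "avx512dq", "avx512bw", "avx512vl"}
    try:
        cpuinfo = Path("/proc/cpuinfo").read_text()
    except OSError:  # non-Linux or unreadable: report nothing
        return set()
    for line in cpuinfo.splitlines():
        if line.startswith("flags"):  # the first logical CPU is enough
            return wanted & set(line.split(":", 1)[1].split())
    return set()


print(host_avx512_features())
```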
8 changes: 7 additions & 1 deletion bitsandbytes/autograd/_functions.py
@@ -374,10 +374,16 @@ def matmul_4bit(
bias: Optional[torch.Tensor] = None,
):
assert quant_state is not None
# Change dtype to bfloat16 on CPU
# Change dtype to input dtype on CPU
if A.device.type == "cpu":
quant_state.dtype = A.dtype

if getattr(quant_state, "enable_optimized_cpu", False):
out = F.gemv_4bit(A, B, out, state=quant_state)
if bias is not None:
out += bias
return out

if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
if A.shape[-1] % quant_state.blocksize != 0:
warn(
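The hunk above adds a fast path: when quant_state.enable_optimized_cpu is set, matmul_4bit routes straight to F.gemv_4bit. To exercise that branch outside of nn.Linear4bit, here is a minimal sketch using only the functional API; the call sequence is an assumption pieced together from this diff and from convert_weight_packed_for_cpu further down, not a test taken from the PR:

```python
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

# Quantize a weight to NF4, repack it for the fused CPU kernel, and let
# matmul_4bit take the gemv_4bit fast path via quant_state.enable_optimized_cpu.
W = torch.randn(4096, 4096, dtype=torch.bfloat16)
qweight, quant_state = F.quantize_4bit(W, blocksize=64, quant_type="nf4", compress_statistics=True)

if F.has_avx512bf16():
    packed, quant_state = F.convert_weight_packed_for_cpu(qweight, quant_state)
    quant_state.enable_optimized_cpu = True  # mirrors what Linear4bit.forward() does

    x = torch.randn(1, 4096, dtype=torch.bfloat16)
    y = bnb.matmul_4bit(x, packed, quant_state=quant_state)  # dispatches to F.gemv_4bit
    print(y.shape)  # torch.Size([1, 4096])
```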
61 changes: 60 additions & 1 deletion bitsandbytes/backends/cpu/ops.py
@@ -5,7 +5,7 @@

import torch

from bitsandbytes.functional import get_ptr
from bitsandbytes.functional import get_ptr, has_avx512bf16

from ..._ops import register_kernel
from ...cextension import ErrorHandlerMockBNBNativeLibrary, lib
@@ -217,3 +217,62 @@ def _(
raise ValueError

return out

if has_avx512bf16():

@register_kernel("bitsandbytes::gemv_4bit", "cpu")
def _(
A: torch.Tensor,
B: torch.Tensor,
shapeB: Sequence[int],
absmax: torch.Tensor,
code: torch.Tensor,
blocksize: int,
) -> torch.Tensor:
# Adapted from dequantize_4bit
dtype = A.dtype
quant_type = "fp4" if code[1] > 0 else "nf4"
# cpu fused op only supports bf16 for now.
if dtype != torch.bfloat16:
A = A.to(torch.bfloat16)

final_out_shape = (*A.shape[:-1], shapeB[0])
A = A.reshape(-1, A.shape[-1])
out_shape = (*A.shape[:-1], shapeB[0])
out = torch.empty(out_shape, dtype=A.dtype, device=A.device)
M = A.shape[0]
N = shapeB[0]
K = A.shape[1]
x_strideM = A.stride(0)
out_strideM = out.stride(0)
if quant_type == "fp4":
lib.gemv_4bit_inference_cpu_fp4_bf16(
ct.c_int64(M),
ct.c_int64(N),
ct.c_int64(K),
get_ptr(A),
get_ptr(B),
get_ptr(absmax),
get_ptr(out),
ct.c_int64(blocksize),
ct.c_int64(x_strideM),
ct.c_int64(out_strideM),
)
elif quant_type == "nf4":
lib.gemv_4bit_inference_cpu_nf4_bf16(
ct.c_int64(M),
ct.c_int64(N),
ct.c_int64(K),
get_ptr(A),
get_ptr(B),
get_ptr(absmax),
get_ptr(out),
ct.c_int64(blocksize),
ct.c_int64(x_strideM),
ct.c_int64(out_strideM),
)

if dtype != torch.bfloat16:
out = out.to(dtype)

return out.reshape(final_out_shape)
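As a sanity check on what this fused kernel computes, a rough pure-PyTorch reference is sketched below. It assumes B_std and quant_state are the weight and state before convert_weight_packed_for_cpu is applied (i.e. the standard layout), so the fused path should match it up to bfloat16 rounding:

```python
import torch
import bitsandbytes.functional as F


def gemv_4bit_cpu_reference(A: torch.Tensor, B_std: torch.Tensor, quant_state) -> torch.Tensor:
    # Dequantize the standard-layout 4-bit weight to (N, K), then do a dense
    # bf16 matmul: out[..., n] = sum_k A[..., k] * W[n, k].
    W = F.dequantize_4bit(B_std, quant_state).to(torch.bfloat16)
    return (A.to(torch.bfloat16) @ W.t()).to(A.dtype)
```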
52 changes: 52 additions & 0 deletions bitsandbytes/functional.py
@@ -2103,4 +2103,56 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
return out


def convert_weight_packed_for_cpu(qweight: torch.Tensor, quant_state: QuantState, block_n: int = 32):
"""
qweight: (K * N / 2) uint8
return: packed_weight
"""
assert qweight.dtype == torch.uint8, "qweight must be uint8"
qweight = qweight.reshape(-1)
unpacked_w = torch.empty(qweight.shape[0] * 2, dtype=torch.int32, device=qweight.device)
unpacked_w[1::2] = qweight & 0xF
unpacked_w[::2] = qweight >> 4
qweight_final = unpacked_w.reshape(quant_state.shape).to(torch.uint8) # (*, N, K)
# pack weight: [*, N, K] -> [*, N, K/2] combine low and high bit
assert len(qweight_final.shape) == 2
N, K = qweight_final.shape[0], qweight_final.shape[1]
assert N % block_n == 0, "N must be divisible by block_n"
assert K % 2 == 0, "K must be even"
BLOCK_N = block_n
BIT_COUNT = 32 # (=32 low +32 high)
new_shape = [N // BLOCK_N, BLOCK_N, K // 2, 2]
out_shape = [N, K // 2]
qw = qweight_final.reshape(new_shape) # (..., N/B, B, K/2, 2)
qw = qw.transpose(-3, -2).contiguous() # (..., N/B, K/2, B, 2)
qw = qw.reshape(-1, BIT_COUNT * 2) # [-1, 64]
high = qw[:, BIT_COUNT:] # high 32
low = qw[:, :BIT_COUNT] # low 32
packed = ((high << 4) | low).to(torch.uint8) # combine
final_qweight = packed.reshape(out_shape)
if quant_state.nested:
absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
absmax += quant_state.offset
if absmax.dtype != torch.float32:
absmax = absmax.float()

quant_state.absmax = (
absmax.reshape(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
.T.to(torch.bfloat16)
.contiguous()
)
quant_state.nested = False
delattr(quant_state, "state2")

quant_state.dtype = torch.bfloat16
return final_qweight, quant_state


def has_avx512bf16():
if hasattr(lib, "has_avx512bf16_cpu") and lib.has_avx512bf16_cpu():
return True
else:
return False


C = 127.0
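The repacking in convert_weight_packed_for_cpu is easiest to reason about through its inverse. Below is a hedged sketch (not part of the PR) that recovers the per-element 4-bit codes from the blocked layout, so a round-trip against the nibbles unpacked from the original qweight (high nibble at even K indices, low nibble at odd ones, as in the function above) can be asserted in a test:

```python
import torch


def unpack_blocked_4bit(packed: torch.Tensor, N: int, K: int, block_n: int = 32) -> torch.Tensor:
    """Inverse of the blocked packing above (a sketch): returns (N, K) uint8 codes in 0..15.

    Rows are grouped into blocks of block_n; byte j of a (block, K/2) group holds a code
    from the lower half of the block in its low nibble and a code from the upper half in
    its high nibble.
    """
    half = block_n // 2
    groups = packed.reshape(N // block_n, K // 2, block_n)
    low = (groups & 0xF).reshape(N // block_n, K // 2, half, 2)   # block rows 0 .. half-1
    high = (groups >> 4).reshape(N // block_n, K // 2, half, 2)   # block rows half .. block_n-1
    codes = torch.empty(N // block_n, block_n, K // 2, 2, dtype=torch.uint8)
    codes[:, :half] = low.permute(0, 2, 1, 3)
    codes[:, half:] = high.permute(0, 2, 1, 3)
    return codes.reshape(N, K)
```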
19 changes: 16 additions & 3 deletions bitsandbytes/nn/modules.py
@@ -12,7 +12,7 @@

import bitsandbytes as bnb
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
from bitsandbytes.functional import QuantState
from bitsandbytes.functional import QuantState, convert_weight_packed_for_cpu, has_avx512bf16
from bitsandbytes.optim import GlobalOptimManager
from bitsandbytes.utils import INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING, OutlierTracer

@@ -479,6 +479,7 @@ def __init__(
self.compute_type_is_set = compute_dtype is not None
self.quant_state = None
self.quant_storage = quant_storage
self.enable_optimized_cpu = False

def set_compute_type(self, x):
if x.dtype in [torch.float32, torch.bfloat16]:
@@ -512,8 +513,20 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
destination[prefix + "weight." + k] = v if keep_vars else v.detach()

def forward(self, x: torch.Tensor):
quant_state = self.weight.quant_state
fix_4bit_weight_quant_state_from_module(self)

if (
not self.enable_optimized_cpu
and x.device.type == "cpu"
and has_avx512bf16()
and not self.training
and x.requires_grad == False
):
self.weight.data, quant_state = convert_weight_packed_for_cpu(self.weight.data, quant_state)
self.enable_optimized_cpu = True
quant_state.enable_optimized_cpu = True

Comment on lines 531 to 539
Member:
There are a couple of things I'm wondering about:

When we serialize from CPU after running through forward(), we probably still want to be compatible with other devices. I am thinking for when serializing we want to undo this transformation if it's present.

Possibly an edge concern, but if we do a forward pass on CPU and then move to an accelerator, what would happen? I assume the weights are then in the wrong order?

@SunMarc I would appreciate any feedback you might have on this part!

Contributor:
For me, I would prefer that we stick with only one packing format for serialization, and that all other hardware / kernels convert this packing format at initialization or during the forward pass, as we do here.

So we need a way to disable serialization or send a warning when someone tries to do that. This is probably something that we can do in transformers as I think most of the models are serialized from there.

Also, instead of enable_optimized_cpu, maybe we can rename it to packing_format?

> Possibly an edge concern, but if we do a forward pass on CPU and then move to an accelerator, what would happen? I assume the weights are then in the wrong order?

Either we re-convert the weights for cuda (but this opens the door to many conversion function between all packing format) or we just raise an error asking the users to only run the model on one device.

# weights are cast automatically as Int8Params, but the bias has to be cast manually
if self.bias is not None and self.bias.dtype != x.dtype:
self.bias.data = self.bias.data.to(x.dtype)
@@ -527,9 +540,9 @@ def forward(self, x: torch.Tensor):
x = x.to(self.compute_dtype)

bias = None if self.bias is None else self.bias.to(self.compute_dtype)
weight = self.weight.t()
weight = self.weight if getattr(quant_state, "enable_optimized_cpu", False) else self.weight.t()

return bnb.matmul_4bit(x, weight, bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
return bnb.matmul_4bit(x, weight, bias=bias, quant_state=quant_state).to(inp_dtype)


class LinearFP4(Linear4bit):
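Following the review discussion above, one lightweight option is a pre-save check rather than a new serialization format. The helper below is a hypothetical sketch (not part of this PR) of the warning the reviewers suggest, flagging any Linear4bit whose weight has been repacked for the fused CPU kernel before the state dict is written out:

```python
import warnings

import torch


def warn_if_cpu_packed(model: torch.nn.Module) -> None:
    # Hypothetical pre-save hook: the blocked CPU layout produced by
    # convert_weight_packed_for_cpu is not portable to other devices or to the
    # standard checkpoint format, so warn before serializing it.
    for name, module in model.named_modules():
        if getattr(module, "enable_optimized_cpu", False):
            warnings.warn(
                f"{name}: weight was repacked for the fused CPU kernel; "
                "convert it back to the standard 4-bit layout before saving.",
                UserWarning,
            )


# usage sketch:
# warn_if_cpu_packed(model)
# torch.save(model.state_dict(), "model.pt")
```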