Commit 42e2d05

Fix some tests
1 parent 44e92a1 commit 42e2d05

3 files changed: +57 -23 lines changed

CMakeLists.txt

Lines changed: 9 additions & 1 deletion
@@ -286,7 +286,15 @@ if (BUILD_CPU)
         if (HAS_AVX512BF16_FLAG)
             target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
         endif()
-        target_compile_options(bitsandbytes PRIVATE -mprefer-vector-width=256)
+        target_compile_options(
+            bitsandbytes PRIVATE
+            -mprefer-vector-width=256
+            -mfma
+            -mavx2
+            -mlzcnt
+            -mbmi
+            -mbmi2
+        )
     endif()
 endif()
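Note: these flags compile the entire CPU target against AVX2, FMA, and the LZCNT/BMI1/BMI2 bit-manipulation extensions, so the resulting binary requires a CPU that exposes all of them. Below is a minimal sketch of a runtime guard, not part of this commit: it assumes torch's reported ISA capability is an acceptable proxy (in practice, x86 CPUs with AVX2 also ship FMA, LZCNT, and BMI1/2, but that pairing is a heuristic, not a guarantee).

```python
import torch

def can_use_native_cpu_kernels() -> bool:
    # torch reports the best ISA level its own CPU kernels can use; treating
    # AVX2/AVX512 as "new enough" for an -mavx2/-mfma/-mbmi* build is heuristic.
    return torch.backends.cpu.get_cpu_capability() in ("AVX2", "AVX512")
```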

bitsandbytes/backends/cpu/ops.py

Lines changed: 28 additions & 13 deletions
@@ -1,6 +1,7 @@
 from collections.abc import Sequence
 import ctypes as ct
 import logging
+from math import prod

 import torch

@@ -132,6 +133,13 @@ def _(
         dtype in [torch.bfloat16, torch.float16, torch.float32],
         lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
     )
+
+    # Odd shape is not supported by this kernel; fallback to generic implementation
+    if shape[-1] % 2 != 0:
+        from ..default.ops import _dequantize_4bit_impl
+
+        return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
     # Enable non uint8 dtype
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
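The vectorized CPU kernel consumes two 4-bit codes per byte, so it requires an even last dimension; odd shapes now route to the generic implementation in bitsandbytes/backends/default/ops.py. A standalone sketch of the packing constraint (illustrative only, not library code; the nibble order here is arbitrary):

```python
import torch

codes = torch.tensor([1, 7, 3, 15, 9], dtype=torch.uint8)  # five 4-bit codes
padded = torch.cat([codes, codes.new_zeros(1)])            # odd length: must pad
packed = (padded[0::2] << 4) | padded[1::2]                # 3 bytes, 6 nibbles
high, low = packed >> 4, packed & 0xF                      # unpack both nibbles
unpacked = torch.stack([high, low], dim=1).flatten()[:5]   # drop the pad nibble
assert torch.equal(unpacked, codes)
```

A kernel that walks whole bytes has no clean stopping point at the dangling nibble, hence the fallback rather than padding inside the fast path.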
@@ -140,35 +148,42 @@ def _(
     if absmax.dtype != torch.float32:
         absmax = absmax.float()

-    A = A.reshape(shape[0], shape[1] // 2)
+    if len(shape) == 1:
+        shape = (1, shape[0])
+
+    m = prod(shape[:-1])
+    n = shape[-1]
+
+    A = A.reshape(m, n // 2)
     out = torch.empty(shape, dtype=dtype, device=A.device)
+
     if quant_type == "fp4":
         if dtype == torch.float32:
             lib.cdequantize_blockwise_cpu_fp4_fp32(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.bfloat16:
             lib.cdequantize_blockwise_cpu_fp4_bf16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.float16:
             lib.cdequantize_blockwise_cpu_fp4_fp16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
     elif quant_type == "nf4":
         if dtype == torch.float32:
@@ -177,26 +192,26 @@ def _(
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.bfloat16:
             lib.cdequantize_blockwise_cpu_nf4_bf16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
         elif dtype == torch.float16:
             lib.cdequantize_blockwise_cpu_nf4_fp16(
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_longlong(blocksize),
-                ct.c_longlong(shape[0]),
-                ct.c_longlong(shape[1]),
+                ct.c_longlong(m),
+                ct.c_longlong(n),
             )
     else:
         raise ValueError
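The kernel previously hard-coded a 2-D view, A.reshape(shape[0], shape[1] // 2), which could not handle 1-D or higher-rank inputs. The change collapses all leading dimensions into m and keeps the last dimension as n; for contiguous row-major data this is equivalent, since the C kernels simply walk m rows of n elements. A standalone sketch of the normalization, mirroring the diff:

```python
from math import prod

def normalize_shape(shape: tuple[int, ...]) -> tuple[int, int]:
    if len(shape) == 1:                 # promote vectors to a single row
        shape = (1, shape[0])
    return prod(shape[:-1]), shape[-1]  # (row count m, row length n)

assert normalize_shape((128,)) == (1, 128)
assert normalize_shape((4, 8, 64)) == (32, 64)  # leading dims collapse into m
```

The output tensor is still allocated with the original shape, so only the C kernel's view of the buffer changes.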

bitsandbytes/backends/default/ops.py

Lines changed: 20 additions & 9 deletions
@@ -232,22 +232,14 @@ def _(
     return packed, absmax.float()


-@register_kernel("bitsandbytes::dequantize_4bit", "default")
-def _(
+def _dequantize_4bit_impl(
     A: torch.Tensor,
     absmax: torch.Tensor,
     blocksize: int,
     quant_type: str,
     shape: Sequence[int],
     dtype: torch.dtype,
 ) -> torch.Tensor:
-    torch._check_is_size(blocksize)
-    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
-    torch._check(
-        dtype in [torch.bfloat16, torch.float16, torch.float32],
-        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
-    )
-
     # Enable non uint8 dtype
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
@@ -283,6 +275,25 @@ def _(
     return out


+@register_kernel("bitsandbytes::dequantize_4bit", "default")
+def _(
+    A: torch.Tensor,
+    absmax: torch.Tensor,
+    blocksize: int,
+    quant_type: str,
+    shape: Sequence[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+    torch._check(
+        dtype in [torch.bfloat16, torch.float16, torch.float32],
+        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+    )
+
+    return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+
+
 @register_kernel("bitsandbytes::gemv_4bit", "default")
 def _(
     A: torch.Tensor,
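The body of the default dequantize_4bit kernel moves into a plain helper, _dequantize_4bit_impl, while the registered wrapper keeps the torch._check validation. This lets the CPU backend's odd-shape fallback call the helper directly, without re-running the checks or re-entering the dispatcher. A minimal sketch of the pattern with stand-in names (not the library's API):

```python
import torch

def _impl(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for _dequantize_4bit_impl: does the work, assumes valid inputs.
    return x.float()

def registered_entry(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for the registered kernel: validate once, then delegate.
    torch._check(x.dtype == torch.uint8, lambda: f"expected uint8, got {x.dtype}")
    return _impl(x)

# A backend that has already validated its arguments (like the CPU odd-shape
# fallback above) can call _impl directly and skip the duplicate checks.
```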
