Commit 3179b42

fix tests
Signed-off-by: jiqing-feng <[email protected]>
1 parent fbb911b commit 3179b42

File tree

3 files changed: +13 -3 lines changed

bitsandbytes/backends/cpu/ops.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def _(
     code: torch.Tensor,
     blocksize: int,
 ) -> torch.Tensor:
-    # Applied from dequantize_4bit
+    assert B.dtype == torch.uint8, "Only support uint8 qweight"
     dtype = A.dtype
     quant_type = "fp4" if code[1] > 0 else "nf4"
     # cpu fused op only support bf16 for now.
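The replaced comment becomes an explicit precondition: the fused CPU kernel reads the packed weight as raw bytes, so B must be typed uint8. A weight held with a non-uint8 quant_storage can still satisfy this check by reinterpreting its storage, which is what the functional.py change below does. A minimal, hypothetical illustration of that byte-view trick (the packed_bf16 tensor is made up for the example, not taken from the commit):

import torch

# Hypothetical stand-in for a 4-bit weight held with quant_storage=torch.bfloat16.
packed_bf16 = torch.randint(0, 256, (64,), dtype=torch.uint8).view(torch.bfloat16)

# .view(torch.uint8) reinterprets the same storage as raw bytes without copying,
# which is enough to satisfy a uint8-only kernel check.
as_bytes = packed_bf16.view(torch.uint8)
assert as_bytes.dtype == torch.uint8
assert as_bytes.data_ptr() == packed_bf16.data_ptr()  # same memory, no copy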

bitsandbytes/functional.py

Lines changed: 8 additions & 2 deletions
@@ -2108,7 +2108,9 @@ def _convert_weight_packed_for_cpu(qweight: torch.Tensor, quant_state: QuantStat
         qweight: (K * N / 2) uint8
         return: packed_weight
     """
-    assert qweight.dtype == torch.uint8, "qweight must be uint8"
+    if qweight.dtype != torch.uint8:
+        quant_state.original_storage_type = qweight.dtype
+        qweight = qweight.view(torch.uint8)
     quant_state.original_dtype = quant_state.dtype
     quant_state.original_nested = quant_state.nested
     quant_state.original_qshape = qweight.shape
@@ -2200,6 +2202,7 @@ def _convert_weight_packed_for_cpu_inverse(
 
     # 2) Best-effort restore of quant_state fields (absmax / dtype / nested flags, etc.)
     recovered_state = quant_state
+    qweight = qweight.to(torch.uint8).reshape(recovered_state.original_qshape)
 
     # quantize absmax
     if recovered_state.original_nested:
@@ -2213,7 +2216,10 @@ def _convert_weight_packed_for_cpu_inverse(
     recovered_state.dtype = recovered_state.original_dtype
     recovered_state.packing_format_for_cpu = False
 
-    return qweight.to(torch.uint8).reshape(recovered_state.original_qshape), recovered_state
+    if getattr(recovered_state, "original_storage_type", None):
+        qweight = qweight.view(recovered_state.original_storage_type)
+
+    return qweight, recovered_state
 
 
 def has_avx512bf16():
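Taken together, the two functions now form a dtype-preserving round trip: _convert_weight_packed_for_cpu records the weight's original storage dtype and flattened byte shape on the quant state before viewing it as uint8, and _convert_weight_packed_for_cpu_inverse reshapes the bytes back and reapplies the recorded dtype. A minimal sketch of that record-and-restore pattern, using a plain namespace in place of QuantState and toy pack/unpack helpers that are not the library's implementation:

import torch
from types import SimpleNamespace

def pack(qweight, state):
    # Record how the caller stored the 4-bit weight, then flatten to raw bytes.
    if qweight.dtype != torch.uint8:
        state.original_storage_type = qweight.dtype  # e.g. torch.bfloat16 storage
        qweight = qweight.view(torch.uint8)          # reinterpret, no copy
    state.original_qshape = qweight.shape
    return qweight, state

def unpack(qweight, state):
    # Undo the packing: restore the byte shape, then the original storage dtype.
    qweight = qweight.to(torch.uint8).reshape(state.original_qshape)
    if getattr(state, "original_storage_type", None):
        qweight = qweight.view(state.original_storage_type)
    return qweight, state

state = SimpleNamespace()
w = torch.randint(0, 256, (128,), dtype=torch.uint8).view(torch.bfloat16)
packed, state = pack(w, state)
restored, state = unpack(packed, state)
assert restored.dtype == w.dtype and restored.shape == w.shape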

tests/test_functional.py

Lines changed: 4 additions & 0 deletions
@@ -1318,6 +1318,10 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
             quant_storage=quant_storage,
         )
         C3 = torch.matmul(A, B.t())
+        # CPU requires convert weight packed for gemv
+        if device == "cpu" and F.has_avx512bf16():
+            qB, state = F._convert_weight_packed_for_cpu(qB, state)
+            qB = qB.t()
         C2 = F.gemv_4bit(A, qB.t(), state=state)
         A.requires_grad = True
         C1 = bnb.matmul_4bit(A, qB.t(), state)
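Put together, the CPU path exercised by the updated test looks roughly like the sketch below; the shapes, dtypes, and nf4 choice are illustrative, _convert_weight_packed_for_cpu is a private helper, and a bitsandbytes build with CPU support is assumed:

import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

# Illustrative shapes; the real test sweeps dim, dtype, storage_type and quant_storage.
A = torch.randn(1, 4096, dtype=torch.bfloat16)
B = torch.randn(1024, 4096, dtype=torch.bfloat16)

qB, state = F.quantize_4bit(B, quant_type="nf4", quant_storage=torch.bfloat16)

# On CPUs with AVX512-BF16, the weight must be repacked before the fused gemv.
if F.has_avx512bf16():
    qB, state = F._convert_weight_packed_for_cpu(qB, state)
    qB = qB.t()

C2 = F.gemv_4bit(A, qB.t(), state=state)
C1 = bnb.matmul_4bit(A, qB.t(), state)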
