@@ -2109,6 +2109,10 @@ def _convert_weight_packed_for_cpu(qweight: torch.Tensor, quant_state: QuantStat
21092109 return: packed_weight
21102110 """
21112111 assert qweight .dtype == torch .uint8 , "qweight must be uint8"
2112+ quant_state .original_dtype = quant_state .dtype
2113+ quant_state .original_nested = quant_state .nested
2114+ quant_state .original_qshape = qweight .shape
2115+
21122116 qweight = qweight .reshape (- 1 )
21132117 unpacked_w = torch .empty (qweight .shape [0 ] * 2 , dtype = torch .int32 , device = qweight .device )
21142118 unpacked_w [1 ::2 ] = qweight & 0xF
@@ -2145,9 +2149,73 @@ def _convert_weight_packed_for_cpu(qweight: torch.Tensor, quant_state: QuantStat
21452149 delattr (quant_state , "state2" )
21462150
21472151 quant_state .dtype = torch .bfloat16
2152+ quant_state .packing_format_for_cpu = True
21482153 return final_qweight , quant_state
21492154
21502155
def _convert_weight_packed_for_cpu_inverse(
    packed_weight: torch.Tensor,
    quant_state: QuantState,
    block_n: int = 32,
) -> tuple[torch.Tensor, QuantState]:
    """
    Best-effort inverse of `_convert_weight_packed_for_cpu`.

    packed_weight: [N, K/2] uint8 tensor produced by
        `_convert_weight_packed_for_cpu` (its `final_qweight` output).
    quant_state: the QuantState that was mutated by
        `_convert_weight_packed_for_cpu`.
    block_n: row-block size used by the forward packing (default 32).

    Returns:
        qweight: uint8 tensor reshaped to `quant_state.original_qshape`.
        recovered_state: the same QuantState object with dtype, absmax and
            nested-quantization fields restored as far as possible.
    """
    assert quant_state.packing_format_for_cpu, "only for packing format"
    assert packed_weight.dtype == torch.uint8
    assert packed_weight.dim() == 2, "packed_weight should be [N, K/2]"

    n_rows, k_half = packed_weight.shape
    k_cols = 2 * k_half
    # Bytes per packed group; each group encodes 64 4-bit values
    # (32 low nibbles followed by 32 high nibbles).
    group = 32

    assert n_rows % block_n == 0, "N must be divisible by block_n"
    assert k_cols % 2 == 0, "K must be even"

    # 1) Split every group of 32 packed bytes back into its 64 nibbles,
    #    low halves first, then high halves.
    grouped = packed_weight.reshape(-1, group)
    nibbles = torch.cat([grouped & 0xF, (grouped >> 4) & 0xF], dim=-1).to(torch.uint8)

    # [N/block_n, K/2, block_n, 2] -> [N/block_n, block_n, K/2, 2] -> [N, K]
    nibbles = (
        nibbles.reshape(n_rows // block_n, k_half, block_n, 2)
        .transpose(-3, -2)
        .contiguous()
        .reshape(n_rows, k_cols)
    )

    # Re-pack adjacent nibble pairs into bytes: even positions supply the
    # high nibble, odd positions the low nibble (mirrors the forward unpack).
    flat = nibbles.reshape(-1).to(torch.int32)
    hi = (flat[::2] & 0xF).to(torch.uint8)
    lo = (flat[1::2] & 0xF).to(torch.uint8)
    repacked = (hi << 4) | lo  # [K*N/2]

    # 2) Best-effort restore of quant_state fields (mutated in place).
    state = quant_state

    if state.original_nested:
        # Re-quantize absmax the way the original nested state stored it.
        absmax = state.absmax.T.reshape(-1).to(state.original_dtype)
        offset = absmax.mean()
        qabsmax, state2 = quantize_blockwise(absmax - offset, blocksize=256)
        state.absmax = qabsmax
        state.offset = offset
        state.state2 = state2

    state.dtype = state.original_dtype
    state.packing_format_for_cpu = False

    return repacked.to(torch.uint8).reshape(state.original_qshape), state
2218+
21512219def has_avx512bf16 ():
21522220 if hasattr (lib , "has_avx512bf16_cpu" ) and lib .has_avx512bf16_cpu ():
21532221 return True
0 commit comments