Commit 8c0271d

Merge branch 'multi-backend-refactor' into xpu
2 parents: b0982fe + d3658c5

File tree

8 files changed: +134 additions, -44 deletions
benchmarking/generation_benchmark.py (new file)

Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
import argparse

import torch
import torch.utils.benchmark as benchmark
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

parser = argparse.ArgumentParser()

parser.add_argument(
    "--model_name", default="meta-llama/Llama-3.1-8B-Instruct", required=False, type=str, help="model_name"
)
parser.add_argument("--quant_type", default="int8", type=str, help="quant type", choices=["int8", "nf4", "fp4"])
parser.add_argument("--device_map", default="cpu", type=str, help="device_map", choices=["cpu", "xpu", "cuda"])
args = parser.parse_args()

model_name = args.model_name
device_map = args.device_map
if args.quant_type == "int8":
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
else:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=args.quant_type,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map=device_map, quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)

output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))


# benchmark the performance
def benchmark_fn(f, *args, **kwargs):
    # Manual warmup
    for _ in range(2):
        f(*args, **kwargs)

    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return t0.blocked_autorange().mean


MAX_NEW_TOKENS = 100

quantized_model_latency = benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)

bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, torch_dtype=torch.bfloat16)
bf16_model_latency = benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)

print(f"bnb model latency: {quantized_model_latency:.3f}")
print(f"bf16 model latency: {bf16_model_latency:.3f}")
print(f"BNB vs. bf16 model speed-up: {(bf16_model_latency / quantized_model_latency):.3f}")

print(f"BNB model memory: {(quantized_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
print(f"bf16 model memory: {(bf16_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
print(
    f"BNB vs. bf16 model memory ratio: {(bf16_model.get_memory_footprint() / quantized_model.get_memory_footprint()):.3f}"
)
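For CPU benchmarking, the usage note added to docs/source/non_cuda_backends.mdx below recommends binding cores to a single socket, e.g. `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4`; the `--device_map xpu` and `--device_map cuda` options select the other supported backends.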

bitsandbytes/backends/cpu.py

Lines changed: 1 addition & 2 deletions
@@ -140,8 +140,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_cpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
-        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type, quant_storage)

     def dequantize_4bit(
         self,

bitsandbytes/backends/cpu_xpu_common.py

Lines changed: 19 additions & 7 deletions
@@ -194,8 +194,10 @@ def int8_linear_matmul_impl(

     A_reshaped = A.reshape(m, k)

-    # torch._int_mm is available on CPU since torch 2.4
-    if _torch_version_prereq(2, 4) and A.device.type == "cpu":
+    # torch._int_mm is available on CPU since torch 2.4, XPU since torch 2.6
+    if (A.device.type == "cpu" and _torch_version_prereq(2, 4)) or (
+        A.device.type == "xpu" and _torch_version_prereq(2, 6)
+    ):
         C = torch._int_mm(A_reshaped, B.T).to(dtype)
     else:
         C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype)

@@ -296,6 +298,7 @@ def quantize_4bit_impl(
     blocksize=64,
     compress_statistics=False,
     quant_type="nf4",
+    quant_storage=torch.uint8,
 ) -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.

@@ -314,6 +317,8 @@ def quantize_4bit_impl(
         The blocksize used in quantization.
     quant_type : str
         The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
+    quant_storage: torch.dtype
+        We can use bytes to convert storage type.

     Returns
     -------

@@ -401,6 +406,10 @@ def quantize_4bit_impl(
         quant_type=quant_type,
     )

+    if quant_storage != torch.uint8:
+        bytes_value = out.cpu().numpy().tobytes()
+        out = torch.frombuffer(bytes_value, dtype=quant_storage).to(A.device)
+
     return out.reshape(-1, 1), state

@@ -418,7 +427,8 @@ def dequant_8bit(A, offset, quant_state):
     return absmax


-@_maybe_torch_compile
+# Compile will fail in torch.frombuffer
+# @_maybe_torch_compile
 def dequantize_4bit_impl(
     A: Tensor,
     quant_state=None,

@@ -428,8 +438,7 @@ def dequantize_4bit_impl(
     quant_type="nf4",
 ) -> Tensor:
     """
-    Dequantizes FP4 blockwise quantized values.
-
+    Dequantizes 4-bit blockwise quantized values.
     Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.

     Parameters

@@ -445,8 +454,7 @@ def dequantize_4bit_impl(
     blocksize : int
         The blocksize used in quantization.
     quant_type : str
-        The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
-
+        The 4-bit quantization data type {fp4, nf4}

     Returns
     -------

@@ -455,6 +463,10 @@ def dequantize_4bit_impl(
     """
     transpose = True if A.shape[0] == 1 else False
     A = A.reshape(-1)
+    device = A.device
+    if A.dtype != torch.uint8:
+        bytes_value = A.cpu().numpy().tobytes()
+        A = torch.frombuffer(bytes_value, dtype=torch.uint8).to(device)

     if quant_state is None:
         assert absmax is not None and out is not None
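The new quant_storage handling in quantize_4bit_impl and dequantize_4bit_impl does not requantize anything; it only reinterprets the packed uint8 buffer under a different storage dtype via torch.frombuffer. A minimal self-contained sketch of that round trip (illustrative only, not bitsandbytes code; float16 stands in for any non-uint8 quant_storage):

    import torch

    # Packed 4-bit payload as produced by quantize_4bit_impl: each uint8 element holds two nibbles.
    packed = torch.arange(16, dtype=torch.uint8)

    # Reinterpret the raw bytes under a wider storage dtype, as quantize_4bit_impl now does when
    # quant_storage != torch.uint8 (the byte count must be divisible by the target element size).
    as_fp16 = torch.frombuffer(bytearray(packed.numpy().tobytes()), dtype=torch.float16)

    # Undo the reinterpretation before dequantizing, mirroring the new check in dequantize_4bit_impl.
    back = torch.frombuffer(bytearray(as_fp16.numpy().tobytes()), dtype=torch.uint8)
    assert torch.equal(back, packed)  # same bytes, only the storage dtype changed

This byte-level detour is also why the @_maybe_torch_compile decorator is disabled above: per the new comment in the diff, compilation fails on torch.frombuffer.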

bitsandbytes/backends/xpu.py

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "XPU backend only supports uint8 quant_storage"
-        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type, quant_storage)
         return output

     def dequantize_4bit(

bitsandbytes/functional.py

Lines changed: 9 additions & 9 deletions
@@ -1076,7 +1076,7 @@ def dequantize_fp4(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = 64,
+    blocksize: Optional[int] = None,
 ) -> torch.Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")

@@ -1086,7 +1086,7 @@ def dequantize_nf4(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = 64,
+    blocksize: Optional[int] = None,
 ) -> torch.Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")

@@ -1096,8 +1096,8 @@ def dequantize_4bit(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = 64,
-    quant_type="fp4",
+    blocksize: Optional[int] = None,
+    quant_type: Optional[str] = "fp4",
 ) -> torch.Tensor:
     """Dequantizes a packed 4-bit quantized tensor.

@@ -1115,9 +1115,9 @@ def dequantize_4bit(
             Required if `quant_state` is not provided and ignored otherwise.
         out (`torch.Tensor`, *optional*): A tensor to use to store the result.
         blocksize (`int`, *optional*):
-            The size of the blocks. Defaults to 64.
+            The size of the blocks. Defaults to 64 if not HIP_ENVIRONMENT else 128.
             Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
-        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
+        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to "fp4".

     Raises:
         ValueError: Raised when the input data type or blocksize is not supported.

@@ -1127,9 +1127,9 @@ def dequantize_4bit(
     """
     ensure_backend_is_available(A.device.type)
     if quant_state is not None:
-        absmax = absmax or quant_state.absmax
-        quant_type = quant_type or quant_state.quant_type
-        blocksize = blocksize or quant_state.blocksize
+        absmax = quant_state.absmax
+        quant_type = quant_state.quant_type
+        blocksize = quant_state.blocksize
     if blocksize is None:
         # Some AMD GPUs have warpsize 64
         # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
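The functional.py change makes blocksize default to None so that a stored QuantState, rather than the signature default, decides the effective block size. A hypothetical standalone helper (not part of the bitsandbytes API) restating that resolution order:

    from typing import Optional

    HIP_ENVIRONMENT = False  # assumption: True on ROCm/HIP builds, where the default blocksize is 128

    def resolve_blocksize(blocksize: Optional[int], quant_state_blocksize: Optional[int]) -> int:
        # Mirrors the updated dequantize_4bit: values carried by quant_state always win,
        # and only a still-unset blocksize falls back to the platform default.
        if quant_state_blocksize is not None:
            return quant_state_blocksize
        if blocksize is not None:
            return blocksize
        return 128 if HIP_ENVIRONMENT else 64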

bitsandbytes/nn/modules.py

Lines changed: 16 additions & 18 deletions
@@ -487,6 +487,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
             self.weight.data = reverse_4bit_compress_format(self.weight.data.reshape(1, -1))

             self.weight.quant_state.ipex = False
+            self.ipex_linear_is_set = False

         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias

@@ -496,14 +497,13 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):

     def set_ipex_linear(self, x: torch.Tensor):
         if (
-            (x.device.type in ("cpu", "xpu"))
-            and not getattr(self.weight.quant_state, "ipex", False)
+            not getattr(self.weight.quant_state, "ipex", False)
+            and self.weight.data.dtype == torch.uint8
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
-            and not self.training
-            and x.requires_grad == False
         ):
-            enable_ipex_fusion(self, x)
+            if x.device.type == "xpu" or (x.device.type == "cpu" and not self.training and x.requires_grad == False):
+                enable_ipex_fusion(self, x)

     def forward(self, x: torch.Tensor):
         # Check if ipex fusion can be used

@@ -700,26 +700,24 @@ def to(self, *args, **kwargs):
         elif device.type == "cpu":
             if self.data.dtype == torch.int8:
                 self.CB = self.data
-                return self
             else:
                 return self.cpu()
         elif device.type == "xpu":
             if self.data.dtype == torch.int8:
-                self.data = self.data.contiguous().xpu(device)
+                self.data = self.data.contiguous()
                 self.CB = self.data
-                return self
-            else:
+            if self.data.device.type == "cpu":
                 return self.xpu(device)
-        else:
-            new_param = Int8Params(
-                super().to(device=device, dtype=dtype, non_blocking=non_blocking),
-                requires_grad=self.requires_grad,
-                has_fp16_weights=self.has_fp16_weights,
-            )
-            new_param.CB = self.CB
-            new_param.SCB = self.SCB

-            return new_param
+        new_param = Int8Params(
+            super().to(device=device, dtype=dtype, non_blocking=non_blocking),
+            requires_grad=self.requires_grad,
+            has_fp16_weights=self.has_fp16_weights,
+        )
+        new_param.CB = self.CB
+        new_param.SCB = self.SCB
+
+        return new_param


 def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
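The rewritten set_ipex_linear splits the gate in two: weight/layout eligibility first, then a device-specific check that lets XPU fuse even during training while CPU still requires inference mode. A hypothetical predicate (an illustrative restatement, not the module's API) spelling out that logic:

    import torch

    def should_enable_ipex_fusion(quant_state, weight_dtype, device_type, training, requires_grad):
        # Weight/layout eligibility, as in the rewritten set_ipex_linear gate.
        eligible = (
            not getattr(quant_state, "ipex", False)
            and weight_dtype == torch.uint8
            and quant_state.shape[1] % quant_state.blocksize == 0
            and quant_state.quant_type == "nf4"
        )
        if not eligible:
            return False
        # XPU may fuse unconditionally; CPU only outside training and when the input needs no grad.
        return device_type == "xpu" or (device_type == "cpu" and not training and not requires_grad)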

docs/source/non_cuda_backends.mdx

Lines changed: 12 additions & 5 deletions
@@ -27,18 +27,25 @@ Thank you for your support!

 ### Intel

-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+The below performance data is collected from the Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+
+You may run `benchmarking/generation_benchmark.py` to reproduce the below model memory and inference results. Please note that you need to bind cores if you are using the CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
+
+The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py).
+
+#### Model memory (CPU)
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Memory (GB) | 15.0 | 8.5 | 5.2 | 5.2 |

 #### Inference (CPU)

 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
-| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.57x | 2.6x | 0.1x |

 #### Fine-Tuning (CPU)

 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
-| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.91x | 1.0x | 1.0x |

setup.py

Lines changed: 9 additions & 1 deletion
@@ -1,11 +1,19 @@
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution

+VERSION = "1.0.0"
+

 # Tested with wheel v0.45.1
 class BinaryDistribution(Distribution):
     def has_ext_modules(self):
         return True


-setup(packages=find_packages(), distclass=BinaryDistribution)
+def write_version_file(version, filepath="bitsandbytes/_version.py"):
+    with open(filepath, "w") as f:
+        f.write(f'__version__ = "{version}"\n')
+    return version
+
+
+setup(packages=find_packages(), distclass=BinaryDistribution, version=write_version_file(VERSION))
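With this change, setup() writes bitsandbytes/_version.py at build time and reuses the same string as the wheel version. A sketch of how the generated module could be consumed afterwards (assumes the package built by this setup.py is installed; whether bitsandbytes/__init__.py re-exports __version__ is not shown in this diff):

    # Reads the file produced by write_version_file("1.0.0", "bitsandbytes/_version.py") above.
    from bitsandbytes._version import __version__

    print(__version__)  # "1.0.0" for this commit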
