
Commit e679ba6

Merge branch 'main' into patch-1
2 parents: 09f58ce + 09ea861

12 files changed (+121 additions, -114 deletions)

.github/scripts/build-cuda.sh

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ if [[ -v cuda_targets ]]; then
 elif [ "${build_arch}" = "aarch64" ]; then
     build_capability="75;80;90"

-    # CUDA 12.8+: Add sm100/sm120
+    # CUDA 12.8-12.9: Add sm100/sm120
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"

     # CUDA 13.0+: Add sm100/sm110/sm120
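Reviewer note: for clarity, here is the same version gate expressed in Python. This is an illustrative sketch, not code from the repo, and it only models the aarch64 branch visible in this hunk; the exact CUDA 13.0+ capability list lies outside the hunk and is only implied by the trailing comment.

    def aarch64_build_capability(cuda_version: str) -> str:
        capability = "75;80;90"
        # CUDA 12.8-12.9: add sm100/sm120 (CUDA 13.0+ extends the list again with sm110)
        if cuda_version.startswith(("12.8.", "12.9.")):
            capability = "75;80;90;100;120"
        return capability

    assert aarch64_build_capability("12.9.1") == "75;80;90;100;120"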

.github/scripts/build-xpu.sh

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+#!/bin/bash
+declare build_os
+
+set -xeuo pipefail
+
+# We currently only build XPU on Linux.
+if [ "${build_os:0:6}" == ubuntu ]; then
+    # TODO: We might want to pre-build this as our own customized image in the future.
+    image=intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04
+    echo "Using image $image"
+    docker run --rm -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
+        && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+            cmake bison intel-fw-gpu intel-ocloc \
+        && cmake -DCOMPUTE_BACKEND=xpu . \
+        && cmake --build . --config Release"
+fi
+
+output_dir="output/${build_os}/x86_64"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
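Reviewer note on the last line: `shopt -s nullglob` makes patterns that match nothing (`*.dylib` and `*.dll` on a Linux build) expand to zero arguments rather than being passed to `cp` literally, and wrapping the pair in `( ... )` keeps the shell option from leaking into the rest of the script.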

.github/workflows/python-package.yml

Lines changed: 19 additions & 0 deletions
@@ -104,6 +104,24 @@ jobs:
           path: output/*
           retention-days: 7

+  build-shared-libs-xpu:
+    strategy:
+      matrix:
+        os: [ubuntu-22.04]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build C++
+        run: bash .github/scripts/build-xpu.sh
+        env:
+          build_os: ${{ matrix.os }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_xpu_${{ matrix.os }}_x86_64
+          path: output/*
+          retention-days: 7
+
   build-shared-libs-rocm:
     strategy:
       matrix:
@@ -153,6 +171,7 @@ jobs:
       - build-shared-libs
       - build-shared-libs-cuda
       - build-shared-libs-rocm
+      - build-shared-libs-xpu
     strategy:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
         message(FATAL_ERROR "XPU is not supported on macOS" )
     endif()
     set(BUILD_CUDA OFF)
+    set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
     set(BUILD_XPU ON)
 else()

README.md

Lines changed: 4 additions & 2 deletions
@@ -92,7 +92,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   <tr>
     <td></td>
     <td>🟪 Intel Gaudi <br><code>hpu</code></td>
-    <td>Gaudi1, Gaudi2, Gaudi3</td>
+    <td>Gaudi2, Gaudi3</td>
     <td>✅</td>
     <td>〰️</td>
     <td>❌</td>
@@ -173,7 +173,9 @@ bitsandbytes has the following minimum requirements for all platforms:
 ## :heart: Sponsors
 The continued maintenance and development of `bitsandbytes` is made possible thanks to the generous support of our sponsors. Their contributions help ensure that we can keep improving the project and delivering valuable updates to the community.

-<a href="https://hf.co" target="_blank"><img width="100" src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg" alt="Hugging Face"></a>
+<kbd><a href="https://hf.co" target="_blank"><img width="100" src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg" alt="Hugging Face"></a></kbd>
+&nbsp;
+<kbd><a href="https://intel.com" target="_blank"><img width="100" src="https://avatars.githubusercontent.com/u/17888862?s=100&v=4" alt="Intel"></a></kbd>

 ## License
 `bitsandbytes` is MIT licensed.

bitsandbytes/backends/hpu/ops.py

Lines changed: 9 additions & 2 deletions
@@ -3,12 +3,19 @@

 import torch

-from bitsandbytes.utils import _reverse_4bit_compress_format
-
 from ..._ops import register_kernel
 from ..utils import GAUDI_SW_VER


+# Convert between the standard 4-bit compression format and the IPEX compression
+# format. Needed for backward compatibility with older versions of the Gaudi SW stack.
+def _reverse_4bit_compress_format(weight: torch.Tensor):
+    out_1 = (weight & 0xF0) >> 4
+    out_2 = (weight & 0xF) << 4
+    out = out_1 | out_2
+    return out
+
+
 @register_kernel("bitsandbytes::dequantize_4bit", "hpu")
 def _(
     A: torch.Tensor,
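Reviewer note: the inlined helper is a per-byte nibble swap, and applying it twice returns the original tensor, which is why a single function can convert in either direction between the standard and IPEX layouts. A quick illustration (not part of the commit):

    import torch

    # Swap the two 4-bit values inside each byte, as _reverse_4bit_compress_format does.
    w = torch.tensor([0xAB, 0x1F], dtype=torch.uint8)
    swapped = ((w & 0xF0) >> 4) | ((w & 0x0F) << 4)
    print([hex(v) for v in swapped.tolist()])  # ['0xba', '0xf1']

    restored = ((swapped & 0xF0) >> 4) | ((swapped & 0x0F) << 4)
    assert torch.equal(restored, w)  # the swap is its own inverse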

bitsandbytes/cextension.py

Lines changed: 0 additions & 3 deletions
@@ -294,9 +294,6 @@ def get_native_library() -> BNBNativeLibrary:
     if hasattr(dll, "get_context"):  # only a CUDA-built library exposes this
         return CudaBNBNativeLibrary(dll)

-    # TODO: Remove this log for XPU after 8-bit optimizer is supported
-    logger.warning("The 8-bit optimizer is not available on your device, only available on CUDA for now.")
-
     return BNBNativeLibrary(dll)

csrc/kernels.hip

Lines changed: 52 additions & 100 deletions
@@ -19,37 +19,42 @@
 #define NUM 4
 #define NUM_BLOCK 4096

-__device__ static float nf4_data[16] = {-1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453, -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0, 0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224, 0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0};
+__device__ static float fp4_dequantization_lut[8] = {
+    0.0f,            // 0b000
+    0.005208333333f, // 0b001
+    0.66666667f,     // 0b010
+    1.0f,            // 0b011
+    0.33333333f,     // 0b100
+    0.5f,            // 0b101
+    0.16666667f,     // 0b110
+    0.25f            // 0b111
+};
+
+__device__ static float nf4_dequantization_lut[16] = {
+    -1.0f,                 // 0b0000
+    -0.6961928009986877f,  // 0b0001
+    -0.5250730514526367f,  // 0b0010
+    -0.39491748809814453f, // 0b0011
+    -0.28444138169288635f, // 0b0100
+    -0.18477343022823334f, // 0b0101
+    -0.09105003625154495f, // 0b0110
+    0.0f,                  // 0b0111
+    0.07958029955625534f,  // 0b1000
+    0.16093020141124725f,  // 0b1001
+    0.24611230194568634f,  // 0b1010
+    0.33791524171829224f,  // 0b1011
+    0.44070982933044434f,  // 0b1100
+    0.5626170039176941f,   // 0b1101
+    0.7229568362236023f,   // 0b1110
+    1.0f                   // 0b1111
+};

 // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
 // Luckily we have atomicmax and atomicmin in ROCm

-__device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
-{
-    float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
-    if((val & 0b0100) == 4) // 0
-        if((val & 0b0010) == 2) //01
-            if((val & 0b0001) == 1) // 111
-                return 0.25000000f*absmax*sign; // 1111
-            else
-                return 0.16666667f*absmax*sign; // 1110
-        else
-            if((val & 0b0001) == 1) // 110
-                return 0.50000000f*absmax*sign; // 1101
-            else
-                return 0.33333333f*absmax*sign; // 1100
-    else
-        if((val & 0b0010) == 2) //10
-            if((val & 0b0001) == 1) // 101
-                return 1.00000000f*absmax*sign; // 1011
-            else
-                return 0.66666667f*absmax*sign; // 1010
-        else
-            if((val & 0b0001) == 1) // 100
-                return 5.208333333e-03f*absmax*sign; // 1001
-            else
-                return 0.00000000f*absmax*sign; // 1000
+__device__ __forceinline__ float dDequantizeFP4Tree(unsigned char val) {
+    float sign = 1.0f - 2 * ((val & 0b1000) >> 3);
+    return fp4_dequantization_lut[val & 0b111] * sign;
 }

 __device__ unsigned char dQuantizeFP4(float x)
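Reviewer note: the rewritten dDequantizeFP4Tree drops both the branches and the absmax multiply (the caller now scales instead). A minimal sketch (Python, not from the repo) checking that the branchless LUT form matches the deleted tree for all 16 FP4 codes; the tree is transcribed from the removed lines:

    FP4_LUT = [0.0, 0.005208333333, 0.66666667, 1.0,
               0.33333333, 0.5, 0.16666667, 0.25]

    def dequantize_fp4_lut(val: int) -> float:
        sign = 1.0 - 2 * ((val & 0b1000) >> 3)  # bit 3 set -> -1.0, clear -> +1.0
        return FP4_LUT[val & 0b111] * sign

    def dequantize_fp4_tree(val: int) -> float:
        # Transcription of the removed if/else tree.
        sign = -1.0 if (val & 0b1000) == 8 else 1.0
        if val & 0b0100:
            if val & 0b0010:
                return (0.25 if val & 0b0001 else 0.16666667) * sign
            return (0.5 if val & 0b0001 else 0.33333333) * sign
        if val & 0b0010:
            return (1.0 if val & 0b0001 else 0.66666667) * sign
        return (5.208333333e-03 if val & 0b0001 else 0.0) * sign

    assert all(dequantize_fp4_lut(v) == dequantize_fp4_tree(v) for v in range(16))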
@@ -101,61 +106,7 @@ __device__ unsigned char dQuantizeFP4(float x)
     return 0b0000+sign;
 }

-__device__ __forceinline__ float dDequantizeNF4(unsigned char val)
-{
-    // the values for this tree was generated by test_normal_map_tree
-    // in the file tests/test_functional.py
-    if((val & 0b1000) == 8)
-        if((val & 0b0100) == 4) // 1
-            if((val & 0b0010) == 2) // 11
-                if((val & 0b0001) == 1) // 111
-                    return 1.0f;
-                else
-                    return 0.7229568362236023f;
-            else
-                if((val & 0b0001) == 1) // 110
-                    return 0.5626170039176941f;
-                else
-                    return 0.44070982933044434f;
-        else
-            if((val & 0b0010) == 2) //10
-                if((val & 0b0001) == 1) // 101
-                    return 0.33791524171829224f;
-                else
-                    return 0.24611230194568634f;
-            else
-                if((val & 0b0001) == 1) // 100
-                    return 0.16093020141124725f;
-                else
-                    return 0.07958029955625534f;
-    else
-        if((val & 0b0100) == 4) // 0
-            if((val & 0b0010) == 2) //01
-                if((val & 0b0001) == 1) // 011
-                    return 0.0f;
-                else
-                    return -0.09105003625154495f;
-            else
-                if((val & 0b0001) == 1) // 010
-                    return -0.18477343022823334f;
-                else
-                    return -0.28444138169288635f;
-        else
-            if((val & 0b0010) == 2) //00
-                if((val & 0b0001) == 1) // 001
-                    return -0.39491748809814453f;
-                else
-                    return -0.5250730514526367f;
-            else
-                if((val & 0b0001) == 1) // 000
-                    return -0.6961928009986877f;
-                else
-                    return -1.0f;
-}
+__device__ __forceinline__ float dDequantizeNF4(unsigned char val) { return nf4_dequantization_lut[val & 0x0F]; }

 __device__ unsigned char dQuantizeNF4(float x)
 {
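Reviewer note: the NF4 path gets the same treatment, becoming a pure table lookup on the low four bits. A small sketch (Python, not from the repo) of what the one-liner computes for one packed byte, high nibble first:

    NF4_LUT = [
        -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
        -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
        0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
        0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
        0.7229568362236023, 1.0,
    ]

    def dequantize_nf4_byte(packed: int, absmax: float):
        # Each byte holds two 4-bit NF4 codes; decode the high nibble, then the low.
        return NF4_LUT[packed >> 4] * absmax, NF4_LUT[packed & 0x0F] * absmax

    print(dequantize_nf4_byte(0xF0, 2.0))  # (2.0, -2.0): code 0b1111 -> 1.0, 0b0000 -> -1.0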
@@ -456,7 +407,6 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
         LoadFloat(loadf).Load(&rand[local_rand_idx], rand_vals, BLOCK_SIZE, 0);
     }

-    unsigned char packed_4bit = 0;
     switch(DATA_TYPE)
     {
         case General8bit:
@@ -473,18 +423,16 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
             #pragma unroll NUM_PER_TH
             for(int j = 0; j < NUM_PER_TH/2; j++)
             {
-                packed_4bit |= dQuantizeFP4(((float)vals[2*j])*local_abs_max) << 4;
-                packed_4bit |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max);
-                qvals[j] = packed_4bit;
+                qvals[j] = dQuantizeFP4(((float)vals[2 * j]) * local_abs_max) << 4;
+                qvals[j] |= dQuantizeFP4(((float)vals[2 * j + 1]) * local_abs_max);
             }
             break;
         case NF4:
             #pragma unroll NUM_PER_TH
             for(int j = 0; j < NUM_PER_TH/2; j++)
             {
-                packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4;
-                packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max);
-                qvals[j] = packed_4bit;
+                qvals[j] = dQuantizeNF4(((float)vals[2 * j]) * local_abs_max) << 4;
+                qvals[j] |= dQuantizeNF4(((float)vals[2 * j + 1]) * local_abs_max);
             }
             break;
     }
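Reviewer note: beyond dropping the temporary, this rewrite matters for correctness. The old pattern OR'd into packed_4bit without clearing it between iterations of the j loop, which would let stale bits from earlier byte pairs leak into later ones; writing the high nibble with = and OR-ing in the low nibble starts each output byte clean. A sketch of the packing scheme (Python, illustrative only):

    def pack_pairs(codes):
        # Pack two 4-bit quantization codes per output byte, high nibble first.
        out = bytearray()
        for hi, lo in zip(codes[0::2], codes[1::2]):
            out.append(((hi & 0x0F) << 4) | (lo & 0x0F))  # fresh byte for each pair
        return bytes(out)

    assert pack_pairs([0b0001, 0b1111, 0b1000, 0b0000]) == bytes([0x1F, 0x80])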
@@ -546,8 +494,8 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
             #pragma unroll NUM_PER_TH
             for(int j = 0; j < NUM_PER_TH; j++)
             {
-                vals[j*2] = dDequantizeFP4Tree(qvals[j] >> 4, local_abs_max);
-                vals[j*2 + 1] = dDequantizeFP4Tree(qvals[j] & 0x0F, local_abs_max);
+                vals[j * 2] = dDequantizeFP4Tree(qvals[j] >> 4) * local_abs_max;
+                vals[j * 2 + 1] = dDequantizeFP4Tree(qvals[j] & 0x0F) * local_abs_max;
             }
             break;
         case NF4:
@@ -2109,7 +2057,11 @@ __global__ void kdequant_mm_int32_fp16(
 #define DENORM 1.0f/127.0f
 #define MAX_SPARSE_COUNT 32
 #define SMEM_SIZE 8*256
-#define WARP_SIZE warpSize
+#if defined(__GFX9__)
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
 template <typename T, int SPMM_ITEMS, int BITS>
 __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB)
 {
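Reviewer note: the switch from warpSize to a preprocessor constant is deliberate. In HIP, warpSize is a device-side value rather than a compile-time constant, so it cannot appear where a constant expression is required, such as the hipcub::WarpReduce<float, WARP_SIZE> template argument further down. The #if defined(__GFX9__) guard hard-codes the wavefront width instead: 64 on gfx9 (wave64) targets, 32 elsewhere.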
@@ -2503,7 +2455,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i

     #pragma unroll 16
     for(int i = 0; i < 16; i++)
-        quant_map[i] = nf4_data[i];
+        quant_map[i] = nf4_dequantization_lut[i];
     //__shared__ T quant_map[16*160];

     T local_A[2];
@@ -2708,13 +2660,13 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
     // load step-by-step in chunks of [warp_size,warps]: 1xwarp_size * [warp_size,warps] -> [1,warps]
     // 4 warps -> 4 loads per iter
     // 1xwarp_size * warp_sizex4 -> 1x4 outputs per thread block
-    typedef hipcub::WarpReduce<float, warpSize> WarpReduce;
-    __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/warpSize];
+    typedef hipcub::WarpReduce<float, WARP_SIZE> WarpReduce;
+    __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/WARP_SIZE];

-    const int warp_idx = threadIdx.x / warpSize;
-    const int warp_lane = threadIdx.x % warpSize;
-    const int row_B = (THREADS/warpSize)*blockIdx.x + warp_idx;
-    const int offset_B = ldb*row_B;
+    const int warp_idx = threadIdx.x / WARP_SIZE;
+    const int warp_lane = threadIdx.x % WARP_SIZE;
+    const int row_B = (THREADS/WARP_SIZE)*blockIdx.x + warp_idx;
+    const int offset_B = ldb * row_B;
     const int num_values_8bit = num_values_4bit/2;
     float local_C = 0.0f;

@@ -2732,7 +2684,7 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc

     // A: [1, K]
     // B: [M, K]
-    for(int inner_idx = warp_lane*num_values_4bit; inner_idx < K; inner_idx += warpSize*num_values_4bit)
+    for(int inner_idx = warp_lane*num_values_4bit; inner_idx < K; inner_idx += WARP_SIZE*num_values_4bit)
     {
         const int inner_idx_halved = inner_idx/2;

csrc/ops.hip

Lines changed: 7 additions & 1 deletion
@@ -20,6 +20,12 @@

 #define ERR_NOT_IMPLEMENTED 100

+#if defined(__GFX9__)
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+
 using namespace BinSearch;
 using std::cout;
 using std::endl;
@@ -692,7 +698,7 @@ template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int
     //warpsize - 32
     int num_blocks = (m+3)/4;
     //warpsize - 64
-    if (warpSize == 64) {
+    if (WARP_SIZE == 64) {
         num_blocks = (m+1)/2;
     }
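Reviewer note: the grid-size tweak follows from the same wavefront math. With a fixed thread count per block, a wave64 device fits half as many warps per block, and each warp produces one output row. A sketch of the arithmetic (Python; the 128-threads-per-block figure is an assumption inferred from the 4-rows-vs-2-rows split in the two branches):

    def num_blocks(m, warp_size, threads_per_block=128):  # threads_per_block is assumed
        rows_per_block = threads_per_block // warp_size    # one output row per warp
        return (m + rows_per_block - 1) // rows_per_block  # ceil division

    assert num_blocks(5, 32) == (5 + 3) // 4  # matches the wave32 branch
    assert num_blocks(5, 64) == (5 + 1) // 2  # matches the wave64 branch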
