
Commit a60d565
Merge branch 'bitsandbytes-foundation:main' into main
2 parents: 555c6df + c9bce2b


42 files changed: +3485 −667 lines. (Large commit: only part of the diff is shown below.)

.github/scripts/build-cuda.sh
Lines changed: 2 additions & 2 deletions

@@ -14,8 +14,8 @@ elif [ "${build_arch}" = "aarch64" ]; then
     # CUDA 12.8+: Add sm100/sm120
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
 else
-    # By default, target Maxwell through Hopper.
-    build_capability="50;60;70;75;80;86;89;90"
+    # By default, target Pascal through Hopper.
+    build_capability="60;70;75;80;86;89;90"
 
     # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"

.github/workflows/tests.yml
Lines changed: 10 additions & 32 deletions

@@ -102,7 +102,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.2.2", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64

@@ -118,7 +118,7 @@ jobs:
             arch: arm64
       exclude:
         - os: ubuntu-22.04-arm
-          torch_version: "2.2.2"
+          torch_version: "2.3.1"
 
     runs-on: ${{ matrix.runner || matrix.os }}
     env:

@@ -144,13 +144,14 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.8.0' && 'test/cpu') || 'cpu' }}
+          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
           pip install -e ".[test]"
           pip install pytest-cov
 
-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+      # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
+      # See: https://github.com/pytorch/pytorch/issues/131668
       - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
+        if: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
         run: pip install "numpy<2"
 
       - name: Show installed packages

@@ -162,7 +163,7 @@ jobs:
       - name: Run tests
         run: pytest --durations=100
 
-  test-cpu-ipex:
+  test-cpu-intel:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     runs-on: banb-aws-general-8-plus-use1-public-80

@@ -186,7 +187,6 @@ jobs:
      - name: Install dependencies
        run: |
          pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
          pip install -e ".[test]"
          pip install pytest-cov

@@ -196,9 +196,6 @@ jobs:
       - name: Show environment information
         run: python -m torch.utils.collect_env
 
-      - name: IPEX smoke test
-        run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"
-
       - name: Run tests
         run: pytest --durations=100
 

@@ -286,15 +283,6 @@ jobs:
       fail-fast: false
       matrix:
         torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
-        ipex: [false]
-        # ipex: [true, false]
-        # include:
-        #   - torch_version: "2.6.0"
-        #     ipex: true
-        #     ipex_version: "2.6.10+xpu"
-        #   - torch_version: "2.7.1"
-        #     ipex: true
-        #     ipex_version: "2.7.10+xpu"
     runs-on:
       group: bandb-itac-bmsprpvc1550-8-1gpu
     env:

@@ -330,10 +318,6 @@ jobs:
       - name: Install PyTorch
         run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
 
-      - name: Install IPEX
-        if: matrix.ipex == true
-        run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
       - name: Install dependencies
         run: |
           pip install -e ".[test]"

@@ -362,7 +346,7 @@ jobs:
         cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
         include:
           - cuda_version: "11.8.0"
-            torch_version: "2.2.2"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
             torch_version: "2.6.0"

@@ -372,7 +356,7 @@ jobs:
             pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
-            pypi_index: "https://download.pytorch.org/whl/test/cu129"
+            pypi_index: "https://download.pytorch.org/whl/cu129"
 
 
           # Linux L40S runners

@@ -391,7 +375,7 @@ jobs:
             gpu: T4
             runner: CUDA-Windows-x64
             cuda_version: "11.8.0"
-            torch_version: "2.2.0"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - os: windows-2025
             arch: x86_64

@@ -447,12 +431,6 @@ jobs:
           pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
           pip install -e ".[test]"
           pip install pytest-cov
-
-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
-      - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
-        run: pip install "numpy<2"
-
       - name: Show installed packages
         run: pip list
 
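The reworked NumPy step narrows an environment constraint: Windows wheels for torch older than 2.4.1 were built against NumPy 1.x (pytorch/pytorch#131668), so only the Windows + torch 2.3 cell of the matrix still needs the pin. A hypothetical Python mirror of that `if:` expression, useful when reproducing the CI environment locally:

import platform

def needs_numpy_pin(torch_version: str) -> bool:
    # Mirrors: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
    return platform.system() == "Windows" and torch_version.startswith("2.3.")

if needs_numpy_pin("2.3.1"):
    print('pip install "numpy<2"')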

CMakeLists.txt
Lines changed: 29 additions & 2 deletions

@@ -28,11 +28,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
+set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, xpu)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps xpu)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)

@@ -64,10 +65,18 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS ON)
+elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
+    if(APPLE)
+        message(FATAL_ERROR "XPU is not supported on macOS" )
+    endif()
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS OFF)
+    set(BUILD_XPU ON)
 else()
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
+    set(BUILD_XPU OFF)
 endif()
 
 

@@ -222,6 +231,15 @@ elseif(BUILD_MPS)
         COMMENT "Compiling Metal kernels"
         VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+elseif(BUILD_XPU)
+    list(APPEND SRC_FILES ${XPU_FILES})
+    string(APPEND BNB_OUTPUT_NAME "_xpu")
+    add_compile_definitions(BUILD_XPU)
+    set(CMAKE_C_COMPILER icx)
+    set(CMAKE_CXX_COMPILER icpx)
+    if(WIN32)
+        set(CMAKE_CXX_COMPILER icx)
+    endif()
 else()
     string(APPEND BNB_OUTPUT_NAME "_cpu")
     set(GPU_SOURCES)

@@ -290,6 +308,15 @@ if(BUILD_MPS)
     add_dependencies(bitsandbytes metallib)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 endif()
+if(BUILD_XPU)
+    set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
+    set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;")
+
+    set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
+    target_compile_options(bitsandbytes PRIVATE ${SYCL_COMPILE_FLAGS})
+    target_link_options(bitsandbytes PRIVATE ${SYCL_LINK_FLAGS})
+
+endif()
 
 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
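The new branch compiles the library with Intel's icx/icpx SYCL compilers and emits a `_xpu`-suffixed binary alongside the existing backends. A minimal runtime smoke test for such a build might look like the following; it assumes an XPU-enabled PyTorch that exposes the `torch.xpu` namespace (shipped with recent XPU wheels), and is a sketch rather than part of this commit:

import torch

if torch.xpu.is_available():
    x = torch.randn(8, 8, device="xpu")
    # Confirms that device allocation and a simple kernel launch work.
    print(torch.xpu.get_device_name(0), x.sum().item())
else:
    print("No XPU device visible; the CPU build would be used instead.")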

README.md
Lines changed: 5 additions & 5 deletions

@@ -20,7 +20,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 bitsandbytes has the following minimum requirements for all platforms:
 
 * Python 3.9+
-* [PyTorch](https://pytorch.org/get-started/locally/) 2.2+
+* [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
   * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
 
 #### Accelerator support:

@@ -61,7 +61,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   <tr>
     <td></td>
     <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-    <td>SM50+ minimum<br>SM75+ recommended</td>
+    <td>SM60+ minimum<br>SM75+ recommended</td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>

@@ -87,7 +87,7 @@ bitsandbytes has the following minimum requirements for all platforms:
     </td>
     <td>✅</td>
     <td>✅</td>
-    <td>🚧</td>
+    <td>〰️</td>
   </tr>
   <tr>
     <td></td>

@@ -127,7 +127,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   <tr>
     <td></td>
     <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-    <td>SM50+ minimum<br>SM75+ recommended</td>
+    <td>SM60+ minimum<br>SM75+ recommended</td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>

@@ -141,7 +141,7 @@ bitsandbytes has the following minimum requirements for all platforms:
     </td>
     <td>✅</td>
     <td>✅</td>
-    <td>🚧</td>
+    <td>〰️</td>
   </tr>
   <tr>
     <td colspan="6">🍎 <strong>macOS 14+</strong></td>

_typos.toml
Lines changed: 7 additions & 0 deletions

@@ -1,4 +1,11 @@
 [files]
+# Skip these files in typo checks
+extend-exclude = [
+    "csrc/xpu_ops.h",
+    "csrc/xpu_ops.cpp",
+    "csrc/xpu_kernels.h",
+    "csrc/xpu_kernels.cpp"
+]
 
 [default]
 extend-ignore-re = [

bitsandbytes/_ops.py
Lines changed: 0 additions & 21 deletions

@@ -4,8 +4,6 @@
 
 import torch
 
-from .cextension import ipex_cpu, ipex_xpu
-
 _IS_TORCH_GTE_24 = False
 
 if hasattr(torch.library, "register_fake"):

@@ -331,25 +329,6 @@ def _(
     torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
 
 
-if ipex_cpu or ipex_xpu:
-    # Register the dequantize_nf4_ipex implementation
-    torch.library.define(
-        "bitsandbytes::dequantize_nf4_ipex",
-        "(Tensor A, Tensor absmax, int blocksize, int[] shape, ScalarType dtype) -> Tensor",
-    )
-
-    @register_fake("bitsandbytes::dequantize_nf4_ipex")
-    def _(
-        A: torch.Tensor,
-        absmax: torch.Tensor,
-        blocksize: int,
-        shape: Sequence[int],
-        dtype: torch.dtype,
-    ) -> torch.Tensor:
-        torch._check_is_size(blocksize)
-        return torch.empty(shape, dtype=dtype, device=A.device)
-
-
 torch.library.define(
     "bitsandbytes::optimizer_update_32bit",
     "(str optimizer_name, Tensor(a0!) g, Tensor(a1!) p, Tensor(a2!) state1, Tensor(a3!)? state2, Tensor(a4!)? unorm_vec, float max_unorm, float param_norm, float beta1, float beta2, float beta3, float alpha, float eps, float weight_decay, int step, float lr, float gnorm_scale, bool skip_zeros=False) -> ()",

bitsandbytes/autograd/_functions.py
Lines changed: 2 additions & 20 deletions

@@ -8,7 +8,6 @@
 from typing_extensions import deprecated
 
 import bitsandbytes.functional as F
-from bitsandbytes.functional import ipex_cpu, ipex_xpu
 
 # The inverse transformation for the colTuring and colAmpere format were contributed by Alex Borzunov:
 # https://github.com/bigscience-workshop/petals/blob/main/src/petals/utils/linear8bitlt_patch.py

@@ -85,11 +84,7 @@ def get_inverse_transform_indices(
     return permuted_tile_indices
 
 
-# torch.compiler.is_compiling() is available only in torch >= 2.3
-if hasattr(torch.compiler, "is_compiling"):
-    _is_compiling = torch.compiler.is_compiling
-else:
-    _is_compiling = torch._dynamo.is_compiling
+_is_compiling = torch.compiler.is_compiling
 
 
 @deprecated(

@@ -320,8 +315,6 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
 
         CB = state.CB.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
         output = torch.nn.functional.linear(A, CB, bias)
-        # to pass the test: tests/test_modules.py::test_linear8bitlt_no_fp16_weights[2.0-xpu]
-        state.idx = False
         ctx.state = state
         ctx.dtype_A = A.dtype
         ctx.grad_shape = A.shape

@@ -426,7 +419,7 @@ def matmul(
     state.threshold = threshold
     # MatMul8bitLt is slower because no fast kernel for quant/dequant 8bit in CPU/XPU
     if state.is_training:
-        if (A.device.type == "cpu" and ipex_cpu) or (A.device.type == "xpu" and ipex_xpu):
+        if A.device.type in ("cpu", "xpu"):
             return MatMul8bitFp.apply(A, B, out, bias, state)
     return MatMul8bitLt.apply(A, B, out, bias, state)
 

@@ -440,17 +433,6 @@ def matmul_4bit(
 ):
     assert quant_state is not None
 
-    if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
-        if getattr(quant_state, "ipex", False):
-            # IPEX CPU will change weight to 4D so don't need transpose
-            B = B.t() if B.dim() == 2 else B
-            out = F.gemv_4bit(A, B, out, state=quant_state)
-            if bias is not None:
-                out += bias
-            return out
-        else:
-            return MatMul4Bit.apply(A, B, out, bias, quant_state)
-
     if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
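The compatibility shim could be deleted because `torch.compiler.is_compiling()` has existed since torch 2.3, the new minimum version. For context, the typical use of this flag is to skip graph-unfriendly work while Dynamo is tracing; a small sketch with an illustrative helper (not from this commit):

import torch

def debug_log(msg: str) -> None:
    # Printing inside a traced region would force a graph break,
    # so only log when running eagerly.
    if not torch.compiler.is_compiling():
        print(msg)

@torch.compile
def double(x: torch.Tensor) -> torch.Tensor:
    debug_log("called double")  # silent during compilation
    return x * 2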
