bitsandbytes-foundation
diff --git a/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions b/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/scripts/build-rocm.sh‎
Lines changed: 14 additions & 7 deletions b/‎.github/scripts/build-rocm.sh‎
Lines changed: 14 additions & 7 deletions
diff --git a/‎.github/scripts/build-xpu-windows.bat‎
Lines changed: 34 additions & 0 deletions b/‎.github/scripts/build-xpu-windows.bat‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 26 additions & 20 deletions b/‎.github/workflows/python-package.yml‎
Lines changed: 26 additions & 20 deletions
diff --git a/‎.github/workflows/tests.yml‎
Lines changed: 19 additions & 21 deletions b/‎.github/workflows/tests.yml‎
Lines changed: 19 additions & 21 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 36 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarking/matmul_benchmark.py‎
Lines changed: 4 additions & 4 deletions b/‎benchmarking/matmul_benchmark.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎bitsandbytes/__init__.py‎
Lines changed: 2 additions & 5 deletions b/‎bitsandbytes/__init__.py‎
Lines changed: 2 additions & 5 deletions
@@ -0,0 +1 @@
+*.bat text eol=crlf
@@ -4,14 +4,21 @@ declare build_os
 declare rocm_version
 
 set -xeuo pipefail
-bnb_rocm_arch="gfx90a;gfx942;gfx1100"
+bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
+
+# ROCm 6.4+ - Add gfx1200/gfx1201. Note we assume >=6.4.1.
+[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
+
+# ROCm 7.0+ - Add gfx950
+[[ "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
+
 if [ "${build_os:0:6}" == ubuntu ]; then
-	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
-	echo "Using image $image"
-	docker run --rm --platform "linux/$build_arch" -i \
-		-w /src -v "$PWD:/src" "$image" sh -c \
-		"apt-get update \
-      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
+    echo "Using image $image"
+    docker run --rm --platform "linux/$build_arch" -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
+      && pip install cmake==3.31.6 \
       && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
 fi
 
@@ -0,0 +1,43 @@
+rem Installs Intel Deep Learning Essentials, then configures and builds the XPU backend with CMake and Ninja.
+set INTEL_DLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set INTEL_DLE_TMP=%RUNNER_TEMP%\intel_dle
+set INTEL_DLE_LOG=%RUNNER_TEMP%\intel_dle_log.txt
+
+echo ::group::Intel Deep Learning Essentials Installation
+rem --fail makes curl exit non-zero on an HTTP error instead of saving the error page as the installer.
+curl --fail --location -o intel-dle-installer.exe %INTEL_DLE_URL%
+if ERRORLEVEL 1 (
+    echo Failed to download Intel Deep Learning Essentials
+    exit /b 1
+)
+start /wait "Intel DLE Install" intel-dle-installer.exe -f %INTEL_DLE_TMP% -l %INTEL_DLE_LOG% --silent -a --eula=accept -p=NEED_VS2022_INTEGRATION=0
+rem Save the installer exit code before printing the log, because type replaces ERRORLEVEL with its own result.
+set INTEL_DLE_RC=%ERRORLEVEL%
+type %INTEL_DLE_LOG%
+if %INTEL_DLE_RC% GEQ 1 (
+    echo Failed to install Intel Deep Learning Essentials
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Build Environment Setup
+call "%ProgramFiles(x86)%\Intel\oneAPI\setvars.bat"
+cmake -G Ninja -DCOMPUTE_BACKEND=xpu -DCMAKE_BUILD_TYPE=Release .
+if ERRORLEVEL 1 (
+    echo Failed to setup environment
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Building with XPU backend
+cmake --build . --config Release
+if ERRORLEVEL 1 (
+    echo Build failed
+    exit /b 1
+)
+echo ::endgroup::
+
+rem Stage the built DLLs under output, keyed by the build_os value passed in from the workflow.
+set output_dir=output\%build_os%\x86_64
+if not exist "%output_dir%" mkdir "%output_dir%"
+copy bitsandbytes\*.dll "%output_dir%\" 2>nul
@@ -110,14 +110,21 @@ jobs:
   build-xpu:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-22.04, windows-2025]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
-      - name: Build C++
+      - name: Build C++ (Linux)
+        if: runner.os == 'Linux'
         run: bash .github/scripts/build-xpu.sh
         env:
           build_os: ${{ matrix.os }}
+      - name: Build C++ (Windows)
+        if: runner.os == 'Windows'
+        run: .github/scripts/build-xpu-windows.bat
+        shell: cmd
+        env:
+          build_os: ${{ matrix.os }}
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:
@@ -130,30 +137,26 @@ jobs:
       matrix:
         os: [ubuntu-22.04]
         arch: [x86_64]
-        rocm_version:
-          ["6.1.2", "6.2.4", "6.3.4", "6.4.4", "7.0"]
+        rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean up disk space
         run: |
+          echo "Disk space before cleanup:"
+          df -h
+
+          # These are the biggest disk space hogs.
           sudo rm -rf \
-              /usr/share/dotnet \
-              /opt/ghc \
-              "/usr/local/share/boost" \
-              "$AGENT_TOOLSDIRECTORY" \
-              /opt/hostedtoolcache \
-              /opt/google/chrome \
-              /opt/microsoft/msedge \
-              /opt/microsoft/powershell \
-              /opt/pipx \
-              /usr/lib/mono \
-              /usr/local/julia* \
-              /usr/local/lib/android \
-              /usr/local/lib/node_modules \
-              /usr/local/share/chromium \
-              /usr/local/share/powershell \
-              /usr/share/swift
+            /opt/hostedtoolcache/CodeQL \
+            /usr/lib/dotnet \
+            /usr/lib/jvm \
+            /usr/local/.ghcup \
+            /usr/local/lib/android \
+            /usr/share/swift
+
+          echo "Disk space after cleanup:"
+          df -h
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
@@ -168,6 +171,9 @@ jobs:
           retention-days: 7
 
   build-wheels:
+    env:
+      # Skip rebuilding the CPU library when building the wheels.
+      BNB_SKIP_CMAKE: 1
     needs:
       - build-cpu
       - build-cuda
 
@@ -10,6 +10,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  # Skip rebuilding the CPU library when installing the wheels.
+  # We build the libraries in separate jobs and upload as artifacts.
+  BNB_SKIP_CMAKE: 1
+
 jobs:
 
   build-cpu:
@@ -49,8 +54,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        # TODO: Add 13.0.1 when we have runners with new enough drivers.
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm]
         include:
           - os: ubuntu-22.04
@@ -103,7 +107,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.8.0", "2.9.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -146,7 +150,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
@@ -188,7 +192,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       - name: Show installed packages
@@ -263,7 +267,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       - name: Show installed packages
@@ -321,7 +325,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
 
       - name: Show installed packages
@@ -344,26 +348,20 @@ jobs:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
-            torch_version: "2.6.0"
+            torch_version: "2.7.1"
             pypi_index: "https://download.pytorch.org/whl/cu126"
-          - cuda_version: "12.9.1"
-            torch_version: "2.8.0"
-            pypi_index: "https://download.pytorch.org/whl/cu129"
           - cuda_version: "12.8.1"
-            torch_version: "2.9.0"
-            pypi_index: "https://download.pytorch.org/whl/test/cu128"
-
-          # Note: Currently our runners do not have new enough drivers for CUDA 13.
-          # Add this when supported.
-          # - cuda_version: "13.0.1"
-          #   torch_version: "2.9.0"
-          #   pypi_index: "https://download.pytorch.org/whl/test/cu130"
+            torch_version: "2.8.0"
+            pypi_index: "https://download.pytorch.org/whl/cu128"
+          - cuda_version: "13.0.1"
+            torch_version: "2.9.1"
+            pypi_index: "https://download.pytorch.org/whl/cu130"
 
 
           # Linux L40S runners
@@ -438,7 +436,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
-          pip install -e ".[test]"
+          pip install -e ".[test]" -v
           pip install pytest-cov
       - name: Show installed packages
         run: pip list
 
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.2
+    rev: v0.14.3
     hooks:
       - id: ruff
         args:
@@ -17,6 +17,7 @@ repos:
       - id: mixed-line-ending
         args:
           - --fix=lf
+        exclude: '\.bat$'
   - repo: https://github.com/crate-ci/typos
     rev: v1.26.0
     hooks:
 
@@ -78,9 +78,17 @@ else()
     set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
     set(BUILD_XPU OFF)
+    set(BUILD_CPU ON)
 endif()
 
 
+if (BUILD_CPU)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
+    find_package(OpenMP)
+endif()
+
 if(BUILD_CUDA)
     # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
     # Workaround: use --allow-unsupported-compiler
@@ -262,6 +270,34 @@ add_library(bitsandbytes SHARED ${SRC_FILES})
 target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc include)
 
+if (BUILD_CPU)
+    if (OpenMP_CXX_FOUND)  # result of the earlier find_package(OpenMP); OpenMP stays optional
+        target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
+        target_compile_definitions(bitsandbytes PRIVATE HAS_OPENMP)  # target-scoped, like the link/compile options here
+    endif()
+
+    if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))  # GCC/Clang-style -m flags below; MSVC is excluded
+        include(CheckCXXCompilerFlag)
+        check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)  # tests whether the compiler accepts the flag, not the build machine's CPU
+        check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
+        if (HAS_AVX512F_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512f)
+        endif()
+        if (HAS_AVX512BF16_FLAG)
+            target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
+        endif()
+        target_compile_options(
+            bitsandbytes PRIVATE
+            -mprefer-vector-width=256
+            -mfma
+            -mavx2
+            -mlzcnt
+            -mbmi
+            -mbmi2
+        )
+    endif()
+endif()
+
 
 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 ## System Requirements
 bitsandbytes has the following minimum requirements for all platforms:
 
-* Python 3.9+
+* Python 3.10+
 * [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
   * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
 
 
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
     B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
     torch.nn.init.xavier_uniform_(B)
 
-    B_fp4, state = F.quantize_fp4(B)
-    B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
+    _B_fp4, _state = F.quantize_fp4(B)
+    _B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)
 
     B_nf4, state_nf4 = F.quantize_nf4(B)
     B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
         f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
-    CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
-    CB, SCB, _ = F.int8_vectorwise_quant(B)
+    CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
+    CB, _SCB, _ = F.int8_vectorwise_quant(B)
     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
 
@@ -54,10 +54,7 @@ def _import_backends():
     """
     from importlib.metadata import entry_points
 
-    if sys.version_info < (3, 10):
-        extensions = entry_points().get("bitsandbytes.backends", [])
-    else:
-        extensions = entry_points(group="bitsandbytes.backends")
+    extensions = entry_points(group="bitsandbytes.backends")
 
     for ext in extensions:
         try:
@@ -75,4 +72,4 @@ def _import_backends():
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.48.2.dev0"
+__version__ = "0.49.0.dev0"