bitsandbytes-foundation
diff --git a/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions b/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/scripts/build-rocm.sh‎
Lines changed: 13 additions & 6 deletions b/‎.github/scripts/build-rocm.sh‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎.github/scripts/build-xpu-windows.bat‎
Lines changed: 34 additions & 0 deletions b/‎.github/scripts/build-xpu-windows.bat‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 23 additions & 20 deletions b/‎.github/workflows/python-package.yml‎
Lines changed: 23 additions & 20 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarking/matmul_benchmark.py‎
Lines changed: 4 additions & 4 deletions b/‎benchmarking/matmul_benchmark.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎bitsandbytes/__init__.py‎
Lines changed: 2 additions & 5 deletions b/‎bitsandbytes/__init__.py‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎bitsandbytes/autograd/_functions.py‎
Lines changed: 3 additions & 2 deletions b/‎bitsandbytes/autograd/_functions.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎bitsandbytes/backends/utils.py‎
Lines changed: 2 additions & 1 deletion b/‎bitsandbytes/backends/utils.py‎
Lines changed: 2 additions & 1 deletion
@@ -0,0 +1 @@
+*.bat text eol=crlf
@@ -4,13 +4,20 @@ declare build_os
 declare rocm_version
 
 set -xeuo pipefail
-bnb_rocm_arch="gfx90a;gfx942;gfx1100"
+bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
+
+# ROCm 6.4+ - Add gfx1200/gfx1201. Note we assume >=6.4.1.
+[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
+
+# ROCm 7.0+ - Add gfx950
+[[ "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
+
 if [ "${build_os:0:6}" == ubuntu ]; then
-	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
-	echo "Using image $image"
-	docker run --rm --platform "linux/$build_arch" -i \
-		-w /src -v "$PWD:/src" "$image" sh -c \
-		"apt-get update \
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
+    echo "Using image $image"
+    docker run --rm --platform "linux/$build_arch" -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
       && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
 
@@ -0,0 +1,44 @@
+@rem Build the bitsandbytes XPU backend DLL on a Windows GitHub Actions runner.
+@rem Expects RUNNER_TEMP and build_os to be set in the environment.
+set INTEL_DLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set INTEL_DLE_TMP=%RUNNER_TEMP%\intel_dle
+set INTEL_DLE_LOG=%RUNNER_TEMP%\intel_dle_log.txt
+
+echo ::group::Intel Deep Learning Essentials Installation
+@rem --fail makes curl exit non-zero on an HTTP error instead of saving the error page as the installer.
+curl --fail --location -o intel-dle-installer.exe %INTEL_DLE_URL%
+if ERRORLEVEL 1 (
+    echo Failed to download Intel Deep Learning Essentials
+    exit /b 1
+)
+@rem The installer's exit code is saved right after it runs, because printing the log with "type" overwrites ERRORLEVEL.
+start /wait "Intel DLE Install" intel-dle-installer.exe -f %INTEL_DLE_TMP% -l %INTEL_DLE_LOG% --silent -a --eula=accept -p=NEED_VS2022_INTEGRATION=0
+set INTEL_DLE_RC=%ERRORLEVEL%
+if exist %INTEL_DLE_LOG% type %INTEL_DLE_LOG%
+if %INTEL_DLE_RC% GEQ 1 (
+    echo Failed to install Intel Deep Learning Essentials
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Build Environment Setup
+call "%ProgramFiles(x86)%\Intel\oneAPI\setvars.bat"
+cmake -G Ninja -DCOMPUTE_BACKEND=xpu -DCMAKE_BUILD_TYPE=Release .
+if ERRORLEVEL 1 (
+    echo Failed to setup environment
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Building with XPU backend
+cmake --build . --config Release
+if ERRORLEVEL 1 (
+    echo Build failed
+    exit /b 1
+)
+echo ::endgroup::
+
+@rem Copy the built DLLs into the per-OS output directory.
+set output_dir=output\%build_os%\x86_64
+if not exist "%output_dir%" mkdir "%output_dir%"
+copy bitsandbytes\*.dll "%output_dir%\" 2>nul
@@ -110,14 +110,21 @@ jobs:
   build-xpu:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-22.04, windows-2025]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
-      - name: Build C++
+      - name: Build C++ (Linux)
+        if: runner.os == 'Linux'
         run: bash .github/scripts/build-xpu.sh
         env:
           build_os: ${{ matrix.os }}
+      - name: Build C++ (Windows)
+        if: runner.os == 'Windows'
+        run: .github/scripts/build-xpu-windows.bat
+        shell: cmd
+        env:
+          build_os: ${{ matrix.os }}
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:
@@ -130,30 +137,26 @@ jobs:
       matrix:
         os: [ubuntu-22.04]
         arch: [x86_64]
-        rocm_version:
-          ["6.1.2", "6.2.4", "6.3.4", "6.4.4", "7.0"]
+        rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean up disk space
         run: |
+          echo "Disk space before cleanup:"
+          df -h
+
+          # These are the biggest disk space hogs.
           sudo rm -rf \
-              /usr/share/dotnet \
-              /opt/ghc \
-              "/usr/local/share/boost" \
-              "$AGENT_TOOLSDIRECTORY" \
-              /opt/hostedtoolcache \
-              /opt/google/chrome \
-              /opt/microsoft/msedge \
-              /opt/microsoft/powershell \
-              /opt/pipx \
-              /usr/lib/mono \
-              /usr/local/julia* \
-              /usr/local/lib/android \
-              /usr/local/lib/node_modules \
-              /usr/local/share/chromium \
-              /usr/local/share/powershell \
-              /usr/share/swift
+            /opt/hostedtoolcache/CodeQL \
+            /usr/lib/dotnet \
+            /usr/lib/jvm \
+            /usr/local/.ghcup \
+            /usr/local/lib/android \
+            /usr/share/swift
+
+          echo "Disk space after cleanup:"
+          df -h
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
 
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.2
+    rev: v0.14.3
     hooks:
       - id: ruff
         args:
@@ -17,6 +17,7 @@ repos:
       - id: mixed-line-ending
         args:
           - --fix=lf
+        exclude: '\.bat$'
   - repo: https://github.com/crate-ci/typos
     rev: v1.26.0
     hooks:
 
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 ## System Requirements
 bitsandbytes has the following minimum requirements for all platforms:
 
-* Python 3.9+
+* Python 3.10+
 * [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
   * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
 
 
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
     B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
     torch.nn.init.xavier_uniform_(B)
 
-    B_fp4, state = F.quantize_fp4(B)
-    B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
+    _B_fp4, _state = F.quantize_fp4(B)
+    _B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)
 
     B_nf4, state_nf4 = F.quantize_nf4(B)
     B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
         f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )
 
-    CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
-    CB, SCB, _ = F.int8_vectorwise_quant(B)
+    CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
+    CB, _SCB, _ = F.int8_vectorwise_quant(B)
     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
 
@@ -54,10 +54,7 @@ def _import_backends():
     """
     from importlib.metadata import entry_points
 
-    if sys.version_info < (3, 10):
-        extensions = entry_points().get("bitsandbytes.backends", [])
-    else:
-        extensions = entry_points(group="bitsandbytes.backends")
+    extensions = entry_points(group="bitsandbytes.backends")
 
     for ext in extensions:
         try:
@@ -75,4 +72,4 @@ def _import_backends():
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.48.2.dev0"
+__version__ = "0.49.0.dev0"
@@ -1,6 +1,7 @@
+from collections.abc import Callable
 from dataclasses import dataclass
 from math import prod
-from typing import Callable, Optional
+from typing import Optional
 import warnings
 from warnings import warn
 
@@ -257,7 +258,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
 
         req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
-        CAt, subA, A = ctx.tensors
+        CAt, subA, _A = ctx.tensors
         SCAt, idx = ctx.tensor_states
         state: MatmulLtState = ctx.state
         grad_A = grad_B = grad_bias = None
 
@@ -4,9 +4,10 @@
 import torch
 
 try:
-    import triton  # noqa: F401
     import triton.language as tl  # noqa: F401
 
+    import triton  # noqa: F401
+
     triton_available = True
 except ImportError:
     triton_available = False