
Commit 3183abb

Merge branch 'main' into ipex

2 parents cf8bc14 + 1d4ea6a

21 files changed: +351 −662 lines

.github/workflows/tests.yml

Lines changed: 51 additions & 7 deletions

@@ -93,24 +93,32 @@ jobs:
           path: output/${{ matrix.os }}/${{ matrix.arch }}/*
           retention-days: 7

-  cpu-tests:
+  test-cpu:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     strategy:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        torch_version: ["2.6.0", "2.7.0"]
+        # Test with the oldest supported torch version and the two newest.
+        torch_version: ["2.2.2", "2.6.0", "2.7.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
             runner: banb-aws-general-8-plus-use1-public-80
           - os: ubuntu-22.04-arm
             arch: aarch64
+          - os: ubuntu-22.04-arm
+            arch: aarch64
+            torch_version: "2.5.1"
           - os: windows-2025
             arch: x86_64
           - os: macos-15
             arch: arm64
+        exclude:
+          - os: ubuntu-22.04-arm
+            torch_version: "2.2.2"
+
     runs-on: ${{ matrix.runner || matrix.os }}
     env:
       BNB_TEST_DEVICE: cpu

@@ -129,12 +137,21 @@ jobs:
         with:
           python-version: 3.9

+      - name: Setup MSVC
+        if: startsWith(matrix.os, 'windows')
+        uses: ilammy/[email protected] # to use cl for torch.compile
+
       - name: Install dependencies
         run: |
           pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
           pip install -e ".[test]"
           pip install pytest-cov

+      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+      - name: Downgrade NumPy
+        if: startsWith(matrix.torch_version, '2.2.')
+        run: pip install "numpy<2"
+
       - name: Show installed packages
         run: pip list

@@ -144,7 +161,7 @@ jobs:
       - name: Run tests
         run: pytest --durations=100

-  # cuda-aarch64-tests:
+  # test-cuda-aarch64:
   #   if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
   #   needs: build-cuda
   #   strategy:

@@ -167,7 +184,7 @@ jobs:



-  cuda-tests:
+  test-cuda:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda
     strategy:

@@ -179,7 +196,7 @@ jobs:
         cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
-            torch_version: "2.4.1"
+            torch_version: "2.2.2"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
             torch_version: "2.6.0"

@@ -188,18 +205,40 @@ jobs:
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"

-          # L40S runners
+
+          # Linux L40S runners
           - os: ubuntu-22.04
             gpu: L40S
             runner: bandb-aws-g6e-4xlarge-plus-use1-public-80

-          # T4 runners
+          # Linux T4 runners
           - os: ubuntu-22.04
             gpu: T4
             runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
+
+          # Specific Windows runners using cu118
           - os: windows-2025
+            arch: x86_64
             gpu: T4
             runner: CUDA-Windows-x64
+            cuda_version: "11.8.0"
+            torch_version: "2.2.0"
+            pypi_index: "https://download.pytorch.org/whl/cu118"
+          - os: windows-2025
+            arch: x86_64
+            gpu: T4
+            runner: CUDA-Windows-x64
+            cuda_version: "11.8.0"
+            torch_version: "2.6.0"
+            pypi_index: "https://download.pytorch.org/whl/cu118"
+          - os: windows-2025
+            arch: x86_64
+            gpu: T4
+            runner: CUDA-Windows-x64
+            cuda_version: "11.8.0"
+            torch_version: "2.7.0"
+            pypi_index: "https://download.pytorch.org/whl/cu118"
+
         exclude:
           # Our current T4 Windows runner has a driver too old (471.11)
           # and cannot support CUDA 12+. Skip for now.

@@ -238,6 +277,11 @@ jobs:
           pip install -e ".[test]"
           pip install pytest-cov

+      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+      - name: Downgrade NumPy
+        if: startsWith(matrix.torch_version, '2.2.')
+        run: pip install "numpy<2"
+
       - name: Show installed packages
         run: pip list
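The repeated `Downgrade NumPy` step exists because torch wheels older than 2.3 were built against the NumPy 1.x C ABI and can fail at import under NumPy 2. A minimal sketch of the same version gate for a local environment — not from the repository, and assuming the `packaging` package is available:

```python
# Hypothetical local check mirroring the CI's "numpy<2 for torch<2.3" rule.
from packaging.version import Version

import numpy
import torch

torch_ver = Version(torch.__version__.split("+")[0])  # drop local tags like +cpu / +cu118
if torch_ver < Version("2.3.0") and Version(numpy.__version__).major >= 2:
    raise RuntimeError("torch<2.3 requires numpy<2 -- run: pip install 'numpy<2'")
```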

README.md

Lines changed: 28 additions & 24 deletions

@@ -36,44 +36,45 @@ bitsandbytes has the following minimum requirements for all platforms:
   </thead>
   <tbody>
     <tr>
-      <td colspan="4">🐧 <strong>Linux</strong></td>
+      <td colspan="4">🐧 <strong>Linux, glibc >= 2.24</strong></td>
     </tr>
     <tr>
       <td align="right">x86-64</td>
       <td>◻️ CPU</td>
-      <td></td>
+      <td>AVX2</td>
       <td>〰️ Partial Support</td>
     </tr>
     <tr>
       <td></td>
-      <td>🟩 NVIDIA GPU</td>
+      <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
       <td>SM50+ minimum<br>SM75+ recommended</td>
-      <td>✅ Full Support *</td>
+      <td>✅ Full Support</td>
     </tr>
     <tr>
       <td></td>
-      <td>🟥 AMD GPU</td>
-      <td>gfx90a, gfx942, gfx1100</td>
+      <td>🟥 AMD GPU <br><code>cuda</code></td>
+      <td>
+        CDNA: gfx90a, gfx942<br>
+        RDNA: gfx1100, gfx1200
+      </td>
       <td>🚧 In Development</td>
     </tr>
     <tr>
       <td></td>
-      <td>🟦 Intel XPU</td>
+      <td>🟦 Intel GPU <br><code>xpu</code></td>
       <td>
-        Data Center GPU Max Series (Ponte Vecchio) <br>
-        Arc A-Series (Alchemist) <br>
+        Data Center GPU Max Series<br>
+        Arc A-Series (Alchemist)<br>
         Arc B-Series (Battlemage)
       </td>
       <td>🚧 In Development</td>
     </tr>
-    <!--
     <tr>
       <td></td>
-      <td>🟦 Intel HPU</td>
+      <td>🟪 Intel Gaudi <br><code>hpu</code></td>
       <td>Gaudi1, Gaudi2, Gaudi3</td>
-      <td>🚧</td>
+      <td>🚧 In Development</td>
     </tr>
-    --->
     <tr>
       <td align="right">aarch64</td>
       <td>◻️ CPU</td>

@@ -82,12 +83,12 @@ bitsandbytes has the following minimum requirements for all platforms:
     </tr>
     <tr>
       <td></td>
-      <td>🟩 NVIDIA GPU</td>
+      <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
       <td>SM75, SM80, SM90, SM100</td>
-      <td>✅ Full Support *</td>
+      <td>✅ Full Support</td>
     </tr>
     <tr>
-      <td colspan="4">🪟 <strong>Windows</strong></td>
+      <td colspan="4">🪟 <strong>Windows 11 / Windows Server 2019+</strong></td>
     </tr>
     <tr>
       <td align="right">x86-64</td>

@@ -97,33 +98,36 @@ bitsandbytes has the following minimum requirements for all platforms:
     </tr>
     <tr>
       <td></td>
-      <td>🟩 NVIDIA GPU</td>
+      <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
       <td>SM50+ minimum<br>SM75+ recommended</td>
-      <td>✅ Full Support *</td>
+      <td>✅ Full Support</td>
     </tr>
     <tr>
       <td></td>
-      <td>🟦 Intel XPU</td>
+      <td>🟦 Intel GPU <br><code>xpu</code></td>
       <td>
        Arc A-Series (Alchemist) <br>
        Arc B-Series (Battlemage)
       </td>
       <td>🚧 In Development</td>
     </tr>
     <tr>
-      <td colspan="4">🍎 <strong>macOS</strong></td>
+      <td colspan="4">🍎 <strong>macOS 13.1+</strong></td>
     </tr>
     <tr>
       <td align="right">arm64</td>
-      <td>◻️ CPU / Metal</td>
+      <td>◻️ CPU</td>
       <td>Apple M1+</td>
-      <td>❌ Under consideration</td>
+      <td>🛣️ Future Roadmap</td>
     </tr>
+    <tr>
+      <td></td>
+      <td>⬜ Metal <br><code>mps</code></td>
+      <td>Apple M1+</td>
+      <td>🛣️ Future Roadmap</td>
   </tbody>
 </table>
 
-\* Accelerated INT8 requires SM75+.
-
 ## :book: Documentation
 * [Official Documentation](https://huggingface.co/docs/bitsandbytes/main)
 * 🤗 [Transformers](https://huggingface.co/docs/transformers/quantization/bitsandbytes)

benchmarking/int8/row_scale_benchmark.py

Lines changed: 0 additions & 70 deletions
This file was deleted.

bitsandbytes/functional.py

Lines changed: 3 additions & 3 deletions

@@ -367,7 +367,7 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
     # these are additional items that come from the case
     # where all the exponent bits are zero and no
     # indicator bit is present
-    non_sign_bits = total_bits - (1 if signed else 1)
+    non_sign_bits = total_bits - 1
     additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1
     for i in range(max_exponent_bits):
         fraction_items = int(

@@ -771,14 +771,14 @@ def quantize_blockwise(
         qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
         quant_state = QuantState(
             absmax=qabsmax,
-            code=code,
+            code=code.to(A.device, copy=True),
             blocksize=blocksize,
             dtype=A.dtype,
             offset=offset,
             state2=state2,
         )
     else:
-        quant_state = QuantState(absmax=_absmax, code=code.to(A.device), blocksize=blocksize, dtype=A.dtype)
+        quant_state = QuantState(absmax=_absmax, code=code.to(A.device, copy=True), blocksize=blocksize, dtype=A.dtype)
 
     # TODO(matthewdouglas): Deprecate out kwarg
     out = out.copy_(_out) if out is not None else _out
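Two details worth noting here. First, `total_bits - (1 if signed else 1)` evaluates to `total_bits - 1` regardless of `signed`, so the new form only removes dead code. Second, `copy=True` matters because `Tensor.to()` returns the tensor itself when no device or dtype conversion is needed, so without it the returned `QuantState` would alias the shared codebook tensor. A minimal sketch of that aliasing behavior, using stand-in values rather than the library's actual codebook:

```python
import torch

code = torch.linspace(-1.0, 1.0, 256)  # stand-in for a shared quantization codebook

alias = code.to(code.device)            # same device: .to() returns `code` itself
assert alias.data_ptr() == code.data_ptr()

private = code.to(code.device, copy=True)  # copy=True always materializes a new tensor
assert private.data_ptr() != code.data_ptr()

private[0] = 123.0                      # mutating the copy...
assert code[0].item() != 123.0          # ...leaves the shared codebook untouched
```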

bitsandbytes/optim/optimizer.py

Lines changed: 5 additions & 5 deletions

@@ -303,9 +303,9 @@ def get_config(self, gindex, pindex, group):
         config["eps"] = group["eps"]
         config["weight_decay"] = group["weight_decay"]
         config["lr"] = group["lr"]
-        config["alpha"] = group.get("alpha")
-        config["t_alpha"] = group.get("t_alpha")
-        config["t_beta3"] = group.get("t_beta3")
+        config["alpha"] = group.get("alpha", 0.0)
+        config["t_alpha"] = group.get("t_alpha", 0)
+        config["t_beta3"] = group.get("t_beta3", 0)
         config["optim_bits"] = self.args.optim_bits
         config["min_8bit_size"] = self.args.min_8bit_size
         config["percentile_clipping"] = self.args.percentile_clipping

@@ -530,7 +530,7 @@ def update_step(self, group, p, gindex, pindex):
                 state["state2"],
                 config["betas"][1],
                 config["betas"][2] if len(config["betas"]) >= 3 else 0.0,
-                config["alpha"],
+                config.get("alpha", 0.0),
                 config["weight_decay"],
                 gnorm_scale,
                 state["unorm_vec"] if config["max_unorm"] > 0.0 else None,

@@ -575,7 +575,7 @@ def update_step(self, group, p, gindex, pindex):
                 config["betas"][0],
                 config["betas"][1],
                 config["betas"][2] if len(config["betas"]) >= 3 else 0.0,
-                config["alpha"],
+                config.get("alpha", 0.0),
                 config["eps"],
                 step,
                 config["lr"],
