Commit 50ee994

Merge branch 'main' into absmax
2 parents 8799041 + d9333aa

4 files changed, +56 −38 lines changed

README.md

Lines changed: 49 additions & 13 deletions
@@ -25,13 +25,25 @@ bitsandbytes has the following minimum requirements for all platforms:
 
 #### Accelerator support:
 
+<small>Note: this table reflects the status of the current development branch. For the latest stable release, see the
+[document in the v0.46.0 tag](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/0.46.0/README.md#accelerator-support).
+</small>
+
+##### Legend:
+🚧 = In Development,
+〰️ = Partially Supported,
+✅ = Supported,
+❌ = Not Supported
+
 <table>
 <thead>
 <tr>
 <th>Platform</th>
 <th>Accelerator</th>
 <th>Hardware Requirements</th>
-<th>Support Status</th>
+<th>LLM.int8()</th>
+<th>QLoRA 4-bit</th>
+<th>8-bit Optimizers</th>
 </tr>
 </thead>
 <tbody>
@@ -42,13 +54,17 @@ bitsandbytes has the following minimum requirements for all platforms:
 <td align="right">x86-64</td>
 <td>◻️ CPU</td>
 <td>AVX2</td>
-<td>〰️ Partial Support</td>
+<td>〰️</td>
+<td>〰️</td>
+<td>❌</td>
 </tr>
 <tr>
 <td></td>
 <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
 <td>SM50+ minimum<br>SM75+ recommended</td>
-<td>✅ Full Support</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
 </tr>
 <tr>
 <td></td>
@@ -57,7 +73,9 @@ bitsandbytes has the following minimum requirements for all platforms:
 CDNA: gfx90a, gfx942<br>
 RDNA: gfx1100, gfx1200
 </td>
-<td>🚧 In Development</td>
+<td>🚧</td>
+<td>🚧</td>
+<td>🚧</td>
 </tr>
 <tr>
 <td></td>
@@ -67,25 +85,33 @@ bitsandbytes has the following minimum requirements for all platforms:
 Arc A-Series (Alchemist)<br>
 Arc B-Series (Battlemage)
 </td>
-<td>🚧 In Development</td>
+<td>🚧</td>
+<td>🚧</td>
+<td>🚧</td>
 </tr>
 <tr>
 <td></td>
 <td>🟪 Intel Gaudi <br><code>hpu</code></td>
 <td>Gaudi1, Gaudi2, Gaudi3</td>
-<td>🚧 In Development</td>
+<td>🚧</td>
+<td>🚧</td>
+<td>❌</td>
 </tr>
 <tr>
 <td align="right">aarch64</td>
 <td>◻️ CPU</td>
 <td></td>
-<td>〰️ Partial Support</td>
+<td>〰️</td>
+<td>〰️</td>
+<td>❌</td>
 </tr>
 <tr>
 <td></td>
 <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
 <td>SM75, SM80, SM90, SM100</td>
-<td>✅ Full Support</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
 </tr>
 <tr>
 <td colspan="4">🪟 <strong>Windows 11 / Windows Server 2019+</strong></td>
@@ -94,13 +120,17 @@ bitsandbytes has the following minimum requirements for all platforms:
 <td align="right">x86-64</td>
 <td>◻️ CPU</td>
 <td>AVX2</td>
-<td>〰️ Partial Support</td>
+<td>〰️</td>
+<td>〰️</td>
+<td>❌</td>
 </tr>
 <tr>
 <td></td>
 <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
 <td>SM50+ minimum<br>SM75+ recommended</td>
-<td>✅ Full Support</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
 </tr>
 <tr>
 <td></td>
@@ -109,7 +139,9 @@ bitsandbytes has the following minimum requirements for all platforms:
 Arc A-Series (Alchemist) <br>
 Arc B-Series (Battlemage)
 </td>
-<td>🚧 In Development</td>
+<td>🚧</td>
+<td>🚧</td>
+<td>🚧</td>
 </tr>
 <tr>
 <td colspan="4">🍎 <strong>macOS 13.1+</strong></td>
@@ -118,13 +150,17 @@ bitsandbytes has the following minimum requirements for all platforms:
 <td align="right">arm64</td>
 <td>◻️ CPU</td>
 <td>Apple M1+</td>
-<td>🚧 In Development</td>
+<td>🚧</td>
+<td>🚧</td>
+<td>❌</td>
 </tr>
 <tr>
 <td></td>
 <td>⬜ Metal <br><code>mps</code></td>
 <td>Apple M1+</td>
-<td>🚧 In Development</td>
+<td>🚧</td>
+<td>🚧</td>
+<td>❌</td>
 </tbody>
 </table>

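The three new columns map each platform to the library's three feature families. A minimal smoke-test sketch for one cell of the table, assuming a CUDA build of bitsandbytes (the layer and optimizer classes below are the library's public API; `python -m bitsandbytes` prints fuller environment diagnostics):

    import torch
    import bitsandbytes as bnb

    # LLM.int8() column: 8-bit linear layer, weights quantized on .cuda()
    linear8 = bnb.nn.Linear8bitLt(64, 64, has_fp16_weights=False).cuda()
    # QLoRA 4-bit column: NF4-quantized linear layer
    linear4 = bnb.nn.Linear4bit(64, 64, quant_type="nf4").cuda()
    # 8-bit Optimizers column: blockwise 8-bit Adam over ordinary parameters
    opt = bnb.optim.Adam8bit(torch.nn.Linear(64, 64).cuda().parameters())

    x = torch.randn(1, 64, device="cuda", dtype=torch.float16)
    linear8(x)  # should raise on unsupported platforms rather than silently degrade
    linear4(x)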
bitsandbytes/nn/modules.py

Lines changed: 2 additions & 9 deletions
@@ -291,13 +291,6 @@ def from_prequantized(
 
         return self
 
-    @classmethod
-    def __torch_function__(cls, func, types, args=(), kwargs=None):
-        if kwargs is None:
-            kwargs = {}
-        with torch._C.DisableTorchFunctionSubclass():
-            return func(*args, **kwargs)
-
     def _quantize(self, device):
         w = self.data.contiguous().to(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(
@@ -455,14 +448,14 @@ def set_compute_type(self, x):
             self.compute_dtype = x.dtype
         elif x.dtype == torch.float16:
             # we take the compute dtype passed into the layer
-            if self.compute_dtype == torch.float32 and (x.numel() == x.shape[-1]):
+            if self.compute_dtype in [None, torch.float32] and (x.numel() == x.shape[-1]):
                 # single batch inference with input torch.float16 and compute_dtype float32 -> slow inference when it could be fast
                 # warn the user about this
                 warnings.warn(
                     "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.",
                 )
                 warnings.filterwarnings("ignore", message=".*inference.")
-            if self.compute_dtype == torch.float32 and (x.numel() != x.shape[-1]):
+            if self.compute_dtype in [None, torch.float32] and (x.numel() != x.shape[-1]):
                 warnings.warn(
                     "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.",
                 )

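Two changes here: dropping the `__torch_function__` override restores PyTorch's default tensor-subclass dispatch for the 4-bit parameter class, and the broadened `in [None, torch.float32]` guard means the slow-inference warning now also fires for the default configuration, since `compute_dtype` is `None` until a caller sets it. A minimal sketch of the newly covered default path, assuming a CUDA device:

    import warnings

    import torch
    import bitsandbytes as bnb

    # compute_dtype is left at its default (None); previously this path
    # skipped the warning because only an explicit torch.float32 matched.
    layer = bnb.nn.Linear4bit(64, 64, quant_type="nf4").cuda()
    x = torch.rand(1, 64, device="cuda", dtype=torch.float16)  # single-batch fp16 input

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        layer(x)  # the first forward pass calls set_compute_type()

    assert any("slow inference" in str(w.message) for w in caught)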
tests/test_linear4bit.py

Lines changed: 1 addition & 4 deletions
@@ -270,10 +270,7 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_s
 @pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
 @pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
 def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
-    if device == "cpu" and quant_type == "fp4":
-        pytest.skip("FP4 is not supported for CPU")
-
-    if fullgraph and torch.__version__ < (2, 8):
+    if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
         pytest.skip("fullgraph mode requires torch 2.8 or higher")
 
     if device == "cuda" and platform.system() == "Windows":

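The version-guard change matters for nightly builds: `torch.__version__` is a PEP 440-style version, and a pre-release such as `2.8.0.devNNNNNNNN` sorts before `2.8`, so the old `< (2, 8)` check skipped nightlies that already support fullgraph compilation. A hedged illustration using `packaging` (torch's `TorchVersion` follows the same ordering; the date below is made up):

    from packaging.version import Version

    nightly = Version("2.8.0.dev20250601")   # hypothetical nightly version string
    print(nightly < Version("2.8"))          # True:  old guard would skip this build
    print(nightly < Version("2.8.0.dev"))    # False: new guard lets the test run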
tests/test_modules.py

Lines changed: 4 additions & 12 deletions
@@ -440,31 +440,23 @@ def test_4bit_linear_warnings(device):
     dim1 = 64
 
     with pytest.warns(UserWarning, match=r"inference or training"):
-        net = nn.Sequential(
-            *[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4", compute_dtype=torch.float32) for i in range(10)]
-        )
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
         net = net.to(device)
         inp = torch.rand(10, dim1, device=device, dtype=torch.float16)
         net(inp)
     with pytest.warns(UserWarning, match=r"inference."):
-        net = nn.Sequential(
-            *[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4", compute_dtype=torch.float32) for i in range(10)]
-        )
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
         net = net.to(device)
         inp = torch.rand(1, dim1, device=device, dtype=torch.float16)
         net(inp)
 
     with pytest.warns(UserWarning) as record:
-        net = nn.Sequential(
-            *[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4", compute_dtype=torch.float32) for i in range(10)]
-        )
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
         net = net.to(device)
         inp = torch.rand(10, dim1, device=device, dtype=torch.float16)
         net(inp)
 
-        net = nn.Sequential(
-            *[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4", compute_dtype=torch.float32) for i in range(10)]
-        )
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
         net = net.to(device)
         inp = torch.rand(1, dim1, device=device, dtype=torch.float16)
         net(inp)

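These tests previously passed compute_dtype=torch.float32 explicitly to provoke the warnings; with the broadened check in set_compute_type above, the default compute_dtype=None now triggers the same warnings, so the tests exercise the layer's out-of-the-box configuration.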