                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
+                Note: This parameter is not supported in AdamW8bit and must be False.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
+                Note: This parameter is not used in AdamW8bit as it always uses 8-bit optimization.
             args (`object`, defaults to `None`):
                 An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
@@ -111,14 +114,23 @@ def __init__(
             is_paged (`bool`, defaults to `False`):
                 Whether the optimizer is a paged optimizer or not.
         """
+        # Validate unsupported parameters
+        if amsgrad:
+            raise ValueError("AdamW8bit does not support amsgrad=True")
+
+        if optim_bits != 32:
+            # We allow the default value of 32 to maintain compatibility with the function signature,
+            # but any other value is invalid since AdamW8bit always uses 8-bit optimization
+            raise ValueError("AdamW8bit only supports optim_bits=32 (default value for compatibility)")
+
         super().__init__(
             "adam",
             params,
             lr,
             betas,
             eps,
             weight_decay,
-            8,
+            8,  # Hardcoded to 8 bits
             args,
             min_8bit_size,
             percentile_clipping,
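For reference, a minimal usage sketch of how the validation added above behaves once the patch is applied. It assumes the edited file is bitsandbytes' optim/adamw.py, so the class is reachable as bitsandbytes.optim.AdamW8bit; the tiny model exists only to supply parameters and is not part of this diff.

import torch
import bitsandbytes as bnb

model = torch.nn.Linear(128, 64)

# The defaults (amsgrad=False, optim_bits=32) still construct normally.
optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=1e-3)

# Unsupported settings now fail fast instead of being silently ignored.
try:
    bnb.optim.AdamW8bit(model.parameters(), lr=1e-3, amsgrad=True)
except ValueError as err:
    print(err)  # AdamW8bit does not support amsgrad=True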
@@ -147,7 +159,7 @@ def __init__(
         32-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -207,7 +219,7 @@ def __init__(
         Paged AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -229,8 +241,6 @@ def __init__(
                 Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
             block_wise (`bool`, defaults to `True`):
                 Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
-            is_paged (`bool`, defaults to `False`):
-                Whether the optimizer is a paged optimizer or not.
         """
         super().__init__(
             "adam",
@@ -267,7 +277,7 @@ def __init__(
         Paged 8-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -279,8 +289,10 @@ def __init__(
                 The weight decay value for the optimizer.
             amsgrad (`bool`, defaults to `False`):
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
+                Note: This parameter is not supported in PagedAdamW8bit and must be False.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
+                Note: This parameter is not used in PagedAdamW8bit as it always uses 8-bit optimization.
             args (`object`, defaults to `None`):
                 An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
@@ -289,17 +301,24 @@ def __init__(
                 Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
             block_wise (`bool`, defaults to `True`):
                 Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
-            is_paged (`bool`, defaults to `False`):
-                Whether the optimizer is a paged optimizer or not.
         """
+        # Validate unsupported parameters
+        if amsgrad:
+            raise ValueError("PagedAdamW8bit does not support amsgrad=True")
+
+        if optim_bits != 32:
+            # We allow the default value of 32 to maintain compatibility with the function signature,
+            # but any other value is invalid since PagedAdamW8bit always uses 8-bit optimization
+            raise ValueError("PagedAdamW8bit only supports optim_bits=32 (default value for compatibility)")
+
         super().__init__(
             "adam",
             params,
             lr,
             betas,
             eps,
             weight_decay,
-            8,
+            8,  # Hardcoded to 8 bits
             args,
             min_8bit_size,
             percentile_clipping,
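The same checks apply to PagedAdamW8bit, so a pytest-style regression test could cover both classes at once. This is only a sketch: the test names and the bitsandbytes import path are assumptions, since no test file appears in this diff.

import pytest
import torch
import bitsandbytes as bnb


@pytest.mark.parametrize("cls", [bnb.optim.AdamW8bit, bnb.optim.PagedAdamW8bit])
def test_amsgrad_rejected(cls):
    params = [torch.nn.Parameter(torch.zeros(16))]
    # Both 8-bit variants refuse amsgrad=True after this patch.
    with pytest.raises(ValueError):
        cls(params, amsgrad=True)


@pytest.mark.parametrize("cls", [bnb.optim.AdamW8bit, bnb.optim.PagedAdamW8bit])
def test_optim_bits_must_stay_default(cls):
    params = [torch.nn.Parameter(torch.zeros(16))]
    # Any value other than the signature default of 32 is rejected, because
    # the optimizer state is always kept in 8 bits regardless of this argument.
    with pytest.raises(ValueError):
        cls(params, optim_bits=8)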
@@ -327,7 +346,7 @@ def __init__(
         Paged 32-bit AdamW optimizer.
 
         Arguments:
-            params (`torch.tensor`):
+            params (`torch.Tensor`):
                 The input parameters to optimize.
             lr (`float`, defaults to 1e-3):
                 The learning rate.
@@ -349,8 +368,6 @@ def __init__(
                 Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
             block_wise (`bool`, defaults to `True`):
                 Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
-            is_paged (`bool`, defaults to `False`):
-                Whether the optimizer is a paged optimizer or not.