
Commit d8bd0b3

Merge branch 'cpu-ops' of https://github.com/TimDettmers/bitsandbytes into cpu-ops
2 parents 9198900 + 958fecb commit d8bd0b3

27 files changed, +603 −631 lines

.github/scripts/build-cuda.sh

Lines changed: 2 additions & 5 deletions
@@ -8,11 +8,8 @@ set -xeuo pipefail
 # By default, target Maxwell through Hopper.
 build_capability="50;52;60;61;70;75;80;86;89;90"
 
-# CUDA 11.7: Remove sm89 and sm90
-[[ "${cuda_version}" == 11.7.* ]] && build_capability="50;52;60;61;70;75;80;86"
-
-# CUDA 12.8: Add sm100 and sm120; remove sm50 through sm61
-[[ "${cuda_version}" == 12.8.* ]] && build_capability="70;75;80;86;89;90;100;120"
+# CUDA 12.8: Add sm100 and sm120; remove < sm75 to align with PyTorch 2.7+cu128 minimum
+[[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;86;89;90;100;120"
 
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
 
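The capability lists above name the SM targets compiled into each CUDA build. Below is a minimal sketch (not part of this commit) that checks whether a local GPU is covered by the cu128 target list; the target set is copied from build-cuda.sh, the rest is illustrative and ignores PTX forward compatibility.

import torch

# SM targets for the CUDA 12.8 build, taken from build_capability above.
cu128_targets = {75, 80, 86, 89, 90, 100, 120}

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    sm = major * 10 + minor
    print(f"Detected sm{sm}; in cu128 build targets: {sm in cu128_targets}")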

.github/workflows/python-package.yml

Lines changed: 3 additions & 33 deletions
@@ -40,7 +40,7 @@ jobs:
             arch: aarch64
           - os: ubuntu-22.04 # Temporary. Takes too long, not ready yet.
             arch: aarch64
-    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - name: Setup MSVC
@@ -70,13 +70,13 @@ jobs:
           - windows-latest
         arch: [x86_64, aarch64]
         cuda_version:
-          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
            arch: aarch64
          - os: ubuntu-22.04 # Temporary. Takes too long, not ready yet.
            arch: aarch64
-    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation)
@@ -250,33 +250,3 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           print-hash: true
-
-  # test:
-  #   needs:
-  #     - build-wheels
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - os: ubuntu-latest
-  #           arch: x86_64
-  #           python-version: "3.8"
-  #         - os: windows-latest
-  #           arch: x86_64
-  #           python-version: "3.8"
-  #   runs-on: ${{ matrix.os }}
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #     - uses: actions/download-artifact@v4
-  #       with:
-  #         merge-multiple: true
-  #         pattern: "bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}*"
-  #         path: wheel/
-  #     - uses: actions/setup-python@v5
-  #       with:
-  #         python-version: ${{ matrix.python-version }}
-  #         cache: pip
-  #     - shell: bash
-  #       run: ls -lar wheel/
-  #     - run: pip install wheel/*.whl -r requirements-ci.txt
-  #     - run: pytest --log-cli-level=DEBUG --continue-on-collection-errors tests

CMakeLists.txt

Lines changed: 7 additions & 22 deletions
@@ -93,8 +93,8 @@ if(BUILD_CUDA)
   )
 endif()
 
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
-  message(FATAL_ERROR "CUDA Version < 11 is not supported")
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
+  message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
 elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
   message(FATAL_ERROR "CUDA Version > 12 is not supported")
 endif()
@@ -103,35 +103,20 @@ if(BUILD_CUDA)
 if(CMAKE_VERSION VERSION_LESS "3.23.0")
   message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
-  # 11.x and 12.x both support these at a minimum.
-  set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80)
+  # 11.4+ supports these at a minimum.
+  set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
   set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
 
-  # CUDA 11.1 adds Ampere support for GA102-GA107.
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
-  endif()
-
-  # CUDA 11.4 adds Ampere support for GA10B.
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
-  endif()
-
   # CUDA 11.8 adds support for Ada and Hopper.
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
     list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
     list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
   endif()
-  # CUDA 12.7 adds support for Blackwell B100.
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.7")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100)
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100)
-  endif()
 
-  # CUDA 12.8 adds support for RTX 50 Blackwell.
+  # CUDA 12.8 adds support for Blackwell.
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 101 120)
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 101 120)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
   endif()
 endif()
 

bitsandbytes/__init__.py

Lines changed: 33 additions & 7 deletions
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import sys
+
 import torch
 
 from . import _ops, research, utils
@@ -18,24 +20,48 @@
 from .optim import adam
 
 # This is a signal for integrations with transformers/diffusers.
-# Eventually, we will remove this and check based on release version.
+# Eventually we may remove this but it is currently required for compatibility.
 features = {"multi-backend"}
 supported_torch_devices = {
-    "cuda",
     "cpu",
-    # "mps",
-    # "xpu",
-    # "hpu",
-    # "npu",
+    "cuda",  # NVIDIA/AMD GPU
+    "xpu",  # Intel GPU
+    "hpu",  # Gaudi
+    "npu",  # Ascend NPU
+    "mps",  # Apple Silicon
 }
 
 if torch.cuda.is_available():
     from .backends.cuda import ops as cuda_ops
 
+
+def _import_backends():
+    """
+    Discover and autoload all available backends installed as separate packages.
+    Packages with an entrypoint for "bitsandbytes.backends" will be loaded.
+    Inspired by PyTorch implementation: https://pytorch.org/tutorials/prototype/python_extension_autoload.html
+    """
+    from importlib.metadata import entry_points
+
+    if sys.version_info < (3, 10):
+        extensions = entry_points().get("bitsandbytes.backends", [])
+    else:
+        extensions = entry_points(group="bitsandbytes.backends")
+
+    for ext in extensions:
+        try:
+            entry = ext.load()
+            entry()
+        except Exception as e:
+            raise RuntimeError(f"bitsandbytes: failed to load backend {ext.name}: {e}") from e
+
+
+_import_backends()
+
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.45.5.dev0"
+__version__ = "0.46.0.dev0"

bitsandbytes/_ops.py

Lines changed: 33 additions & 7 deletions
@@ -15,11 +15,37 @@
 register_fake = torch.library.impl_abstract
 register_kernel = torch.library.impl
 
+# Int8 mixed precision matmul + dequant + bias
+torch.library.define(
+    "bitsandbytes::int8_mixed_scaled_mm",
+    "(Tensor A, Tensor CA, Tensor CB, Tensor SCA, Tensor SCB, Tensor? outlier_cols=None, Tensor? bias=None) -> (Tensor, Tensor?)",
+)
+
+
+@register_fake("bitsandbytes::int8_mixed_scaled_mm")
+def _(
+    A: torch.Tensor,
+    CA: torch.Tensor,
+    CB: torch.Tensor,
+    SCA: torch.Tensor,
+    SCB: torch.Tensor,
+    outlier_cols: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    shapeC = (*CA.shape[:-1], CB.shape[0])
+
+    out = torch.empty(shapeC, device=A.device, dtype=A.dtype)
+
+    outlier_cols = torch.library.get_ctx().new_dynamic_size()
+    subA = A.new_empty(outlier_cols, dtype=torch.int64)
+
+    return out, subA
+
 
 # Higher level op: int8 matmul + dequant + bias
 torch.library.define(
     "bitsandbytes::int8_scaled_mm",
-    "(Tensor A, Tensor B, Tensor row_stats, Tensor col_stats, Tensor? bias=None, ScalarType dtype=float16) -> Tensor",
+    "(Tensor A, Tensor B, Tensor row_stats, Tensor col_stats, Tensor? bias=None, ScalarType? dtype=None) -> Tensor",
 )
 
 
@@ -30,10 +56,10 @@ def _(
     row_stats: torch.Tensor,
     col_stats: torch.Tensor,
     bias: Optional[torch.Tensor] = None,
-    dtype=torch.float16,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
     shapeC = (*A.shape[:-1], B.shape[0])
-    return torch.empty(shapeC, device=A.device, dtype=dtype)
+    return torch.empty(shapeC, device=A.device, dtype=dtype or torch.float16)
 
 
 torch.library.define(
@@ -98,15 +124,15 @@ def _(A: torch.Tensor, stats: torch.Tensor) -> torch.Tensor:
 
 
 # Default PyTorch-native implementation
-@register_kernel("bitsandbytes::int8_vectorwise_dequant", None)
+@register_kernel("bitsandbytes::int8_vectorwise_dequant", "default")
 def _(A: torch.Tensor, stats: torch.Tensor):
     # To dequantize we divide by 127, or multiply by the reciprocal.
     return A * stats.view(-1, 1) * 7.874015718698502e-3
 
 
 torch.library.define(
     "bitsandbytes::int8_mm_dequant",
-    "(Tensor A, Tensor row_stats, Tensor col_stats, ScalarType dtype=float16, Tensor? bias=None) -> Tensor",
+    "(Tensor A, Tensor row_stats, Tensor col_stats, ScalarType? dtype=None, Tensor? bias=None) -> Tensor",
 )
 
 
@@ -115,11 +141,11 @@ def _(
     A: torch.Tensor,
     row_stats: torch.Tensor,
     col_stats: torch.Tensor,
-    dtype=torch.float16,
+    dtype: Optional[torch.dtype] = None,
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     torch._check(A.dtype == torch.int32, lambda: "A must be int32")
-    return torch.empty_like(A, dtype=dtype)
+    return torch.empty_like(A, dtype=dtype or torch.float16)
 
 
 torch.library.define(
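For the PyTorch-native int8_vectorwise_dequant kernel above, here is a small self-contained check (not from the diff) that multiplying by the hard-coded reciprocal 7.874015718698502e-3 matches dividing by 127 to within floating-point tolerance.

import torch

A = torch.randint(-127, 128, (4, 8), dtype=torch.int8)
stats = torch.rand(4, dtype=torch.float32) * 12.0  # stand-in per-row absmax stats

by_division = A.float() * stats.view(-1, 1) / 127.0
by_reciprocal = A * stats.view(-1, 1) * 7.874015718698502e-3  # same math as the kernel

print(torch.allclose(by_division, by_reciprocal))  # True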

bitsandbytes/autograd/_functions.py

Lines changed: 17 additions & 26 deletions
@@ -210,37 +210,28 @@ def forward(
         # 2. Quantize B
         state.CB, state.SCB, _ = F.int8_vectorwise_quant(B.to(torch.float16))
 
-        # Handle sparse decomposition. In some instances, we may have not found any
-        # outlier columns at all. In that case, we'll skip this part completely.
-        if state.threshold > 0.0 and outlier_cols is not None and outlier_cols.numel():
+        # Handle sparse decomposition
+        if state.threshold > 0.0:
             state.idx = outlier_cols
 
-            # Zero out the outliers in the transposed 8bit inputs.
-            if CAt is not None:
-                CAt[:, state.idx] = 0
-
-            # Extract the input outliers in original precision
-            subA = A[:, state.idx].contiguous()
+            # Mixed Int8 Matmul + Dequant + Bias
+            output, subA = torch.ops.bitsandbytes.int8_mixed_scaled_mm(
+                A,
+                CA,
+                state.CB,
+                SCA,
+                state.SCB,
+                outlier_cols,
+                bias,
+            )
 
-            # Extract the corresponding weights
-            if state.has_fp16_weights:
-                state.subB = B[:, state.idx].t()
-            else:
-                # To dequantize our weights associated with the input outliers,
-                # we want to divide by 127. It's however more performant to multiply
-                # by the reciprocal.
-                outliers = state.CB[:, state.idx]
-                state.subB = F.int8_vectorwise_dequant(outliers, state.SCB).to(A.dtype).t()
         else:
+            # Int8 Matmul + Dequant + Bias
+            output = torch.ops.bitsandbytes.int8_scaled_mm.default(
+                CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype
+            )
             subA = None
 
-        # 3. Int8 Matmul + Dequant + Bias
-        output = torch.ops.bitsandbytes.int8_scaled_mm.default(CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype)
-
-        # 4. Mixed-precision decomposition matmul
-        if subA is not None and state.subB is not None:
-            output = output.addmm(subA, state.subB)
-
         # 5. Save state
         ctx.state = state
 
@@ -293,7 +284,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
                 dtype=torch.float16,
             )
 
-        if state.threshold > 0.0 and subA is not None:
+        if state.threshold > 0.0 and subA is not None and subA.numel() > 0:
             grad_B[:, idx] += torch.matmul(grad_output.t(), subA)
 
         if req_gradA:
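The rewritten branch above is driven entirely by state.threshold: a non-zero threshold now routes through the fused int8_mixed_scaled_mm op, while a zero threshold keeps the plain int8_scaled_mm path. The following usage sketch is not part of this commit; the shapes and threshold value are illustrative, and the Linear8bitLt arguments follow the existing bitsandbytes API.

import torch
import bitsandbytes as bnb

# An int8 linear layer with outlier decomposition enabled (threshold > 0),
# so its forward pass takes the int8_mixed_scaled_mm branch shown above.
fp16_linear = torch.nn.Linear(4096, 4096, bias=True, dtype=torch.float16)

int8_linear = bnb.nn.Linear8bitLt(4096, 4096, bias=True, has_fp16_weights=False, threshold=6.0)
int8_linear.load_state_dict(fp16_linear.state_dict())
int8_linear = int8_linear.cuda()  # weights are quantized to int8 on the move to GPU

x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")
y = int8_linear(x)  # dispatches into MatMul8bitLt.forward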

bitsandbytes/backends/cpu/ops.py

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ def _(
     A: torch.Tensor,
     row_stats: torch.Tensor,
     col_stats: torch.Tensor,
-    dtype=torch.float16,
+    dtype: Optional[torch.dtype] = None,
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
@@ -44,7 +44,7 @@ def _(
     if bias is not None:
         out += bias
 
-    return out.to(dtype)
+    return out.to(dtype or torch.float16)
 
 
 @register_kernel("bitsandbytes::quantize_blockwise", "cpu")
