Merge branch 'main' into main

jiqing-feng · web-flow · commit aa0cf92c3d88 · 2025-07-02T16:06:25.000+08:00
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -101,8 +101,8 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        # Test with the oldest supported torch version and the two newest.
-        torch_version: ["2.2.2", "2.6.0", "2.7.1"]
+        # Test with the oldest supported torch version and the two newest (stable or RC).
+        torch_version: ["2.2.2", "2.7.1", "2.8.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -144,7 +144,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
+          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.8.0' && 'test/cpu') || 'cpu' }}
           pip install -e ".[test]"
           pip install pytest-cov
 
@@ -372,7 +372,7 @@ jobs:
             pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
-            pypi_index: "https://download.pytorch.org/whl/nightly/cu129"
+            pypi_index: "https://download.pytorch.org/whl/test/cu129"
 
 
           # Linux L40S runners
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+include CMakeLists.txt
+graft csrc
+graft include
diff --git a/README.md b/README.md
@@ -71,11 +71,11 @@ bitsandbytes has the following minimum requirements for all platforms:
       <td>🟥 AMD GPU <br><code>cuda</code></td>
       <td>
         CDNA: gfx90a, gfx942<br>
-        RDNA: gfx1100, gfx1200
+        RDNA: gfx1100
       </td>
-      <td>🚧</td>
-      <td>🚧</td>
-      <td>🚧</td>
+      <td>✅</td>
+      <td>〰️</td>
+      <td>✅</td>
     </tr>
     <tr>
       <td></td>
@@ -85,8 +85,8 @@ bitsandbytes has the following minimum requirements for all platforms:
         Arc A-Series (Alchemist)<br>
         Arc B-Series (Battlemage)
       </td>
-      <td>🚧</td>
-      <td>🚧</td>
+      <td>✅</td>
+      <td>✅</td>
       <td>🚧</td>
     </tr>
     <tr>
@@ -108,7 +108,7 @@ bitsandbytes has the following minimum requirements for all platforms:
     <tr>
       <td></td>
       <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-      <td>SM75, SM80, SM90, SM100</td>
+      <td>SM75+</td>
       <td>✅</td>
       <td>✅</td>
       <td>✅</td>
@@ -139,8 +139,8 @@ bitsandbytes has the following minimum requirements for all platforms:
         Arc A-Series (Alchemist) <br>
         Arc B-Series (Battlemage)
       </td>
-      <td>🚧</td>
-      <td>🚧</td>
+      <td>✅</td>
+      <td>✅</td>
       <td>🚧</td>
     </tr>
     <tr>
diff --git a/bitsandbytes/backends/triton/ops.py b/bitsandbytes/backends/triton/ops.py
@@ -9,6 +9,8 @@
 # from bitsandbytes.functional import get_4bit_type
 # _FP4_QUANT_TABLE = get_4bit_type("fp4", device="xpu")
 # _NF4_QUANT_TABLE = get_4bit_type("nf4", device="xpu")
+device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+torch_accelerator_module = getattr(torch, device_type, torch.cuda)
 
 
 def quantize_blockwise(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
@@ -21,7 +23,9 @@ def quantize_blockwise(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> t
     absmax = torch.empty((blocks,), device=A.device, dtype=A.dtype)
     out = torch.empty_like(A.flatten(), dtype=torch.uint8)
 
-    triton_kernels.quantize_blockwise_triton(A, blocksize, code, blocks, absmax, out)
+    with torch_accelerator_module.device(A.device):
+        triton_kernels.quantize_blockwise_triton(A, blocksize, code, blocks, absmax, out)
+
     out = out.reshape(A.shape)
 
     return out, absmax.float()
@@ -35,13 +39,14 @@ def dequantize_blockwise(
     # torch._check(dtype == torch.float32, lambda: f"dtype must be float32 on xpu, got {dtype}")
 
     out = torch.empty_like(A, dtype=dtype, device=A.device)
-    triton_kernels.dequant_int8_blockwise(
-        A,
-        code,
-        absmax,
-        out,
-        blocksize,
-    )
+    with torch_accelerator_module.device(A.device):
+        triton_kernels.dequant_int8_blockwise(
+            A,
+            code,
+            absmax,
+            out,
+            blocksize,
+        )
 
     return out
 
@@ -55,13 +60,14 @@ def dequantize_blockwise_inplace(
     torch._check(out.device == A.device, lambda: f"Expected out.device == {A.device}, got {out.device}")
     torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
 
-    triton_kernels.dequant_int8_blockwise(
-        A,
-        code,
-        absmax,
-        out,
-        blocksize,
-    )
+    with torch_accelerator_module.device(A.device):
+        triton_kernels.dequant_int8_blockwise(
+            A,
+            code,
+            absmax,
+            out,
+            blocksize,
+        )
 
 
 def quantize_4bit(
@@ -84,9 +90,10 @@ def quantize_4bit(
     absmax = torch.empty((blocks * 2,), device=A.device, dtype=A.dtype)
     out = torch.empty((n // 2, 1), device=A.device, dtype=torch.uint8)
 
-    triton_kernels.quantize_4bit_blockwise_triton(
-        A, blocksize, quant_type, blocks, absmax, num_elements=n, quantized_out=out
-    )
+    with torch_accelerator_module.device(A.device):
+        triton_kernels.quantize_4bit_blockwise_triton(
+            A, blocksize, quant_type, blocks, absmax, num_elements=n, quantized_out=out
+        )
     packed = out
 
     if quant_storage != torch.uint8:
@@ -119,7 +126,9 @@ def dequantize_4bit(
 
     out = torch.empty(shape, dtype=dtype, device=A.device)
 
-    triton_kernels._dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+    with torch_accelerator_module.device(A.device):
+        triton_kernels._dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
     return out
 
 
@@ -134,7 +143,8 @@ def dequantize_4bit_inplace(
 ) -> None:
     torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
     torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
-    triton_kernels._dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+    with torch_accelerator_module.device(A.device):
+        triton_kernels._dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
 
 
 def gemv_4bit(
@@ -150,14 +160,15 @@ def gemv_4bit(
 
     B_dq_triton = torch.empty(shapeB, dtype=A.dtype, device=A.device)
 
-    triton_kernels._dequantize_4bit_impl_passing_code(
-        B,
-        absmax,
-        blocksize,
-        code,
-        dtype=A.dtype,
-        out=B_dq_triton,
-    )
+    with torch_accelerator_module.device(A.device):
+        triton_kernels._dequantize_4bit_impl_passing_code(
+            B,
+            absmax,
+            blocksize,
+            code,
+            dtype=A.dtype,
+            out=B_dq_triton,
+        )
 
     return torch.nn.functional.linear(
         A,
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
-requires = ["setuptools >= 63.0.0"]
-build-backend = "setuptools.build_meta"
+requires = ["scikit-build-core", "setuptools >= 63.0.0"]
+build-backend = "scikit_build_core.setuptools.build_meta"
 
 [project]
 name = "bitsandbytes"
diff --git a/setup.py b/setup.py
@@ -2,7 +2,11 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+from distutils.errors import DistutilsModuleError
+from warnings import warn
+
 from setuptools import find_packages, setup
+from setuptools.command.build_py import build_py
 from setuptools.dist import Distribution
 
 
@@ -12,4 +16,26 @@ def has_ext_modules(self):
         return True
 
 
-setup(version="0.47.0.dev0", packages=find_packages(), distclass=BinaryDistribution)
+class ExtBuildPy(build_py):
+    def run(self):
+        # build_cmake needs to be called prior to build_py, as the latter
+        # collects the files output into the package directory.
+        try:
+            self.run_command("build_cmake")
+        except DistutilsModuleError:
+            warn(
+                "scikit-build-core not installed, CMake will not be invoked automatically. "
+                "Please install scikit-build-core or run CMake manually to build extensions."
+            )
+        super().run()
+
+
+setup(
+    version="0.47.0.dev0",
+    packages=find_packages(),
+    distclass=BinaryDistribution,
+    cmake_source_dir=".",
+    cmdclass={
+        "build_py": ExtBuildPy,
+    },
+)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+include CMakeLists.txt`
	`2`	`+graft csrc`
	`3`	`+graft include`