Merge branch 'main' into ipex

jiqing-feng · web-flow · commit 1a7794936008 · 2025-05-14T09:28:59.000+08:00
diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh
@@ -24,13 +24,16 @@ fi
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
 
 if [ "${build_os:0:6}" == ubuntu ]; then
-    image=nvidia/cuda:${cuda_version}-devel-ubuntu22.04
+    # We'll use Rocky Linux 8 in order to maintain manylinux 2.24 compatibility.
+    image="nvidia/cuda:${cuda_version}-devel-rockylinux8"
     echo "Using image $image"
-    docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \
-        "apt-get update \
-    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-    && cmake -DPTXAS_VERBOSE=1 -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" . \
-    && cmake --build ."
+
+    docker run -i -w /src -v "$PWD:/src" "$image" bash -c \
+        "dnf update -y \
+        && dnf install cmake gcc-toolset-11 -y \
+        && source scl_source enable gcc-toolset-11 \
+        && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" . \
+        && cmake --build . --config Release"
 else
     pip install cmake==3.28.3
     cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DCMAKE_BUILD_TYPE=Release -S .
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -15,14 +15,16 @@ jobs:
   build-cpu:
     strategy:
       matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         include:
           - os: ubuntu-22.04
             arch: x86_64
           - os: ubuntu-22.04-arm
             arch: aarch64
           - os: windows-2025
             arch: x86_64
+          - os: macos-15
+            arch: arm64
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
@@ -97,7 +99,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         torch_version: ["2.7.0"]
         include:
           - os: ubuntu-22.04
@@ -106,6 +108,8 @@ jobs:
             arch: aarch64
           - os: windows-2025
             arch: x86_64
+          - os: macos-15
+            arch: arm64
     runs-on: ${{ matrix.os }}
     env:
       BNB_TEST_DEVICE: cpu
diff --git a/tests/test_functional.py b/tests/test_functional.py
@@ -94,15 +94,22 @@ class Test8BitBlockwiseQuantizeFunctional:
     @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
     @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
     def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
-        if device in ("cpu", "xpu"):
+        iters = 100
+
+        if device == "cpu":
+            iters = 10
+
+            # This test is slow on CPU, so avoid atypical use cases.
+            if nested:
+                pytest.skip("Not a typical use case.")
             if blocksize != 256:
                 pytest.skip("Only blocksize 256 is used in CPU/XPU")
             if dtype != torch.float32:
                 pytest.skip("Only float32 is used in CPU/XPU")
 
         diffs = []
         reldiffs = []
-        for i in range(100):
+        for i in range(iters):
             A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
             C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested)
             A2 = F.dequantize_blockwise(C, S)
@@ -112,15 +119,13 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
             reldiffs.append(reldiff.mean().item())
         abserr = sum(diffs) / len(diffs)
         relerr = sum(reldiffs) / len(reldiffs)
-        # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs))
-        # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs))
         assert abserr < 0.011
         assert relerr < 0.018
         assert A2.dtype == dtype
 
         diffs = []
         code = F.create_dynamic_map(signed=signed)
-        for i in range(100):
+        for i in range(iters):
             A1 = torch.rand(1024, 1024, device=device, dtype=dtype)
             C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code)
             A2 = F.dequantize_blockwise(C, S)
@@ -139,33 +144,29 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
             assert abserr < 0.00175 if device in ("cpu", "xpu") else 0.0023
             assert relerr < 0.012
         assert A2.dtype == dtype
-        # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs))
-        # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs))
-
-    @pytest.mark.parametrize("device", get_available_devices())
-    def test_blockwise_cpu_large(self, device):
-        if device == "xpu":
-            pytest.skip("XPU will not build CPU C++ codes")
 
+    @pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
+    @pytest.mark.parametrize("hidden", [128])
+    @pytest.mark.parametrize("blocksize", [4096, 16384])
+    def test_blockwise_cpu_large(self, hidden, blocksize):
         diffs = []
         reldiffs = []
         batch = 128
         seq = 128
-        for hidden in [128]:  # , 14336]:
-            for blocksize in [4096, 16384]:
-                for i in range(2):
-                    A1 = torch.randn(batch, seq, hidden, device="cpu")
-                    t0 = time.time()
-                    C, S = F.quantize_blockwise(A1, blocksize=blocksize)
-                    A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
-                    print(time.time() - t0)
-                    diff = torch.abs(A1 - A2)
-                    reldiff = diff / torch.abs(A1 + 1e-8)
-                    diffs.append(diff.mean().item())
-                    reldiffs.append(reldiff.mean().item())
-                    assert diffs[-1] < 0.011
-                # print(sum(diffs)/len(diffs))
-                # print(sum(reldiffs)/len(reldiffs))
+
+        for i in range(2):
+            A1 = torch.randn(batch, seq, hidden, device="cpu")
+            t0 = time.time()
+            C, S = F.quantize_blockwise(A1, blocksize=blocksize)
+            A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
+            print(time.time() - t0)
+            diff = torch.abs(A1 - A2)
+            reldiff = diff / torch.abs(A1 + 1e-8)
+            diffs.append(diff.mean().item())
+            reldiffs.append(reldiff.mean().item())
+            assert diffs[-1] < 0.011
+        # print(sum(diffs)/len(diffs))
+        # print(sum(reldiffs)/len(reldiffs))
 
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))
diff --git a/tests/test_ops.py b/tests/test_ops.py
@@ -97,8 +97,12 @@ class TestInt8BlockwiseQuantOps:
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_quantize_blockwise(self, device, dtype, blocksize):
-        if device == "cpu" and dtype != torch.float32:
-            pytest.skip("CPU implementation is only available for float32")
+        if device == "cpu":
+            if dtype != torch.float32:
+                pytest.skip("CPU implementation is only available for float32")
+
+            if blocksize != 256:
+                pytest.skip("CPU implementation is slow; only test blocksize=256")
 
         code = bitsandbytes.functional.create_dynamic_map().to(device)
         A = torch.randn(1024, 1024, dtype=dtype, device=device)