Skip to content

Commit 1a77949

Browse files
authored
Merge branch 'main' into ipex
2 parents a9e5c4a + 42bc729 commit 1a77949

File tree

4 files changed

+49
-37
lines changed

4 files changed

+49
-37
lines changed

.github/scripts/build-cuda.sh

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -24,13 +24,16 @@ fi
2424
[[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
2525

2626
if [ "${build_os:0:6}" == ubuntu ]; then
27-
image=nvidia/cuda:${cuda_version}-devel-ubuntu22.04
27+
# We'll use Rocky Linux 8 in order to maintain manylinux 2.24 compatibility.
28+
image="nvidia/cuda:${cuda_version}-devel-rockylinux8"
2829
echo "Using image $image"
29-
docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \
30-
"apt-get update \
31-
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
32-
&& cmake -DPTXAS_VERBOSE=1 -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" . \
33-
&& cmake --build ."
30+
31+
docker run -i -w /src -v "$PWD:/src" "$image" bash -c \
32+
"dnf update -y \
33+
&& dnf install cmake gcc-toolset-11 -y \
34+
&& source scl_source enable gcc-toolset-11 \
35+
&& cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" . \
36+
&& cmake --build . --config Release"
3437
else
3538
pip install cmake==3.28.3
3639
cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DCMAKE_BUILD_TYPE=Release -S .

.github/workflows/tests.yml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,14 +15,16 @@ jobs:
1515
build-cpu:
1616
strategy:
1717
matrix:
18-
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
18+
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
1919
include:
2020
- os: ubuntu-22.04
2121
arch: x86_64
2222
- os: ubuntu-22.04-arm
2323
arch: aarch64
2424
- os: windows-2025
2525
arch: x86_64
26+
- os: macos-15
27+
arch: arm64
2628
runs-on: ${{ matrix.os }}
2729
steps:
2830
- uses: actions/checkout@v4
@@ -97,7 +99,7 @@ jobs:
9799
strategy:
98100
fail-fast: false
99101
matrix:
100-
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
102+
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
101103
torch_version: ["2.7.0"]
102104
include:
103105
- os: ubuntu-22.04
@@ -106,6 +108,8 @@ jobs:
106108
arch: aarch64
107109
- os: windows-2025
108110
arch: x86_64
111+
- os: macos-15
112+
arch: arm64
109113
runs-on: ${{ matrix.os }}
110114
env:
111115
BNB_TEST_DEVICE: cpu

tests/test_functional.py

Lines changed: 28 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -94,15 +94,22 @@ class Test8BitBlockwiseQuantizeFunctional:
9494
@pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
9595
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
9696
def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
97-
if device in ("cpu", "xpu"):
97+
iters = 100
98+
99+
if device == "cpu":
100+
iters = 10
101+
102+
# This test is slow on CPU, so avoid atypical use cases.
103+
if nested:
104+
pytest.skip("Not a typical use case.")
98105
if blocksize != 256:
99106
pytest.skip("Only blocksize 256 is used in CPU/XPU")
100107
if dtype != torch.float32:
101108
pytest.skip("Only float32 is used in CPU/XPU")
102109

103110
diffs = []
104111
reldiffs = []
105-
for i in range(100):
112+
for i in range(iters):
106113
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
107114
C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested)
108115
A2 = F.dequantize_blockwise(C, S)
@@ -112,15 +119,13 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
112119
reldiffs.append(reldiff.mean().item())
113120
abserr = sum(diffs) / len(diffs)
114121
relerr = sum(reldiffs) / len(reldiffs)
115-
# print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs))
116-
# print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs))
117122
assert abserr < 0.011
118123
assert relerr < 0.018
119124
assert A2.dtype == dtype
120125

121126
diffs = []
122127
code = F.create_dynamic_map(signed=signed)
123-
for i in range(100):
128+
for i in range(iters):
124129
A1 = torch.rand(1024, 1024, device=device, dtype=dtype)
125130
C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code)
126131
A2 = F.dequantize_blockwise(C, S)
@@ -139,33 +144,29 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
139144
assert abserr < 0.00175 if device in ("cpu", "xpu") else 0.0023
140145
assert relerr < 0.012
141146
assert A2.dtype == dtype
142-
# print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs))
143-
# print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs))
144-
145-
@pytest.mark.parametrize("device", get_available_devices())
146-
def test_blockwise_cpu_large(self, device):
147-
if device == "xpu":
148-
pytest.skip("XPU will not build CPU C++ codes")
149147

148+
@pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
149+
@pytest.mark.parametrize("hidden", [128])
150+
@pytest.mark.parametrize("blocksize", [4096, 16384])
151+
def test_blockwise_cpu_large(self, hidden, blocksize):
150152
diffs = []
151153
reldiffs = []
152154
batch = 128
153155
seq = 128
154-
for hidden in [128]: # , 14336]:
155-
for blocksize in [4096, 16384]:
156-
for i in range(2):
157-
A1 = torch.randn(batch, seq, hidden, device="cpu")
158-
t0 = time.time()
159-
C, S = F.quantize_blockwise(A1, blocksize=blocksize)
160-
A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
161-
print(time.time() - t0)
162-
diff = torch.abs(A1 - A2)
163-
reldiff = diff / torch.abs(A1 + 1e-8)
164-
diffs.append(diff.mean().item())
165-
reldiffs.append(reldiff.mean().item())
166-
assert diffs[-1] < 0.011
167-
# print(sum(diffs)/len(diffs))
168-
# print(sum(reldiffs)/len(reldiffs))
156+
157+
for i in range(2):
158+
A1 = torch.randn(batch, seq, hidden, device="cpu")
159+
t0 = time.time()
160+
C, S = F.quantize_blockwise(A1, blocksize=blocksize)
161+
A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
162+
print(time.time() - t0)
163+
diff = torch.abs(A1 - A2)
164+
reldiff = diff / torch.abs(A1 + 1e-8)
165+
diffs.append(diff.mean().item())
166+
reldiffs.append(reldiff.mean().item())
167+
assert diffs[-1] < 0.011
168+
# print(sum(diffs)/len(diffs))
169+
# print(sum(reldiffs)/len(reldiffs))
169170

170171
@pytest.mark.parametrize("device", get_available_devices())
171172
@pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))

tests/test_ops.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,8 +97,12 @@ class TestInt8BlockwiseQuantOps:
9797
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
9898
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
9999
def test_quantize_blockwise(self, device, dtype, blocksize):
100-
if device == "cpu" and dtype != torch.float32:
101-
pytest.skip("CPU implementation is only available for float32")
100+
if device == "cpu":
101+
if dtype != torch.float32:
102+
pytest.skip("CPU implementation is only available for float32")
103+
104+
if blocksize != 256:
105+
pytest.skip("CPU implementation is slow; only test blocksize=256")
102106

103107
code = bitsandbytes.functional.create_dynamic_map().to(device)
104108
A = torch.randn(1024, 1024, dtype=dtype, device=device)

0 commit comments

Comments
 (0)