Commit cdcae8d
CI runner updates (#1643)

* Test g5g runner
* Switch L4 to L40S runner; swap GitHub Linux T4 runner for AWS g4dn
* Run tests on last 2 PyTorch stable releases

1 parent 513e69b · commit cdcae8d

File tree

3 files changed: +50 / -49 lines

- .github/workflows/tests.yml
- tests/test_functional.py
- tests/test_modules.py

.github/workflows/tests.yml

Lines changed: 44 additions & 10 deletions
```diff
@@ -49,7 +49,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        cuda_version: ["11.8.0", "12.8.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
         include:
           - os: ubuntu-22.04
```
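For context (not part of the diff): a GitHub Actions matrix is a plain cross product before any `include`/`exclude` rules apply, so adding CUDA 12.6.3 grows the build fan-out from 6 to 9 jobs. A minimal illustrative sketch:

```python
# Illustrative only: how the build-cuda matrix above fans out into jobs.
from itertools import product

cuda_versions = ["11.8.0", "12.6.3", "12.8.1"]
oses = ["ubuntu-22.04", "ubuntu-22.04-arm", "windows-2025"]

jobs = [{"cuda_version": c, "os": o} for c, o in product(cuda_versions, oses)]
print(len(jobs))  # 9 build jobs, up from 6 before 12.6.3 was added
```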
```diff
@@ -100,7 +100,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        torch_version: ["2.7.0"]
+        torch_version: ["2.6.0", "2.7.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
```
```diff
@@ -138,9 +138,35 @@ jobs:
       - name: Show installed packages
         run: pip list

+      - name: Show environment information
+        run: python -m torch.utils.collect_env
+
       - name: Run tests
         run: pytest --durations=100

+  # cuda-aarch64-tests:
+  #   if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+  #   needs: build-cuda
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       os: [ubuntu-22.04-arm]
+  #       arch: [aarch64]
+  #       torch_version: ["2.7.0"]
+  #       cuda_version: ["11.8.0", "12.8.1"]
+
+  #   runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
+  #   env:
+  #     BNB_TEST_DEVICE: cuda
+  #   steps:
+  #     - name: Show GPU Information
+  #       run: nvidia-smi
+
+  #     - name: Show pip packages
+  #       run: pip list
+
+
   cuda-tests:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda
```
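The new `Show environment information` step runs PyTorch's bundled environment collector. For the same report programmatically, `torch.utils.collect_env` exposes a helper; a hedged sketch (function name per current PyTorch releases):

```python
# Prints the same report as `python -m torch.utils.collect_env`:
# PyTorch/CUDA versions, GPU models, driver, and relevant pip/conda packages.
from torch.utils.collect_env import get_pretty_env_info

print(get_pretty_env_info())
```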
```diff
@@ -149,25 +175,28 @@ jobs:
       matrix:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
-        gpu: [T4, L4]
-        cuda_version: ["11.8.0", "12.8.1"]
+        gpu: [T4, L40S]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.4.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
+          - cuda_version: "12.6.3"
+            torch_version: "2.6.0"
+            pypi_index: "https://download.pytorch.org/whl/cu126"
           - cuda_version: "12.8.1"
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"

-          # L4 runners
+          # L40S runners
           - os: ubuntu-22.04
-            gpu: L4
-            runner: bandb-aws-g6-4xlarge-plus-use1-public-80
+            gpu: L40S
+            runner: bandb-aws-g6e-4xlarge-plus-use1-public-80

           # T4 runners
           - os: ubuntu-22.04
             gpu: T4
-            runner: CUDA-Linux-x64
+            runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
           - os: windows-2025
             gpu: T4
             runner: CUDA-Windows-x64
```
```diff
@@ -176,10 +205,12 @@ jobs:
           # and cannot support CUDA 12+. Skip for now.
           - os: windows-2025
             cuda_version: "12.8.1"
+          - os: windows-2025
+            cuda_version: "12.6.3"

-          # No Windows L4 runners.
+          # No Windows L40S runners.
           - os: windows-2025
-            gpu: L4
+            gpu: L40S
     runs-on: ${{ matrix.runner }}
     env:
       BNB_TEST_DEVICE: cuda
```
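On the exclude semantics above: a matrix combination is dropped when it matches every key of any `exclude` entry, so after these rules Windows only tests T4 on CUDA 11.8. An illustrative sketch (not part of the commit):

```python
# Recomputes the effective cuda-tests matrix after the exclude rules above.
from itertools import product

combos = [
    {"os": o, "gpu": g, "cuda_version": c}
    for o, g, c in product(
        ["ubuntu-22.04", "windows-2025"],
        ["T4", "L40S"],
        ["11.8.0", "12.6.3", "12.8.1"],
    )
]
excludes = [
    {"os": "windows-2025", "cuda_version": "12.8.1"},
    {"os": "windows-2025", "cuda_version": "12.6.3"},
    {"os": "windows-2025", "gpu": "L40S"},
]

def excluded(combo):
    # Dropped if the combo matches every key of some exclude entry.
    return any(all(combo[k] == v for k, v in e.items()) for e in excludes)

effective = [c for c in combos if not excluded(c)]
print(len(effective))  # 7 jobs: all 6 Linux combos + Windows T4 on CUDA 11.8
```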
```diff
@@ -210,5 +241,8 @@ jobs:
       - name: Show installed packages
         run: pip list

+      - name: Show environment information
+        run: python -m torch.utils.collect_env
+
       - name: Run tests
         run: pytest --durations=100
```

tests/test_functional.py

Lines changed: 0 additions & 33 deletions
```diff
@@ -929,39 +929,6 @@ def test_spmm_coo_very_sparse(self, dim1, dim2, dtype, out_func):
         # torch.cuda.synchronize()
         # print(time.time() - t0)

-    @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
-    @pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
-    @pytest.mark.skip("No longer supported")
-    def test_integrated_sparse_decomp(self, dim1, dim2):
-        threshold = 3.0
-        for _ in range(k):
-            A = torch.randn(dim1, dim2).cuda().half()
-            w1 = torch.randn(dim1, dim2).cuda().half()
-            out1 = torch.matmul(A, w1.t())
-
-            Cw1, statsw1, _ = F.int8_vectorwise_quant(w1)
-            CA, statsA, _ = F.int8_vectorwise_quant(A)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out2 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            # CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
-            CA, _, statsA, _, coo_tensor = F.double_quant(A, threshold=threshold)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out3 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            assert coo_tensor is not None
-
-            out4 = F.spmm_coo(coo_tensor, w1.t())
-            # idx = torch.unique(coo_tensor._indices()[1]).long()
-            # out4 = torch.matmul(A, w1.t())
-            out5 = out3 + out4
-
-            err1 = torch.abs(out1 - out2).mean().item()
-            err2 = torch.abs(out1 - out5).mean().item()
-            assert err2 < err1
-
     @pytest.mark.parametrize("dim1", [1 * 2048])
     @pytest.mark.parametrize("dim2", [2048])
     @pytest.mark.parametrize("dtype", [torch.int8])
```

tests/test_modules.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -130,7 +130,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert l1.weight.dtype == torch.int8

     l1.eval()
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = l1(b1)
         assert o1.dtype == torch.float16
```
```diff
@@ -139,7 +139,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
```
```diff
@@ -152,7 +152,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
```
```diff
@@ -163,7 +163,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):

     mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).half().to(device)

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
```
```diff
@@ -185,7 +185,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
         .to(device)
     )

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
```
```diff
@@ -207,7 +207,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     w1, w2 = mlp.fc1.weight.clone().to(device), mlp.fc2.weight.clone().to(device)  # grab weights before quantization,
     mlp = mlp.to(device).half()  # and this line triggers quantization

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
```
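Cutting these loops from 100 to 4 iterations just trims CI runtime; each pass already exercises the full quantized forward and the dtype assertion. A minimal standalone sketch of the same smoke-test pattern (assumes bitsandbytes with a CUDA device; `Linear8bitLt` arguments mirror the MLP8bit fixtures in this test file):

```python
import torch
import bitsandbytes as bnb

# int8 inference layer, configured like the fixtures in tests/test_modules.py.
layer = bnb.nn.Linear8bitLt(32, 64, threshold=6.0, has_fp16_weights=False).half().cuda()
layer.eval()

for _ in range(4):  # reduced from 100: same coverage per pass, much faster CI
    x = torch.randn(16, 8, 32, device="cuda", dtype=torch.float16)
    out = layer(x)
    assert out.dtype == torch.float16
```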
