
Commit 445725b

Merge branch 'main' into cpu_fused_kernel
2 parents: 26b5685 + 76f45fe

10 files changed: +37 additions, −62 deletions


.github/scripts/build-rocm.sh

Lines changed: 3 additions & 3 deletions
@@ -7,18 +7,18 @@ set -xeuo pipefail
 bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
 
 # ROCm 6.4+ - Add gfx1200/gfx1201. Note we assume >=6.4.1.
-[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
+[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
 
 # ROCm 7.0+ - Add gfx950
-[[ "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
+[[ "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
 
 if [ "${build_os:0:6}" == ubuntu ]; then
   image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
   echo "Using image $image"
   docker run --rm --platform "linux/$build_arch" -i \
     -w /src -v "$PWD:/src" "$image" sh -c \
     "apt-get update \
-      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+      && pip install cmake==3.31.6 \
       && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
 fi
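
Side note on the relaxed pattern above: the test matrix (see python-package.yml below) now carries the two-component version "7.1", which the old "7.*.*" glob does not match, while "7.*" matches both forms. A quick illustration using Python's fnmatch, whose "*" behaves like the bash [[ ... == pattern ]] glob for these strings (illustrative only, not part of this commit):

from fnmatch import fnmatch

for version in ("7.0.2", "7.1"):
    print(version, fnmatch(version, "7.*.*"), fnmatch(version, "7.*"))
# 7.0.2 True True
# 7.1 False True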

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04]
         arch: [x86_64]
-        rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2"]
+        rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4

.github/workflows/tests.yml

Lines changed: 8 additions & 15 deletions
@@ -54,8 +54,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        # TODO: Add 13.0.1 when we have runners with new enough drivers.
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm]
         include:
           - os: ubuntu-22.04
@@ -349,26 +348,20 @@
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.1"]
         include:
           - cuda_version: "11.8.0"
            torch_version: "2.3.1"
            pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
-           torch_version: "2.6.0"
+           torch_version: "2.7.1"
            pypi_index: "https://download.pytorch.org/whl/cu126"
-          - cuda_version: "12.9.1"
-           torch_version: "2.8.0"
-           pypi_index: "https://download.pytorch.org/whl/cu129"
           - cuda_version: "12.8.1"
-           torch_version: "2.9.0"
-           pypi_index: "https://download.pytorch.org/whl/test/cu128"
-
-        # Note: Currently our runners do not have new enough drivers for CUDA 13.
-        # Add this when supported.
-        # - cuda_version: "13.0.1"
-        #   torch_version: "2.9.0"
-        #   pypi_index: "https://download.pytorch.org/whl/test/cu130"
+           torch_version: "2.8.0"
+           pypi_index: "https://download.pytorch.org/whl/cu128"
+          - cuda_version: "13.0.1"
+           torch_version: "2.9.1"
+           pypi_index: "https://download.pytorch.org/whl/cu130"
 
 
       # Linux L40S runners

csrc/common.cuh

Lines changed: 0 additions & 4 deletions
@@ -2,9 +2,6 @@
 
 // TODO: Let's make some of these constexpr and put in a namespace.
 
-#define BNB_CC_MAXWELL 500
-#define BNB_CC_MAXWELL2 520
-#define BNB_CC_MAXWELL2_X1 530
 #define BNB_CC_PASCAL 600
 #define BNB_CC_PASCAL_X2 620
 #define BNB_CC_VOLTA 700
@@ -17,7 +14,6 @@
 #define BNB_CC_HOPPER 900
 #define BNB_CC_BLACKWELL 1000
 
-#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)
 #define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)
 #define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER)
 #define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)

csrc/kernels.cu

Lines changed: 6 additions & 14 deletions
@@ -1767,15 +1767,7 @@ template <typename T, int THREADS, int SPARSE_DECOMP>
 __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols) {
 
-    // For sm50/sm52 and CUDA < 12.2 we need to do the reduction in fp32.
-    // Otherwise `T` is `fp16`. This can be removed when Maxwell is dropped.
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR >= 2) || BNB_FP16_AVAILABLE
-    using TReduction = T;
-#else
-    using TReduction = float;
-#endif
-
-    using BlockReduceT = cub::BlockReduce<TReduction, THREADS>;
+    using BlockReduceT = cub::BlockReduce<T, THREADS>;
 
     // One block per row.
     // Threads load column values in a striped arrangement.
@@ -1785,27 +1777,27 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     // We then do a blockwise reduction to determine the row's absmax.
 
     __shared__ typename BlockReduceT::TempStorage temp_storage;
-    __shared__ TReduction smem_row_absmax;
+    __shared__ T smem_row_absmax;
 
     const int row_id = blockIdx.x;
     const T* row_data = A + (row_id * cols);
 
     // Threads will read the row values in a striped access pattern and find a local absmax.
-    TReduction row_local_absmax = -FLT_MIN;
+    T row_local_absmax = -FLT_MIN;
     for (int i = threadIdx.x; i < cols; i += THREADS) {
-        const TReduction absval = fabsf(__ldcs(&(row_data[i])));
+        const T absval = fabsf(__ldcs(&(row_data[i])));
 
         // For sparse decomposition, values outside of the threshold are not to be
         // included when calculating the row's absmax.
         if constexpr (SPARSE_DECOMP) {
-            row_local_absmax = fmaxf(row_local_absmax, absval < TReduction(threshold) ? absval : row_local_absmax);
+            row_local_absmax = fmaxf(row_local_absmax, absval < T(threshold) ? absval : row_local_absmax);
         } else {
             row_local_absmax = fmaxf(row_local_absmax, absval);
         }
     }
 
     // Reduce thread-local absmax across the block.
-    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
+    const T row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = smem_row_absmax = row_absmax;
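
For readers not fluent in CUB: the hunk above only drops the fp32 fallback reduction type; the math is unchanged. A hedged PyTorch sketch (a reference for the statistic, not the library's code) of the per-row absmax this kernel writes to rowStats, including the sparse-decomposition case where values at or above threshold are excluded:

import torch

def row_absmax_reference(A: torch.Tensor, threshold: float = 0.0, sparse_decomp: bool = False) -> torch.Tensor:
    # One value per row, matching rowStats in the kernel above.
    abs_a = A.abs()
    if sparse_decomp:
        # Outliers (>= threshold) are handled separately and excluded from the row absmax.
        abs_a = torch.where(abs_a < threshold, abs_a, torch.zeros_like(abs_a))
    return abs_a.amax(dim=1)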

csrc/kernels.hip

Lines changed: 2 additions & 2 deletions
@@ -2613,9 +2613,9 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
 {
 
     // per threadblock:
-    // load step-by-step in chunks of [BNB_WARP_SIZE,warps]: 1xBNB_WARP_SIZE * [BNB_WARP_SIZE,warps] -> [1,warps]
+    // load step-by-step in chunks of [warp_size,warps]: 1xwarp_size * [warp_size,warps] -> [1,warps]
     // 4 warps -> 4 loads per iter
-    // 1 x BNB_WARP_SIZE * BNB_WARP_SIZE x 4 -> 1x4 outputs per thread block
+    // 1xwarp_size * warp_sizex4 -> 1x4 outputs per thread block
     typedef hipcub::WarpReduce<float, BNB_WARP_SIZE> WarpReduce;
     __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/BNB_WARP_SIZE];
 
docs/source/installation.mdx

Lines changed: 8 additions & 14 deletions
@@ -40,12 +40,6 @@ The library can be built using CUDA Toolkit versions as old as **11.8**.
 | NF4/FP4 quantization | 6.0+ | Pascal (GTX 10X0 series, P100) or newer GPUs|
 
 
-> [!WARNING]
-> Support for Maxwell GPUs is deprecated and will be removed in a future release.
-> Maxwell support is not included in PyPI distributions from `v0.48.0` on and must be built from source.
-> For the best results, a Turing generation device or newer is recommended.
-
-
 ### Installation via PyPI[[cuda-pip]]
 
 This is the most straightforward and recommended installation option.
@@ -81,7 +75,7 @@ For Linux and Windows systems, compiling from source allows you to customize the
 <hfoptions id="source">
 <hfoption id="Linux">
 
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 9 or newer.
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 11 or newer.
 
 For example, to install a compiler and CMake on Ubuntu:
 
@@ -133,7 +127,7 @@ The currently distributed `bitsandbytes` packages are built with the following c
 | **OS** | **oneAPI Toolkit** | **Kernel Implementation** |
 |--------------------|------------------|----------------------|
 | **Linux x86-64** | 2025.1.3 | SYCL + Triton |
-| **Windows x86-64** | N/A | SYCL |
+| **Windows x86-64** | 2025.1.3 | SYCL + Triton |
 
 The Linux build has a minimum glibc version of 2.34.
 
@@ -197,12 +191,12 @@ pip install -e .
 The currently distributed preview `bitsandbytes` are built with the following configurations:
 
 | **OS** | **ROCm** | **Targets**
-|--------------------|----------|---------------------------|
-| **Linux x86-64** | 6.1.2 | gfx90a / gfx942 / gfx1100
-| **Linux x86-64** | 6.2.4 | gfx90a / gfx942 / gfx1100
-| **Linux x86-64** | 6.3.4 | gfx90a / gfx942 / gfx1100
-| **Linux x86-64** | 6.4.4 | gfx90a / gfx942 / gfx1100
-| **Linux x86-64** | 7.0.0 | gfx90a / gfx942 / gfx1100
+|--------------------|----------|---------------------------------------------------------------------|
+| **Linux x86-64** | 6.2.4 | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101
+| **Linux x86-64** | 6.3.4 | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101
+| **Linux x86-64** | 6.4.4 | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201
+| **Linux x86-64** | 7.0.2 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100 / gfx1101 / gfx1200 / gfx1201
+| **Linux x86-64** | 7.1.0 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100 / gfx1101 / gfx1200 / gfx1201
 
 **Windows is not currently supported.**
 
tests/test_functional.py

Lines changed: 3 additions & 3 deletions
@@ -491,7 +491,7 @@ def test_dim3_igemm(self, seq_dim, hidden_dim, batch_dim):
     @pytest.mark.parametrize("hidden_dim", [32, 1024 * 4], ids=id_formatter("hidden_dim"))
     @pytest.mark.parametrize("batch_dim", [2, 16], ids=id_formatter("batch_dim"))
     @pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose"))
-    @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+    @pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
     def test_minmax_igemm(self, seq_dim, hidden_dim, batch_dim, transpose):
         def min_max(x):
             maxA = torch.amax(x, dim=2, keepdim=True)
@@ -1205,7 +1205,7 @@ def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
     @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No accelerator device")
     @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
-    @pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128], ids=id_formatter("blocksize"))
+    @pytest.mark.parametrize("blocksize", [64, 128] if not ROCM_WARP_SIZE_64 else [128], ids=id_formatter("blocksize"))
     def test_4bit_quant_large(self, device, dtype, quant_type, blocksize):
         """
         Test that we can successfully quantize a large tensor. Note that the following limitations apply:
@@ -1428,7 +1428,7 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
-    @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+    @pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
     def test_gemv_eye_4bit(self, device, storage_type, dtype):
         if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
             pytest.skip("This configuration is not supported on HPU.")
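
These markers narrow the skip from "any HIP build" (HIP_ENVIRONMENT) to "ROCm devices with a 64-wide wavefront" (ROCM_WARP_SIZE_64). A minimal sketch of what such a flag could look like, assuming PyTorch device properties expose the warp size; this is illustrative and not bitsandbytes' actual definition:

import torch

def rocm_warp_size_64() -> bool:
    # True only on ROCm (HIP) builds whose first visible device reports a 64-wide warp/wavefront.
    if getattr(torch.version, "hip", None) is None or not torch.cuda.is_available():
        return False
    props = torch.cuda.get_device_properties(0)
    # Some PyTorch builds may not expose warp_size; falling back to 64 (the CDNA default) is an assumption.
    return getattr(props, "warp_size", 64) == 64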

tests/test_linear8bitlt.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 import torch
 
 import bitsandbytes as bnb
-from bitsandbytes.cextension import HIP_ENVIRONMENT
+from bitsandbytes.cextension import ROCM_WARP_SIZE_64
 from bitsandbytes.nn.modules import Linear8bitLt
 from tests.helpers import (
     TRUE_FALSE,
@@ -234,7 +234,7 @@ def test_linear8bit_serialization(linear8bit):
 @pytest.mark.parametrize("fullgraph", TRUE_FALSE, ids=id_formatter("fullgraph"))
 @pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
 @pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+@pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
 def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
     if device == "cuda" and platform.system() == "Windows":
         pytest.skip("Triton is not officially supported on Windows")

tests/test_parametrize.py

Lines changed: 4 additions & 4 deletions
@@ -3,7 +3,7 @@
 import torch.nn as nn
 
 from bitsandbytes import functional as F
-from bitsandbytes.cextension import HIP_ENVIRONMENT
+from bitsandbytes.cextension import ROCM_WARP_SIZE_64
 from bitsandbytes.nn.parametrize import (
     Bnb4bitParametrization,
     replace_parameter_4bit,
@@ -39,7 +39,7 @@ def __init__(self, device="cpu", dtype=torch.float32):
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 @pytest.mark.parametrize(
     "blocksize",
-    [64, 128, 256] if not HIP_ENVIRONMENT else [128, 256],
+    [64, 128, 256] if not ROCM_WARP_SIZE_64 else [128, 256],
 )
 def test_replace_parameter_4bit(device, dtype, quant_type, compress_statistics, blocksize):
     """Test basic parameter replacement with 4-bit quantization on different dtypes."""
@@ -267,7 +267,7 @@ def test_quant_state_preservation(device, dtype):
 
     module = ParametrizeTestModule(device=device, dtype=dtype)
 
-    blocksize = 128 if HIP_ENVIRONMENT else 64
+    blocksize = 128 if ROCM_WARP_SIZE_64 else 64
 
     # Apply parametrization with specific settings
     replace_parameter_4bit(module, "weight_2d", quant_type="nf4", compress_statistics=True, blocksize=blocksize)
@@ -326,7 +326,7 @@ def test_multiple_parameters(device, dtype):
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
 @pytest.mark.parametrize(
     "blocksize",
-    [64, 128, 256] if not HIP_ENVIRONMENT else [128, 256],
+    [64, 128, 256] if not ROCM_WARP_SIZE_64 else [128, 256],
 )
 def test_different_blocksizes(device, dtype, blocksize):
     """Test parametrization with different block sizes to verify flexibility."""
