Commit 5c0a0a9

CUDA: Drop compilation compatibility with Maxwell (#1806)

* CUDA: Drop Maxwell compatibility
* Update docs

1 parent 8f5d139 · commit 5c0a0a9

3 files changed: +13 −32 lines

csrc/common.cuh

Lines changed: 0 additions & 4 deletions

@@ -2,9 +2,6 @@
 
 // TODO: Let's make some of these constexpr and put in a namespace.
 
-#define BNB_CC_MAXWELL 500
-#define BNB_CC_MAXWELL2 520
-#define BNB_CC_MAXWELL2_X1 530
 #define BNB_CC_PASCAL 600
 #define BNB_CC_PASCAL_X2 620
 #define BNB_CC_VOLTA 700
@@ -17,7 +14,6 @@
 #define BNB_CC_HOPPER 900
 #define BNB_CC_BLACKWELL 1000
 
-#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)
 #define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)
 #define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER)
 #define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)
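For context: these macros are compared against `__CUDA_ARCH__` at device-compile time to gate architecture-specific code paths. With the Maxwell entries removed, the lowest capability the header describes is Pascal (`BNB_CC_PASCAL` = 600), which is why `BNB_FP16_AVAILABLE` could be dropped: every remaining target already supports fp16. A minimal sketch of how a surviving guard is typically used (illustrative only, not code from this commit; `bf16_scale` is a hypothetical kernel that assumes `csrc/common.cuh` is included):

```cuda
#include <cuda_bf16.h>

// Hypothetical example: scale a bf16 buffer, selecting native bf16
// arithmetic where BNB_BF16_AVAILABLE holds (__CUDA_ARCH__ >= BNB_CC_AMPERE)
// and falling back to an fp32 round-trip on older targets.
__global__ void bf16_scale(__nv_bfloat16* x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;
#if BNB_BF16_AVAILABLE
    // sm_80+ has a native bf16 multiply.
    x[i] = __hmul(x[i], __float2bfloat16(s));
#else
    // Conversions are emulated on pre-Ampere targets: promote to fp32,
    // multiply, and convert back.
    x[i] = __float2bfloat16(__bfloat162float(x[i]) * s);
#endif
}
```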

csrc/kernels.cu

Lines changed: 6 additions & 14 deletions

@@ -1767,15 +1767,7 @@ template <typename T, int THREADS, int SPARSE_DECOMP>
 __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
 void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols) {
 
-    // For sm50/sm52 and CUDA < 12.2 we need to do the reduction in fp32.
-    // Otherwise `T` is `fp16`. This can be removed when Maxwell is dropped.
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR >= 2) || BNB_FP16_AVAILABLE
-    using TReduction = T;
-#else
-    using TReduction = float;
-#endif
-
-    using BlockReduceT = cub::BlockReduce<TReduction, THREADS>;
+    using BlockReduceT = cub::BlockReduce<T, THREADS>;
 
     // One block per row.
     // Threads load column values in a striped arrangement.
@@ -1785,27 +1777,27 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     // We then do a blockwise reduction to determine the row's absmax.
 
     __shared__ typename BlockReduceT::TempStorage temp_storage;
-    __shared__ TReduction smem_row_absmax;
+    __shared__ T smem_row_absmax;
 
     const int row_id = blockIdx.x;
     const T* row_data = A + (row_id * cols);
 
     // Threads will read the row values in a striped access pattern and find a local absmax.
-    TReduction row_local_absmax = -FLT_MIN;
+    T row_local_absmax = -FLT_MIN;
     for (int i = threadIdx.x; i < cols; i += THREADS) {
-        const TReduction absval = fabsf(__ldcs(&(row_data[i])));
+        const T absval = fabsf(__ldcs(&(row_data[i])));
 
         // For sparse decomposition, values outside of the threshold are not to be
         // included when calculating the row's absmax.
         if constexpr (SPARSE_DECOMP) {
-            row_local_absmax = fmaxf(row_local_absmax, absval < TReduction(threshold) ? absval : row_local_absmax);
+            row_local_absmax = fmaxf(row_local_absmax, absval < T(threshold) ? absval : row_local_absmax);
         } else {
            row_local_absmax = fmaxf(row_local_absmax, absval);
         }
     }
 
     // Reduce thread-local absmax across the block.
-    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const T row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = smem_row_absmax = row_absmax;
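The removed `TReduction` indirection existed only because fp16 reductions were unreliable on sm_50/sm_52; with Pascal as the floor, the kernel now reduces directly in `T`. For readers unfamiliar with the idiom, here is a standalone, simplified sketch of the same CUB block-reduction pattern (float-only; `row_absmax` is a hypothetical kernel, not the bitsandbytes one):

```cuda
#include <cub/cub.cuh>

// Simplified sketch: one block per row, striped loads, block-wide max.
// THREADS must equal the block size used at launch.
template <int THREADS>
__global__ void row_absmax(const float* __restrict__ A, float* out, int cols) {
    using BlockReduce = cub::BlockReduce<float, THREADS>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const float* row = A + blockIdx.x * cols;

    // Each thread scans a strided slice of the row for a local absmax.
    float local_max = 0.0f;
    for (int i = threadIdx.x; i < cols; i += THREADS)
        local_max = fmaxf(local_max, fabsf(row[i]));

    // Combine the per-thread maxima; the result is valid in thread 0 only.
    float block_max = BlockReduce(temp_storage).Reduce(local_max, cub::Max());
    if (threadIdx.x == 0)
        out[blockIdx.x] = block_max;
}
```

Launched one block per row, e.g. `row_absmax<256><<<rows, 256>>>(d_A, d_out, cols);`.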

docs/source/installation.mdx

Lines changed: 7 additions & 14 deletions

@@ -40,12 +40,6 @@ The library can be built using CUDA Toolkit versions as old as **11.8**.
 | NF4/FP4 quantization | 6.0+ | Pascal (GTX 10X0 series, P100) or newer GPUs|
 
 
-> [!WARNING]
-> Support for Maxwell GPUs is deprecated and will be removed in a future release.
-> Maxwell support is not included in PyPI distributions from `v0.48.0` on and must be built from source.
-> For the best results, a Turing generation device or newer is recommended.
-
-
 ### Installation via PyPI[[cuda-pip]]
 
 This is the most straightforward and recommended installation option.
@@ -81,7 +75,7 @@ For Linux and Windows systems, compiling from source allows you to customize the
 <hfoptions id="source">
 <hfoption id="Linux">
 
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 9 or newer.
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 11 or newer.
 
 For example, to install a compiler and CMake on Ubuntu:
 
@@ -133,7 +127,7 @@ The currently distributed `bitsandbytes` packages are built with the following configurations:
 | **OS**             | **oneAPI Toolkit** | **Kernel Implementation** |
 |--------------------|--------------------|---------------------------|
 | **Linux x86-64**   | 2025.1.3           | SYCL + Triton |
-| **Windows x86-64** | N/A                | SYCL |
+| **Windows x86-64** | 2025.1.3           | SYCL + Triton |
 
 The Linux build has a minimum glibc version of 2.34.
 
@@ -197,12 +191,11 @@ pip install -e .
 The currently distributed preview `bitsandbytes` are built with the following configurations:
 
 | **OS** | **ROCm** | **Targets**
-|--------------------|----------|---------------------------|
-| **Linux x86-64**   | 6.1.2    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 6.2.4    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 6.3.4    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 6.4.4    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 7.0.0    | gfx90a / gfx942 / gfx1100
+|--------------------|----------|-----------------------------------------------------------------|
+| **Linux x86-64**   | 6.2.4    | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101
+| **Linux x86-64**   | 6.3.4    | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101
+| **Linux x86-64**   | 6.4.4    | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201
+| **Linux x86-64**   | 7.0.2    | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201
 
 **Windows is not currently supported.**
 
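Because the published CUDA wheels now target Pascal (compute capability 6.0) and newer, users on older hardware may want to check their device before installing. A minimal sketch of such a check (hypothetical helper, not part of bitsandbytes):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical check: does device 0 meet the new minimum (Pascal, CC 6.0)?
int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::printf("No CUDA device found.\n");
        return 1;
    }
    bool supported = prop.major >= 6;
    std::printf("%s: compute capability %d.%d -> %s\n", prop.name, prop.major,
                prop.minor, supported ? "supported" : "Maxwell or older; unsupported");
    return supported ? 0 : 1;
}
```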
