Commit 5c0a0a9

CUDA: Drop compilation compatibility with Maxwell (#1806)

* CUDA: Drop Maxwell compatibility
* Update docs

1 parent 8f5d139 · commit 5c0a0a9

3 files changed: +13 −32 lines

csrc/common.cuh

Lines changed: 0 additions & 4 deletions

@@ -2,9 +2,6 @@
 
 // TODO: Let's make some of these constexpr and put in a namespace.
 
-#define BNB_CC_MAXWELL 500
-#define BNB_CC_MAXWELL2 520
-#define BNB_CC_MAXWELL2_X1 530
 #define BNB_CC_PASCAL 600
 #define BNB_CC_PASCAL_X2 620
 #define BNB_CC_VOLTA 700
@@ -17,7 +14,6 @@
 #define BNB_CC_HOPPER 900
 #define BNB_CC_BLACKWELL 1000
 
-#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)
 #define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)
 #define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER)
 #define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)
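For context: these macros are compared against `__CUDA_ARCH__` at device-compile time to gate architecture-specific code paths. With the Maxwell entries removed, the lowest capability the header describes is Pascal (`BNB_CC_PASCAL` = 600), which is why `BNB_FP16_AVAILABLE` could be dropped: every remaining target already supports fp16. A minimal sketch of how a surviving guard is typically used (illustrative only, not code from this commit; `bf16_scale` is a hypothetical kernel that assumes `csrc/common.cuh` is included):

```cuda
#include <cuda_bf16.h>

// Hypothetical example: scale a bf16 buffer, selecting native bf16
// arithmetic where BNB_BF16_AVAILABLE holds (__CUDA_ARCH__ >= BNB_CC_AMPERE)
// and falling back to an fp32 round-trip on older targets.
__global__ void bf16_scale(__nv_bfloat16* x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;
#if BNB_BF16_AVAILABLE
    // sm_80+ has a native bf16 multiply.
    x[i] = __hmul(x[i], __float2bfloat16(s));
#else
    // Conversions are emulated on pre-Ampere targets: promote to fp32,
    // multiply, and convert back.
    x[i] = __float2bfloat16(__bfloat162float(x[i]) * s);
#endif
}
```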

csrc/kernels.cu

Lines changed: 6 additions & 14 deletions

@@ -1767,15 +1767,7 @@ template <typename T, int THREADS, int SPARSE_DECOMP>
 __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
 void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols) {
 
-    // For sm50/sm52 and CUDA < 12.2 we need to do the reduction in fp32.
-    // Otherwise `T` is `fp16`. This can be removed when Maxwell is dropped.
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR >= 2) || BNB_FP16_AVAILABLE
-    using TReduction = T;
-#else
-    using TReduction = float;
-#endif
-
-    using BlockReduceT = cub::BlockReduce<TReduction, THREADS>;
+    using BlockReduceT = cub::BlockReduce<T, THREADS>;
 
     // One block per row.
     // Threads load column values in a striped arrangement.
@@ -1785,27 +1777,27 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     // We then do a blockwise reduction to determine the row's absmax.
 
     __shared__ typename BlockReduceT::TempStorage temp_storage;
-    __shared__ TReduction smem_row_absmax;
+    __shared__ T smem_row_absmax;
 
     const int row_id = blockIdx.x;
     const T* row_data = A + (row_id * cols);
 
     // Threads will read the row values in a striped access pattern and find a local absmax.
-    TReduction row_local_absmax = -FLT_MIN;
+    T row_local_absmax = -FLT_MIN;
     for (int i = threadIdx.x; i < cols; i += THREADS) {
-        const TReduction absval = fabsf(__ldcs(&(row_data[i])));
+        const T absval = fabsf(__ldcs(&(row_data[i])));
 
         // For sparse decomposition, values outside of the threshold are not to be
         // included when calculating the row's absmax.
         if constexpr (SPARSE_DECOMP) {
-            row_local_absmax = fmaxf(row_local_absmax, absval < TReduction(threshold) ? absval : row_local_absmax);
+            row_local_absmax = fmaxf(row_local_absmax, absval < T(threshold) ? absval : row_local_absmax);
         } else {
            row_local_absmax = fmaxf(row_local_absmax, absval);
         }
     }
 
     // Reduce thread-local absmax across the block.
-    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const T row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = smem_row_absmax = row_absmax;
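The removed `TReduction` indirection existed only because fp16 reductions were unreliable on sm_50/sm_52; with Pascal as the floor, the kernel now reduces directly in `T`. For readers unfamiliar with the idiom, here is a standalone, simplified sketch of the same CUB block-reduction pattern (float-only; `row_absmax` is a hypothetical kernel, not the bitsandbytes one):

```cuda
#include <cub/cub.cuh>

// Simplified sketch: one block per row, striped loads, block-wide max.
// THREADS must equal the block size used at launch.
template <int THREADS>
__global__ void row_absmax(const float* __restrict__ A, float* out, int cols) {
    using BlockReduce = cub::BlockReduce<float, THREADS>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const float* row = A + blockIdx.x * cols;

    // Each thread scans a strided slice of the row for a local absmax.
    float local_max = 0.0f;
    for (int i = threadIdx.x; i < cols; i += THREADS)
        local_max = fmaxf(local_max, fabsf(row[i]));

    // Combine the per-thread maxima; the result is valid in thread 0 only.
    float block_max = BlockReduce(temp_storage).Reduce(local_max, cub::Max());
    if (threadIdx.x == 0)
        out[blockIdx.x] = block_max;
}
```

Launched one block per row, e.g. `row_absmax<256><<<rows, 256>>>(d_A, d_out, cols);`.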

docs/source/installation.mdx

Lines changed: 7 additions & 14 deletions

@@ -40,12 +40,6 @@ The library can be built using CUDA Toolkit versions as old as **11.8**.
 | NF4/FP4 quantization | 6.0+ | Pascal (GTX 10X0 series, P100) or newer GPUs|
 
 
-> [!WARNING]
-> Support for Maxwell GPUs is deprecated and will be removed in a future release.
-> Maxwell support is not included in PyPI distributions from `v0.48.0` on and must be built from source.
-> For the best results, a Turing generation device or newer is recommended.
-
-
 ### Installation via PyPI[[cuda-pip]]
 
 This is the most straightforward and recommended installation option.
@@ -81,7 +75,7 @@ For Linux and Windows systems, compiling from source allows you to customize the
 <hfoptions id="source">
 <hfoption id="Linux">
 
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 9 or newer.
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 11 or newer.
 
 For example, to install a compiler and CMake on Ubuntu:
 
@@ -133,7 +127,7 @@ The currently distributed `bitsandbytes` packages are built with the following configurations:
 | **OS**             | **oneAPI Toolkit** | **Kernel Implementation** |
 |--------------------|--------------------|---------------------------|
 | **Linux x86-64**   | 2025.1.3           | SYCL + Triton |
-| **Windows x86-64** | N/A                | SYCL |
+| **Windows x86-64** | 2025.1.3           | SYCL + Triton |
 
 The Linux build has a minimum glibc version of 2.34.
 
@@ -197,12 +191,11 @@ pip install -e .
 The currently distributed preview `bitsandbytes` are built with the following configurations:
 
 | **OS** | **ROCm** | **Targets**
-|--------------------|----------|---------------------------|
-| **Linux x86-64**   | 6.1.2    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 6.2.4    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 6.3.4    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 6.4.4    | gfx90a / gfx942 / gfx1100
-| **Linux x86-64**   | 7.0.0    | gfx90a / gfx942 / gfx1100
+|--------------------|----------|-----------------------------------------------------------------|
+| **Linux x86-64**   | 6.2.4    | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101
+| **Linux x86-64**   | 6.3.4    | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101
+| **Linux x86-64**   | 6.4.4    | CDNA: gfx90a, gfx942 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201
+| **Linux x86-64**   | 7.0.2    | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1200, gfx1201
 
 **Windows is not currently supported.**
 
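Because the published CUDA wheels now target Pascal (compute capability 6.0) and newer, users on older hardware may want to check their device before installing. A minimal sketch of such a check (hypothetical helper, not part of bitsandbytes):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical check: does device 0 meet the new minimum (Pascal, CC 6.0)?
int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::printf("No CUDA device found.\n");
        return 1;
    }
    bool supported = prop.major >= 6;
    std::printf("%s: compute capability %d.%d -> %s\n", prop.name, prop.major,
                prop.minor, supported ? "supported" : "Maxwell or older; unsupported");
    return supported ? 0 : 1;
}
```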
