
Commit 9c90811

Merge branch 'main' into build-cmake

2 parents: 19cd26c + e772a9e

12 files changed: +30 -50 lines. One file was renamed without changes.
.github/scripts/build-cuda.sh (10 additions, 7 deletions)

```diff
@@ -4,13 +4,16 @@ declare build_os
 declare cuda_version

 set -xeuo pipefail
-build_capability="50;52;60;61;70;75;80;86;89;90;100;120"
-remove_for_11_7=";89;90;100;120"
-remove_for_11_8=";100;120"
-remove_for_lt_12_7=";100;120"
-[[ "${cuda_version}" == 11.7.* ]] && build_capability=$(sed 's|'"$remove_for_11_7"'||g' <<< "$build_capability")
-[[ "${cuda_version}" == 11.8.* ]] && build_capability=$(sed 's|'"$remove_for_11_8"'||g' <<< "$build_capability")
-[[ "${cuda_version}" < 12.7 ]] && build_capability=$(sed 's|'"$remove_for_lt_12_7"'||g; s|'"${remove_for_lt_12_7#;}"';||g' <<< "$build_capability")
+
+# By default, target Maxwell through Hopper.
+build_capability="50;52;60;61;70;75;80;86;89;90"
+
+# CUDA 11.7: Remove sm89 and sm90
+[[ "${cuda_version}" == 11.7.* ]] && build_capability="50;52;60;61;70;75;80;86"
+
+# CUDA 12.8: Add sm100 and sm120; remove sm50 through sm61
+[[ "${cuda_version}" == 12.8.* ]] && build_capability="70;75;80;86;89;90;100;120"
+
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja

 if [ "${build_os:0:6}" == ubuntu ]; then
```
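For reference, here is a minimal Python sketch (illustration only, not part of the repository) of the version-to-capability mapping the new script encodes with plain string assignments instead of sed-based list surgery:

```python
def select_build_capability(cuda_version: str) -> str:
    """Hypothetical restatement of the build-cuda.sh selection logic."""
    # Default: Maxwell (sm50) through Hopper (sm90).
    capability = "50;52;60;61;70;75;80;86;89;90"
    if cuda_version.startswith("11.7."):
        # CUDA 11.7 cannot target sm89/sm90.
        capability = "50;52;60;61;70;75;80;86"
    elif cuda_version.startswith("12.8."):
        # CUDA 12.8 adds Blackwell (sm100/sm120) and drops sm50 through sm61.
        capability = "70;75;80;86;89;90;100;120"
    return capability

assert select_build_capability("11.7.1") == "50;52;60;61;70;75;80;86"
assert select_build_capability("12.8.0").endswith("100;120")
```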

.github/workflows/python-package.yml (1 addition, 1 deletion)

```diff
@@ -30,7 +30,7 @@ jobs:
   build-shared-libs:
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-22.04, macos-latest, windows-latest]
         arch: [x86_64, aarch64]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
```

README.md (1 addition, 15 deletions)

```diff
@@ -6,26 +6,12 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom fu

 The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module.

-There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is quite far along and is on its way as well.
+There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon, hopefully NPU.

 **Please head to the official documentation page:**

 **[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**

-## `bitsandbytes` multi-backend _alpha_ release is out!
-
-🚀 Big news! After months of hard work and incredible community contributions, we're thrilled to announce the **bitsandbytes multi-backend _alpha_ release**! 💥
-
-Now supporting:
-- 🔥 **AMD GPUs** (ROCm)
-- **Intel CPUs** & **GPUs**
-
-We’d love your early feedback! 🙏
-
-👉 [Instructions for your `pip install` here](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend)
-
-We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality (e.g. helping us with the upcoming Apple Silicon backend or reporting bugs). BNB is a community project and we're excited for your collaboration 🤗
-
 ## License

 `bitsandbytes` is MIT licensed.
```

bitsandbytes/__init__.py (1 addition, 1 deletion)

```diff
@@ -21,4 +21,4 @@
     "optim.optimizer.MockArgs": False,
 }

-__version__ = "0.45.3.dev0"
+__version__ = "0.45.4.dev0"
```

bitsandbytes/functional.py (5 additions, 4 deletions)

```diff
@@ -389,14 +389,14 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
             if signed
             else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1,
         )
-        boundaries = torch.linspace(0.1, 1, fraction_items)
+        boundaries = torch.linspace(0.1, 1, fraction_items, dtype=torch.float32)
         means = (boundaries[:-1] + boundaries[1:]) / 2.0
         data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
         if signed:
             data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()

         if additional_items > 0:
-            boundaries = torch.linspace(0.1, 1, additional_items + 1)
+            boundaries = torch.linspace(0.1, 1, additional_items + 1, dtype=torch.float32)
             means = (boundaries[:-1] + boundaries[1:]) / 2.0
             data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
             if signed:
@@ -412,7 +412,7 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
             data.append(0)

     data.sort()
-    return torch.tensor(data)
+    return torch.tensor(data, dtype=torch.float32)


 def create_quantile_map(A, total_bits=8):
```
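The `dtype=torch.float32` pins matter because `torch.linspace` and `torch.tensor` otherwise follow the global default dtype, so the quantization map could silently come out in a different precision if a caller had changed that default. A small standalone check (plain `torch`, no bitsandbytes required) showing the behavior the pins guard against:

```python
import torch

# Without an explicit dtype, torch.linspace follows the global default.
torch.set_default_dtype(torch.float64)
print(torch.linspace(0.1, 1, 8).dtype)                       # torch.float64

# Pinning the dtype keeps the quantization map in float32 no matter
# what the caller has set as the default.
print(torch.linspace(0.1, 1, 8, dtype=torch.float32).dtype)  # torch.float32

torch.set_default_dtype(torch.float32)  # restore the usual default
```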
```diff
@@ -821,6 +821,7 @@ def as_dict(self, packed=False):
     def to(self, device):
         # make sure the quantization state is on the right device
+        self.code = self.code.to(device)
         self.absmax = self.absmax.to(device)
         if self.nested:
             self.offset = self.offset.to(device)
```
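A minimal sketch (a simplified stand-in, not the library's actual `QuantState` class) of why the codebook tensor has to move along with the rest of the state:

```python
import torch

class QuantStateSketch:
    """Simplified stand-in for bitsandbytes' QuantState (illustration only)."""

    def __init__(self, code: torch.Tensor, absmax: torch.Tensor):
        self.code = code      # quantization codebook (lookup table)
        self.absmax = absmax  # per-block scaling factors

    def to(self, device):
        # Before this fix, only absmax (and the nested state) moved; the
        # codebook stayed behind, so dequantization could end up mixing
        # tensors on two different devices.
        self.code = self.code.to(device)
        self.absmax = self.absmax.to(device)


state = QuantStateSketch(code=torch.randn(256), absmax=torch.rand(16))
state.to("cpu")
assert state.code.device == state.absmax.device
```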
```diff
@@ -1048,7 +1049,7 @@ def dequantize_blockwise(
         lib.cdequantize_blockwise_cpu_fp32(
             get_ptr(code),
             get_ptr(A),
-            get_ptr(quant_state.absmax),
+            get_ptr(absmax),
             get_ptr(out),
             ct.c_longlong(quant_state.blocksize),
             ct.c_longlong(A.numel()),
```

This points the CPU dequantization kernel at the locally resolved `absmax` rather than the raw `quant_state.absmax`; the two presumably differ for nested quantization states, where the stored absmax values must themselves be dequantized first.

docs/source/faqs.mdx (1 addition, 1 deletion)

```diff
@@ -1,6 +1,6 @@
 # FAQs

-Please submit your questions in [this Github Discussion thread](https://github.com/TimDettmers/bitsandbytes/discussions/1013) if you feel that they will likely affect a lot of other users and that they haven't been sufficiently covered in the documentation.
+Please submit your questions in [this Github Discussion thread](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1013) if you feel that they will likely affect a lot of other users and that they haven't been sufficiently covered in the documentation.

 We'll pick the most generally applicable ones and post the QAs here or integrate them into the general documentation (also feel free to submit doc PRs, please).
```

docs/source/fsdp_qlora.md (1 addition, 1 deletion)

```diff
@@ -5,7 +5,7 @@ FSDP-QLoRA combines data parallelism (FSDP enables sharding model parameters, op
 This guide provides a brief guide on how bitsandbytes supports storing quantized weights to enable FSDP-QLoRA, and how to run training with the Hugging Face libraries.

 > [!TIP]
-> Other changes required for bitsandbytes to support FSDP-QLoRA, such as reconstructing the weights from the quantization metadata and preventing quantizing already quantized weights when they're moved from a CPU to GPU, are documented in this [Pull Request](https://github.com/TimDettmers/bitsandbytes/pull/970) and described in the [Enabling 70B Finetuning on Consumer GPUs](https://www.answer.ai/posts/2024-03-14-fsdp-qlora-deep-dive) blog post. We highly recommend reading these resources for a better understanding of FSDP-QLoRA!
+> Other changes required for bitsandbytes to support FSDP-QLoRA, such as reconstructing the weights from the quantization metadata and preventing quantizing already quantized weights when they're moved from a CPU to GPU, are documented in this [Pull Request](https://github.com/bitsandbytes-foundation/bitsandbytes/pull/970) and described in the [Enabling 70B Finetuning on Consumer GPUs](https://www.answer.ai/posts/2024-03-14-fsdp-qlora-deep-dive) blog post. We highly recommend reading these resources for a better understanding of FSDP-QLoRA!

 ## Quantized data storage
```

docs/source/installation.mdx (5 additions, 15 deletions)

````diff
@@ -19,7 +19,7 @@ Welcome to the installation guide for the `bitsandbytes` library! This document

 ## CUDA[[cuda]]

-`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.6**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.8**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).

 ### Supported CUDA Configurations[[cuda-pip]]

@@ -28,10 +28,8 @@ The latest version of the distributed `bitsandbytes` package is built with the f
 | **OS**      | **CUDA Toolkit** | **Host Compiler**    |
 |-------------|------------------|----------------------|
 | **Linux**   | 11.7 - 12.3      | GCC 11.4             |
-| | 12.4 - 12.6 | GCC 13.2 |
-| **Windows** | 11.7 - 12.6 | MSVC 19.42+ (VS2022) |
-| | 12.4+ | GCC 13.2 |
-| **Windows** | 11.7 - 12.6 | MSVC 19.38+ (VS2022) |
+| | 12.4 - 12.8 | GCC 13.2 |
+| **Windows** | 11.7 - 12.8 | MSVC 19.42+ (VS2022) |

 For CUDA systems, ensure your hardware meets the following requirements:

@@ -104,7 +102,6 @@ Now to install the bitsandbytes package from source, run the following commands:

 ```bash
 git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
-pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cuda -S .
 make
 pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)

@@ -152,7 +149,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte
 ```bash
 wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 128}
 # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True

 # For example, the following installs CUDA 12.6 to ~/local/cuda-12.6 and exports the path to your .bashrc

@@ -228,7 +225,7 @@ pip install "transformers>=4.45.1"
 <hfoption id="AMD ROCm">

 > [!WARNING]
-> Pre-compiled binaries are only built for ROCm versions `6.1.0`/`6.1.1`/`6.1.2`/`6.2.0` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
+> Pre-compiled binaries are only built for ROCm versions `6.1.2`/`6.2.4`/`6.3.2` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
 >
 > Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
 >

@@ -320,9 +317,6 @@ bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha releas
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
 git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

-# Install dependencies
-pip install -r requirements-dev.txt
-
 # Compile & install
 apt-get install -y build-essential cmake # install build tools dependencies, unless present
 cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch

@@ -345,7 +339,6 @@ The below commands are for Linux. For installing on Windows, please adapt the be
 ```
 git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
-pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cpu -S .
 make
 pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)

@@ -365,9 +358,6 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
 # Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch
 git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

-# Install dependencies
-pip install -r requirements-dev.txt
-
 # Compile & install
 apt-get install -y build-essential cmake # install build tools dependencies, unless present
 cmake -DCOMPUTE_BACKEND=npu -S .
````

install_cuda.sh (2 additions, 2 deletions)

```diff
@@ -69,10 +69,10 @@ if [[ -n "$CUDA_VERSION" ]]; then
         URL=$URL126
         FOLDER=cuda-12.6
     else
-        echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
+        echo "argument error: No cuda version passed as input. Choose among versions 110 to 126"
     fi
 else
-    echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
+    echo "argument error: No cuda version passed as input. Choose among versions 110 to 126"
 fi

 FILE=$(basename $URL)
```
