
Commit 9c90811

Merge branch 'main' into build-cmake

2 parents: 19cd26c + e772a9e

12 files changed: +30 -50 lines. One file was renamed without changes.
.github/scripts/build-cuda.sh (10 additions, 7 deletions)

```diff
@@ -4,13 +4,16 @@ declare build_os
 declare cuda_version

 set -xeuo pipefail
-build_capability="50;52;60;61;70;75;80;86;89;90;100;120"
-remove_for_11_7=";89;90;100;120"
-remove_for_11_8=";100;120"
-remove_for_lt_12_7=";100;120"
-[[ "${cuda_version}" == 11.7.* ]] && build_capability=$(sed 's|'"$remove_for_11_7"'||g' <<< "$build_capability")
-[[ "${cuda_version}" == 11.8.* ]] && build_capability=$(sed 's|'"$remove_for_11_8"'||g' <<< "$build_capability")
-[[ "${cuda_version}" < 12.7 ]] && build_capability=$(sed 's|'"$remove_for_lt_12_7"'||g; s|'"${remove_for_lt_12_7#;}"';||g' <<< "$build_capability")
+
+# By default, target Maxwell through Hopper.
+build_capability="50;52;60;61;70;75;80;86;89;90"
+
+# CUDA 11.7: Remove sm89 and sm90
+[[ "${cuda_version}" == 11.7.* ]] && build_capability="50;52;60;61;70;75;80;86"
+
+# CUDA 12.8: Add sm100 and sm120; remove sm50 through sm61
+[[ "${cuda_version}" == 12.8.* ]] && build_capability="70;75;80;86;89;90;100;120"
+
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja

 if [ "${build_os:0:6}" == ubuntu ]; then
```
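For reference, here is a minimal Python sketch (illustration only, not part of the repository) of the version-to-capability mapping the new script encodes with plain string assignments instead of sed-based list surgery:

```python
def select_build_capability(cuda_version: str) -> str:
    """Hypothetical restatement of the build-cuda.sh selection logic."""
    # Default: Maxwell (sm50) through Hopper (sm90).
    capability = "50;52;60;61;70;75;80;86;89;90"
    if cuda_version.startswith("11.7."):
        # CUDA 11.7 cannot target sm89/sm90.
        capability = "50;52;60;61;70;75;80;86"
    elif cuda_version.startswith("12.8."):
        # CUDA 12.8 adds Blackwell (sm100/sm120) and drops sm50 through sm61.
        capability = "70;75;80;86;89;90;100;120"
    return capability

assert select_build_capability("11.7.1") == "50;52;60;61;70;75;80;86"
assert select_build_capability("12.8.0").endswith("100;120")
```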

.github/workflows/python-package.yml (1 addition, 1 deletion)

```diff
@@ -30,7 +30,7 @@ jobs:
   build-shared-libs:
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-22.04, macos-latest, windows-latest]
         arch: [x86_64, aarch64]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
```

README.md (1 addition, 15 deletions)

```diff
@@ -6,26 +6,12 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom fu

 The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module.

-There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is quite far along and is on its way as well.
+There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon, hopefully NPU.

 **Please head to the official documentation page:**

 **[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**

-## `bitsandbytes` multi-backend _alpha_ release is out!
-
-🚀 Big news! After months of hard work and incredible community contributions, we're thrilled to announce the **bitsandbytes multi-backend _alpha_ release**! 💥
-
-Now supporting:
-- 🔥 **AMD GPUs** (ROCm)
-- **Intel CPUs** & **GPUs**
-
-We’d love your early feedback! 🙏
-
-👉 [Instructions for your `pip install` here](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend)
-
-We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality (e.g. helping us with the upcoming Apple Silicon backend or reporting bugs). BNB is a community project and we're excited for your collaboration 🤗
-
 ## License

 `bitsandbytes` is MIT licensed.
```

bitsandbytes/__init__.py (1 addition, 1 deletion)

```diff
@@ -21,4 +21,4 @@
     "optim.optimizer.MockArgs": False,
 }

-__version__ = "0.45.3.dev0"
+__version__ = "0.45.4.dev0"
```

bitsandbytes/functional.py (5 additions, 4 deletions)

```diff
@@ -389,14 +389,14 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
             if signed
             else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1,
         )
-        boundaries = torch.linspace(0.1, 1, fraction_items)
+        boundaries = torch.linspace(0.1, 1, fraction_items, dtype=torch.float32)
         means = (boundaries[:-1] + boundaries[1:]) / 2.0
         data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
         if signed:
             data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()

         if additional_items > 0:
-            boundaries = torch.linspace(0.1, 1, additional_items + 1)
+            boundaries = torch.linspace(0.1, 1, additional_items + 1, dtype=torch.float32)
             means = (boundaries[:-1] + boundaries[1:]) / 2.0
             data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
             if signed:
@@ -412,7 +412,7 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
             data.append(0)

     data.sort()
-    return torch.tensor(data)
+    return torch.tensor(data, dtype=torch.float32)


 def create_quantile_map(A, total_bits=8):
```
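The `dtype=torch.float32` pins matter because `torch.linspace` and `torch.tensor` otherwise follow the global default dtype, so the quantization map could silently come out in a different precision if a caller had changed that default. A small standalone check (plain `torch`, no bitsandbytes required) showing the behavior the pins guard against:

```python
import torch

# Without an explicit dtype, torch.linspace follows the global default.
torch.set_default_dtype(torch.float64)
print(torch.linspace(0.1, 1, 8).dtype)                       # torch.float64

# Pinning the dtype keeps the quantization map in float32 no matter
# what the caller has set as the default.
print(torch.linspace(0.1, 1, 8, dtype=torch.float32).dtype)  # torch.float32

torch.set_default_dtype(torch.float32)  # restore the usual default
```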
```diff
@@ -821,6 +821,7 @@ def as_dict(self, packed=False):
     def to(self, device):
         # make sure the quantization state is on the right device
+        self.code = self.code.to(device)
         self.absmax = self.absmax.to(device)
         if self.nested:
             self.offset = self.offset.to(device)
```
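A minimal sketch (a simplified stand-in, not the library's actual `QuantState` class) of why the codebook tensor has to move along with the rest of the state:

```python
import torch

class QuantStateSketch:
    """Simplified stand-in for bitsandbytes' QuantState (illustration only)."""

    def __init__(self, code: torch.Tensor, absmax: torch.Tensor):
        self.code = code      # quantization codebook (lookup table)
        self.absmax = absmax  # per-block scaling factors

    def to(self, device):
        # Before this fix, only absmax (and the nested state) moved; the
        # codebook stayed behind, so dequantization could end up mixing
        # tensors on two different devices.
        self.code = self.code.to(device)
        self.absmax = self.absmax.to(device)


state = QuantStateSketch(code=torch.randn(256), absmax=torch.rand(16))
state.to("cpu")
assert state.code.device == state.absmax.device
```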
```diff
@@ -1048,7 +1049,7 @@ def dequantize_blockwise(
         lib.cdequantize_blockwise_cpu_fp32(
             get_ptr(code),
             get_ptr(A),
-            get_ptr(quant_state.absmax),
+            get_ptr(absmax),
             get_ptr(out),
             ct.c_longlong(quant_state.blocksize),
             ct.c_longlong(A.numel()),
```

This points the CPU dequantization kernel at the locally resolved `absmax` rather than the raw `quant_state.absmax`; the two presumably differ for nested quantization states, where the stored absmax values must themselves be dequantized first.

docs/source/faqs.mdx (1 addition, 1 deletion)

```diff
@@ -1,6 +1,6 @@
 # FAQs

-Please submit your questions in [this Github Discussion thread](https://github.com/TimDettmers/bitsandbytes/discussions/1013) if you feel that they will likely affect a lot of other users and that they haven't been sufficiently covered in the documentation.
+Please submit your questions in [this Github Discussion thread](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1013) if you feel that they will likely affect a lot of other users and that they haven't been sufficiently covered in the documentation.

 We'll pick the most generally applicable ones and post the QAs here or integrate them into the general documentation (also feel free to submit doc PRs, please).
```

docs/source/fsdp_qlora.md (1 addition, 1 deletion)

```diff
@@ -5,7 +5,7 @@ FSDP-QLoRA combines data parallelism (FSDP enables sharding model parameters, op
 This guide provides a brief guide on how bitsandbytes supports storing quantized weights to enable FSDP-QLoRA, and how to run training with the Hugging Face libraries.

 > [!TIP]
-> Other changes required for bitsandbytes to support FSDP-QLoRA, such as reconstructing the weights from the quantization metadata and preventing quantizing already quantized weights when they're moved from a CPU to GPU, are documented in this [Pull Request](https://github.com/TimDettmers/bitsandbytes/pull/970) and described in the [Enabling 70B Finetuning on Consumer GPUs](https://www.answer.ai/posts/2024-03-14-fsdp-qlora-deep-dive) blog post. We highly recommend reading these resources for a better understanding of FSDP-QLoRA!
+> Other changes required for bitsandbytes to support FSDP-QLoRA, such as reconstructing the weights from the quantization metadata and preventing quantizing already quantized weights when they're moved from a CPU to GPU, are documented in this [Pull Request](https://github.com/bitsandbytes-foundation/bitsandbytes/pull/970) and described in the [Enabling 70B Finetuning on Consumer GPUs](https://www.answer.ai/posts/2024-03-14-fsdp-qlora-deep-dive) blog post. We highly recommend reading these resources for a better understanding of FSDP-QLoRA!

 ## Quantized data storage
```

docs/source/installation.mdx (5 additions, 15 deletions)

````diff
@@ -19,7 +19,7 @@ Welcome to the installation guide for the `bitsandbytes` library! This document

 ## CUDA[[cuda]]

-`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.6**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.8**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).

 ### Supported CUDA Configurations[[cuda-pip]]

@@ -28,10 +28,8 @@ The latest version of the distributed `bitsandbytes` package is built with the f
 | **OS**      | **CUDA Toolkit** | **Host Compiler**    |
 |-------------|------------------|----------------------|
 | **Linux**   | 11.7 - 12.3      | GCC 11.4             |
-| | 12.4 - 12.6 | GCC 13.2 |
-| **Windows** | 11.7 - 12.6 | MSVC 19.42+ (VS2022) |
-| | 12.4+ | GCC 13.2 |
-| **Windows** | 11.7 - 12.6 | MSVC 19.38+ (VS2022) |
+| | 12.4 - 12.8 | GCC 13.2 |
+| **Windows** | 11.7 - 12.8 | MSVC 19.42+ (VS2022) |

 For CUDA systems, ensure your hardware meets the following requirements:

@@ -104,7 +102,6 @@ Now to install the bitsandbytes package from source, run the following commands:

 ```bash
 git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
-pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cuda -S .
 make
 pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)

@@ -152,7 +149,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte
 ```bash
 wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 128}
 # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True

 # For example, the following installs CUDA 12.6 to ~/local/cuda-12.6 and exports the path to your .bashrc

@@ -228,7 +225,7 @@ pip install "transformers>=4.45.1"
 <hfoption id="AMD ROCm">

 > [!WARNING]
-> Pre-compiled binaries are only built for ROCm versions `6.1.0`/`6.1.1`/`6.1.2`/`6.2.0` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
+> Pre-compiled binaries are only built for ROCm versions `6.1.2`/`6.2.4`/`6.3.2` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
 >
 > Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
 >

@@ -320,9 +317,6 @@ bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha releas
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
 git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

-# Install dependencies
-pip install -r requirements-dev.txt
-
 # Compile & install
 apt-get install -y build-essential cmake # install build tools dependencies, unless present
 cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch

@@ -345,7 +339,6 @@ The below commands are for Linux. For installing on Windows, please adapt the be
 ```
 git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
-pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cpu -S .
 make
 pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)

@@ -365,9 +358,6 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
 # Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch
 git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

-# Install dependencies
-pip install -r requirements-dev.txt
-
 # Compile & install
 apt-get install -y build-essential cmake # install build tools dependencies, unless present
 cmake -DCOMPUTE_BACKEND=npu -S .
````

install_cuda.sh (2 additions, 2 deletions)

```diff
@@ -69,10 +69,10 @@ if [[ -n "$CUDA_VERSION" ]]; then
         URL=$URL126
         FOLDER=cuda-12.6
     else
-        echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
+        echo "argument error: No cuda version passed as input. Choose among versions 110 to 126"
     fi
 else
-    echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
+    echo "argument error: No cuda version passed as input. Choose among versions 110 to 126"
 fi

 FILE=$(basename $URL)
```
