
Commit d8bd0b3

Merge branch 'cpu-ops' of https://github.com/TimDettmers/bitsandbytes into cpu-ops
2 parents 9198900 + 958fecb commit d8bd0b3

27 files changed, +603 −631 lines

.github/scripts/build-cuda.sh

Lines changed: 2 additions & 5 deletions
@@ -8,11 +8,8 @@ set -xeuo pipefail
 # By default, target Maxwell through Hopper.
 build_capability="50;52;60;61;70;75;80;86;89;90"
 
-# CUDA 11.7: Remove sm89 and sm90
-[[ "${cuda_version}" == 11.7.* ]] && build_capability="50;52;60;61;70;75;80;86"
-
-# CUDA 12.8: Add sm100 and sm120; remove sm50 through sm61
-[[ "${cuda_version}" == 12.8.* ]] && build_capability="70;75;80;86;89;90;100;120"
+# CUDA 12.8: Add sm100 and sm120; remove < sm75 to align with PyTorch 2.7+cu128 minimum
+[[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;86;89;90;100;120"
 
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
 
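The capability lists above name the SM targets compiled into each CUDA build. Below is a minimal sketch (not part of this commit) that checks whether a local GPU is covered by the cu128 target list; the target set is copied from build-cuda.sh, the rest is illustrative and ignores PTX forward compatibility.

import torch

# SM targets for the CUDA 12.8 build, taken from build_capability above.
cu128_targets = {75, 80, 86, 89, 90, 100, 120}

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    sm = major * 10 + minor
    print(f"Detected sm{sm}; in cu128 build targets: {sm in cu128_targets}")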

.github/workflows/python-package.yml

Lines changed: 3 additions & 33 deletions
@@ -40,7 +40,7 @@ jobs:
             arch: aarch64
           - os: ubuntu-22.04 # Temporary. Takes too long, not ready yet.
             arch: aarch64
-    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - name: Setup MSVC
@@ -70,13 +70,13 @@ jobs:
           - windows-latest
         arch: [x86_64, aarch64]
         cuda_version:
-          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
            arch: aarch64
          - os: ubuntu-22.04 # Temporary. Takes too long, not ready yet.
            arch: aarch64
-    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation)
@@ -250,33 +250,3 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           print-hash: true
-
-  # test:
-  #   needs:
-  #     - build-wheels
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - os: ubuntu-latest
-  #           arch: x86_64
-  #           python-version: "3.8"
-  #         - os: windows-latest
-  #           arch: x86_64
-  #           python-version: "3.8"
-  #   runs-on: ${{ matrix.os }}
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #     - uses: actions/download-artifact@v4
-  #       with:
-  #         merge-multiple: true
-  #         pattern: "bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}*"
-  #         path: wheel/
-  #     - uses: actions/setup-python@v5
-  #       with:
-  #         python-version: ${{ matrix.python-version }}
-  #         cache: pip
-  #     - shell: bash
-  #       run: ls -lar wheel/
-  #     - run: pip install wheel/*.whl -r requirements-ci.txt
-  #     - run: pytest --log-cli-level=DEBUG --continue-on-collection-errors tests

CMakeLists.txt

Lines changed: 7 additions & 22 deletions
@@ -93,8 +93,8 @@ if(BUILD_CUDA)
   )
 endif()
 
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
-  message(FATAL_ERROR "CUDA Version < 11 is not supported")
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
+  message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
 elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
   message(FATAL_ERROR "CUDA Version > 12 is not supported")
 endif()
@@ -103,35 +103,20 @@ if(BUILD_CUDA)
 if(CMAKE_VERSION VERSION_LESS "3.23.0")
   message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
 
-  # 11.x and 12.x both support these at a minimum.
-  set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80)
+  # 11.4+ supports these at a minimum.
+  set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
   set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
 
-  # CUDA 11.1 adds Ampere support for GA102-GA107.
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
-  endif()
-
-  # CUDA 11.4 adds Ampere support for GA10B.
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
-  endif()
-
   # CUDA 11.8 adds support for Ada and Hopper.
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
     list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
     list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
   endif()
-  # CUDA 12.7 adds support for Blackwell B100.
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.7")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100)
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100)
-  endif()
 
-  # CUDA 12.8 adds support for RTX 50 Blackwell.
+  # CUDA 12.8 adds support for Blackwell.
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 101 120)
-    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 101 120)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
   endif()
 endif()
 

bitsandbytes/__init__.py

Lines changed: 33 additions & 7 deletions
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import sys
+
 import torch
 
 from . import _ops, research, utils
@@ -18,24 +20,48 @@
 from .optim import adam
 
 # This is a signal for integrations with transformers/diffusers.
-# Eventually, we will remove this and check based on release version.
+# Eventually we may remove this but it is currently required for compatibility.
 features = {"multi-backend"}
 supported_torch_devices = {
-    "cuda",
     "cpu",
-    # "mps",
-    # "xpu",
-    # "hpu",
-    # "npu",
+    "cuda",  # NVIDIA/AMD GPU
+    "xpu",  # Intel GPU
+    "hpu",  # Gaudi
+    "npu",  # Ascend NPU
+    "mps",  # Apple Silicon
 }
 
 if torch.cuda.is_available():
     from .backends.cuda import ops as cuda_ops
 
+
+def _import_backends():
+    """
+    Discover and autoload all available backends installed as separate packages.
+    Packages with an entrypoint for "bitsandbytes.backends" will be loaded.
+    Inspired by PyTorch implementation: https://pytorch.org/tutorials/prototype/python_extension_autoload.html
+    """
+    from importlib.metadata import entry_points
+
+    if sys.version_info < (3, 10):
+        extensions = entry_points().get("bitsandbytes.backends", [])
+    else:
+        extensions = entry_points(group="bitsandbytes.backends")
+
+    for ext in extensions:
+        try:
+            entry = ext.load()
+            entry()
+        except Exception as e:
+            raise RuntimeError(f"bitsandbytes: failed to load backend {ext.name}: {e}") from e
+
+
+_import_backends()
+
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.45.5.dev0"
+__version__ = "0.46.0.dev0"

bitsandbytes/_ops.py

Lines changed: 33 additions & 7 deletions
@@ -15,11 +15,37 @@
 register_fake = torch.library.impl_abstract
 register_kernel = torch.library.impl
 
+# Int8 mixed precision matmul + dequant + bias
+torch.library.define(
+    "bitsandbytes::int8_mixed_scaled_mm",
+    "(Tensor A, Tensor CA, Tensor CB, Tensor SCA, Tensor SCB, Tensor? outlier_cols=None, Tensor? bias=None) -> (Tensor, Tensor?)",
+)
+
+
+@register_fake("bitsandbytes::int8_mixed_scaled_mm")
+def _(
+    A: torch.Tensor,
+    CA: torch.Tensor,
+    CB: torch.Tensor,
+    SCA: torch.Tensor,
+    SCB: torch.Tensor,
+    outlier_cols: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    shapeC = (*CA.shape[:-1], CB.shape[0])
+
+    out = torch.empty(shapeC, device=A.device, dtype=A.dtype)
+
+    outlier_cols = torch.library.get_ctx().new_dynamic_size()
+    subA = A.new_empty(outlier_cols, dtype=torch.int64)
+
+    return out, subA
+
 
 # Higher level op: int8 matmul + dequant + bias
 torch.library.define(
     "bitsandbytes::int8_scaled_mm",
-    "(Tensor A, Tensor B, Tensor row_stats, Tensor col_stats, Tensor? bias=None, ScalarType dtype=float16) -> Tensor",
+    "(Tensor A, Tensor B, Tensor row_stats, Tensor col_stats, Tensor? bias=None, ScalarType? dtype=None) -> Tensor",
 )
 
 
@@ -30,10 +56,10 @@ def _(
     row_stats: torch.Tensor,
     col_stats: torch.Tensor,
     bias: Optional[torch.Tensor] = None,
-    dtype=torch.float16,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
     shapeC = (*A.shape[:-1], B.shape[0])
-    return torch.empty(shapeC, device=A.device, dtype=dtype)
+    return torch.empty(shapeC, device=A.device, dtype=dtype or torch.float16)
 
 
 torch.library.define(
@@ -98,15 +124,15 @@ def _(A: torch.Tensor, stats: torch.Tensor) -> torch.Tensor:
 
 
 # Default PyTorch-native implementation
-@register_kernel("bitsandbytes::int8_vectorwise_dequant", None)
+@register_kernel("bitsandbytes::int8_vectorwise_dequant", "default")
 def _(A: torch.Tensor, stats: torch.Tensor):
     # To dequantize we divide by 127, or multiply by the reciprocal.
     return A * stats.view(-1, 1) * 7.874015718698502e-3
 
 
 torch.library.define(
     "bitsandbytes::int8_mm_dequant",
-    "(Tensor A, Tensor row_stats, Tensor col_stats, ScalarType dtype=float16, Tensor? bias=None) -> Tensor",
+    "(Tensor A, Tensor row_stats, Tensor col_stats, ScalarType? dtype=None, Tensor? bias=None) -> Tensor",
 )
 
 
@@ -115,11 +141,11 @@ def _(
     A: torch.Tensor,
     row_stats: torch.Tensor,
     col_stats: torch.Tensor,
-    dtype=torch.float16,
+    dtype: Optional[torch.dtype] = None,
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     torch._check(A.dtype == torch.int32, lambda: "A must be int32")
-    return torch.empty_like(A, dtype=dtype)
+    return torch.empty_like(A, dtype=dtype or torch.float16)
 
 
 torch.library.define(
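For the PyTorch-native int8_vectorwise_dequant kernel above, here is a small self-contained check (not from the diff) that multiplying by the hard-coded reciprocal 7.874015718698502e-3 matches dividing by 127 to within floating-point tolerance.

import torch

A = torch.randint(-127, 128, (4, 8), dtype=torch.int8)
stats = torch.rand(4, dtype=torch.float32) * 12.0  # stand-in per-row absmax stats

by_division = A.float() * stats.view(-1, 1) / 127.0
by_reciprocal = A * stats.view(-1, 1) * 7.874015718698502e-3  # same math as the kernel

print(torch.allclose(by_division, by_reciprocal))  # True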

bitsandbytes/autograd/_functions.py

Lines changed: 17 additions & 26 deletions
@@ -210,37 +210,28 @@ def forward(
         # 2. Quantize B
         state.CB, state.SCB, _ = F.int8_vectorwise_quant(B.to(torch.float16))
 
-        # Handle sparse decomposition. In some instances, we may have not found any
-        # outlier columns at all. In that case, we'll skip this part completely.
-        if state.threshold > 0.0 and outlier_cols is not None and outlier_cols.numel():
+        # Handle sparse decomposition
+        if state.threshold > 0.0:
             state.idx = outlier_cols
 
-            # Zero out the outliers in the transposed 8bit inputs.
-            if CAt is not None:
-                CAt[:, state.idx] = 0
-
-            # Extract the input outliers in original precision
-            subA = A[:, state.idx].contiguous()
+            # Mixed Int8 Matmul + Dequant + Bias
+            output, subA = torch.ops.bitsandbytes.int8_mixed_scaled_mm(
+                A,
+                CA,
+                state.CB,
+                SCA,
+                state.SCB,
+                outlier_cols,
+                bias,
+            )
 
-            # Extract the corresponding weights
-            if state.has_fp16_weights:
-                state.subB = B[:, state.idx].t()
-            else:
-                # To dequantize our weights associated with the input outliers,
-                # we want to divide by 127. It's however more performant to multiply
-                # by the reciprocal.
-                outliers = state.CB[:, state.idx]
-                state.subB = F.int8_vectorwise_dequant(outliers, state.SCB).to(A.dtype).t()
         else:
+            # Int8 Matmul + Dequant + Bias
+            output = torch.ops.bitsandbytes.int8_scaled_mm.default(
+                CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype
+            )
             subA = None
 
-        # 3. Int8 Matmul + Dequant + Bias
-        output = torch.ops.bitsandbytes.int8_scaled_mm.default(CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype)
-
-        # 4. Mixed-precision decomposition matmul
-        if subA is not None and state.subB is not None:
-            output = output.addmm(subA, state.subB)
-
         # 5. Save state
         ctx.state = state
 
@@ -293,7 +284,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
                 dtype=torch.float16,
             )
 
-        if state.threshold > 0.0 and subA is not None:
+        if state.threshold > 0.0 and subA is not None and subA.numel() > 0:
             grad_B[:, idx] += torch.matmul(grad_output.t(), subA)
 
         if req_gradA:
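The rewritten branch above is driven entirely by state.threshold: a non-zero threshold now routes through the fused int8_mixed_scaled_mm op, while a zero threshold keeps the plain int8_scaled_mm path. The following usage sketch is not part of this commit; the shapes and threshold value are illustrative, and the Linear8bitLt arguments follow the existing bitsandbytes API.

import torch
import bitsandbytes as bnb

# An int8 linear layer with outlier decomposition enabled (threshold > 0),
# so its forward pass takes the int8_mixed_scaled_mm branch shown above.
fp16_linear = torch.nn.Linear(4096, 4096, bias=True, dtype=torch.float16)

int8_linear = bnb.nn.Linear8bitLt(4096, 4096, bias=True, has_fp16_weights=False, threshold=6.0)
int8_linear.load_state_dict(fp16_linear.state_dict())
int8_linear = int8_linear.cuda()  # weights are quantized to int8 on the move to GPU

x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")
y = int8_linear(x)  # dispatches into MatMul8bitLt.forward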

bitsandbytes/backends/cpu/ops.py

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ def _(
     A: torch.Tensor,
     row_stats: torch.Tensor,
     col_stats: torch.Tensor,
-    dtype=torch.float16,
+    dtype: Optional[torch.dtype] = None,
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
@@ -44,7 +44,7 @@ def _(
     if bias is not None:
         out += bias
 
-    return out.to(dtype)
+    return out.to(dtype or torch.float16)
 
 
 @register_kernel("bitsandbytes::quantize_blockwise", "cpu")
