Skip to content

Commit cd026c3

Browse files
committed
Merge remote-tracking branch 'upstream/multi-backend-refactor' into enable_6.2_packaging
2 parents 7e787da + 45b7d14 commit cd026c3

File tree

8 files changed

+94
-30
lines changed

8 files changed

+94
-30
lines changed

.github/workflows/upload_pr_documentation.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ on:
66
types:
77
- completed
88

9+
permissions:
10+
contents: read
11+
pull-requests: write # Allows posting comments on pull requests
12+
913
jobs:
1014
build:
1115
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main

bitsandbytes/backends/cpu_xpu_common.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -370,25 +370,6 @@ def quantize_4bit_impl(
370370
quant_type=quant_type,
371371
)
372372

373-
if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and input_shape[1] % blocksize == 0 and quant_type == "nf4":
374-
# lowp_mode: lowest precision for computation
375-
lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16
376-
state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
377-
out.reshape([input_shape[0], input_shape[1] // 2]),
378-
ipex_cpu.quantization.WoqWeightDtype.NF4,
379-
input_shape, # weight shape
380-
absmax.view(input_shape[0], input_shape[1] // blocksize), # scales
381-
None, # zero_points
382-
None, # bias
383-
None, # g_idx
384-
None, # batch_size
385-
blocksize,
386-
int(lowp_mode),
387-
-1, # act_quant_mode. -1 means don't quant activation
388-
)
389-
state.absmax = torch.Tensor()
390-
return torch.empty([1, 0], dtype=torch.uint8), state
391-
392373
return out.unsqueeze(0), state
393374

394375

bitsandbytes/cextension.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,6 @@ def get_native_library() -> BNBNativeLibrary:
106106
if hasattr(dll, "get_context"): # only a CUDA-built library exposes this
107107
return CudaBNBNativeLibrary(dll)
108108

109-
logger.warning(
110-
"The installed version of bitsandbytes was compiled without GPU support. "
111-
"8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.",
112-
)
113109
return BNBNativeLibrary(dll)
114110

115111

bitsandbytes/nn/modules.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
2020
LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
2121
OutlierTracer,
22+
enable_ipex_fusion,
2223
)
2324

2425
T = TypeVar("T", bound="torch.nn.Module")
@@ -444,17 +445,35 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
444445
save weight and bias,
445446
then fill state_dict with components of quant_state
446447
"""
448+
if (
449+
getattr(self.weight, "quant_state", None) is not None
450+
and getattr(self.weight.quant_state, "op_context", None) is not None
451+
):
452+
context = self.weight.quant_state.op_context
453+
self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
454+
447455
super()._save_to_state_dict(destination, prefix, keep_vars) # saving weight and bias
448456

449457
if getattr(self.weight, "quant_state", None) is not None:
458+
if (
459+
self.weight.quant_state.absmax.shape.numel() == 0
460+
and getattr(self.weight.quant_state, "op_context", None) is not None
461+
):
462+
self.weight.quant_state.absmax = context.get_scales().reshape(-1)
463+
delattr(self.weight.quant_state, "op_context")
450464
for k, v in self.weight.quant_state.as_dict(packed=True).items():
451465
destination[prefix + "weight." + k] = v if keep_vars else v.detach()
452-
if getattr(self.weight.quant_state, "op_context", None) is not None:
453-
context = self.weight.quant_state.op_context
454-
destination[prefix + "weight." + "absmax"] = context.get_scales().reshape(-1)
455-
self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
456466

457467
def forward(self, x: torch.Tensor):
468+
# Check if ipex fusion can be used
469+
if (
470+
x.device.type == "cpu"
471+
and not hasattr(self.weight.quant_state, "op_context")
472+
and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
473+
and self.weight.quant_state.quant_type == "nf4"
474+
):
475+
enable_ipex_fusion(self.weight, self.weight.quant_state)
476+
458477
# weights are cast automatically as Int8Params, but the bias has to be cast manually
459478
if self.bias is not None and self.bias.dtype != x.dtype:
460479
self.bias.data = self.bias.data.to(x.dtype)

bitsandbytes/utils.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,30 @@ def unpack_tensor_to_dict(tensor_data):
200200
return unpacked_dict
201201

202202

203+
def enable_ipex_fusion(weight, quant_state):
204+
from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
205+
206+
if _ipex_cpu_version_prereq(2, 3):
207+
import intel_extension_for_pytorch as ipex
208+
209+
lowp_mode = ipex.quantization.WoqLowpMode.BF16
210+
quant_state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
211+
weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
212+
ipex.quantization.WoqWeightDtype.NF4,
213+
quant_state.shape, # weight shape
214+
quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize), # scales
215+
None, # zero_points
216+
None, # bias
217+
None, # g_idx
218+
None, # batch_size
219+
quant_state.blocksize,
220+
int(lowp_mode),
221+
-1, # act_quant_mode. -1 means don't quant activation
222+
)
223+
quant_state.absmax = torch.Tensor()
224+
weight.data = torch.empty([1, 0], dtype=torch.uint8)
225+
226+
203227
class QuantState:
204228
"""container for quantization state components to work with Params4bit and similar classes"""
205229

docs/source/_toctree.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
title: 8-bit optimizers
1313
- local: algorithms
1414
title: Algorithms
15+
- local: non_cuda_backends
16+
title: Non-CUDA compute backends
1517
- local: fsdp_qlora
1618
title: FSDP-QLoRA
1719
- local: integrations

docs/source/installation.mdx

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,23 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
134134

135135
3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
136136

137-
## Multi-backend preview release compilation[[multi-backend]]
137+
## Multi-backend[[multi-backend]]
138+
139+
> [!TIP]
140+
> This functionality is currently in preview and therefore not yet production-ready!
138141
139142
Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
140143

144+
### Pip install the pre-built wheel (recommended for most)
145+
146+
WIP (will be added in the coming days)
147+
148+
### Compilation
149+
141150
<hfoptions id="backend">
142151
<hfoption id="AMD ROCm">
143152

144-
### AMD GPU
153+
#### AMD GPU
145154

146155
bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
147156

@@ -179,7 +188,7 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
179188
</hfoption>
180189
<hfoption id="Intel CPU + GPU">
181190

182-
### Intel CPU
191+
#### Intel CPU
183192

184193
> [!TIP]
185194
> Intel CPU backend only supports building from source; for now, please follow the instructions below.
@@ -200,6 +209,8 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
200209
</hfoption>
201210
<hfoption id="Apple Silicon (MPS)">
202211

212+
#### Apple Silicon
213+
203214
WIP
204215

205216
</hfoption>

docs/source/non_cuda_backends.mdx

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Multi-backend support (non-CUDA backends)
2+
3+
As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
4+
5+
At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.
6+
7+
Please refer to the [installation instructions](./installation#multi-backend) for details on installing the backend you intend to test (and hopefully provide feedback on).
8+
9+
> [!TIP]
10+
> Apple Silicon support is planned for Q4 2024. We are actively seeking contributors to help implement this, develop a concrete plan, and create a detailed list of requirements. Due to limited resources, we rely on community contributions for this implementation effort. To get involved, please share your thoughts in [this GitHub discussion](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and tag `@Titus-von-Koeller` and `@matthewdouglas`. Thank you!
11+
12+
## Alpha Release
13+
14+
As we are currently in the alpha testing phase, bugs are expected, and performance might not meet expectations. However, this is exactly what we want to discover from **your** perspective as the end user!
15+
16+
Please share and discuss your feedback with us here:
17+
18+
- [GitHub Discussion: Multi-backend refactor: Alpha release ( AMD ROCm ONLY )](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
19+
- [GitHub Discussion: Multi-backend refactor: Alpha release ( Intel ONLY )](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
20+
21+
Thank you for your support!
22+
23+
## Benchmarks
24+
25+
### Intel
26+
27+
### AMD

0 commit comments

Comments
 (0)