
Commit a60d565
Merge branch 'bitsandbytes-foundation:main' into main
2 parents: 555c6df + c9bce2b


42 files changed: +3485 −667 lines. (Large commit: only part of the diff is shown below.)

.github/scripts/build-cuda.sh
Lines changed: 2 additions & 2 deletions

@@ -14,8 +14,8 @@ elif [ "${build_arch}" = "aarch64" ]; then
     # CUDA 12.8+: Add sm100/sm120
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
 else
-    # By default, target Maxwell through Hopper.
-    build_capability="50;60;70;75;80;86;89;90"
+    # By default, target Pascal through Hopper.
+    build_capability="60;70;75;80;86;89;90"
 
     # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"

.github/workflows/tests.yml
Lines changed: 10 additions & 32 deletions

@@ -102,7 +102,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.2.2", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64

@@ -118,7 +118,7 @@ jobs:
             arch: arm64
       exclude:
         - os: ubuntu-22.04-arm
-          torch_version: "2.2.2"
+          torch_version: "2.3.1"
 
     runs-on: ${{ matrix.runner || matrix.os }}
     env:

@@ -144,13 +144,14 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.8.0' && 'test/cpu') || 'cpu' }}
+          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
           pip install -e ".[test]"
           pip install pytest-cov
 
-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+      # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
+      # See: https://github.com/pytorch/pytorch/issues/131668
       - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
+        if: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
         run: pip install "numpy<2"
 
       - name: Show installed packages

@@ -162,7 +163,7 @@ jobs:
       - name: Run tests
         run: pytest --durations=100
 
-  test-cpu-ipex:
+  test-cpu-intel:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     runs-on: banb-aws-general-8-plus-use1-public-80

@@ -186,7 +187,6 @@ jobs:
      - name: Install dependencies
        run: |
          pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
          pip install -e ".[test]"
          pip install pytest-cov

@@ -196,9 +196,6 @@ jobs:
       - name: Show environment information
         run: python -m torch.utils.collect_env
 
-      - name: IPEX smoke test
-        run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"
-
       - name: Run tests
         run: pytest --durations=100
 

@@ -286,15 +283,6 @@ jobs:
       fail-fast: false
       matrix:
         torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
-        ipex: [false]
-        # ipex: [true, false]
-        # include:
-        #   - torch_version: "2.6.0"
-        #     ipex: true
-        #     ipex_version: "2.6.10+xpu"
-        #   - torch_version: "2.7.1"
-        #     ipex: true
-        #     ipex_version: "2.7.10+xpu"
     runs-on:
       group: bandb-itac-bmsprpvc1550-8-1gpu
     env:

@@ -330,10 +318,6 @@ jobs:
       - name: Install PyTorch
         run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
 
-      - name: Install IPEX
-        if: matrix.ipex == true
-        run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
       - name: Install dependencies
         run: |
           pip install -e ".[test]"

@@ -362,7 +346,7 @@ jobs:
         cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
         include:
           - cuda_version: "11.8.0"
-            torch_version: "2.2.2"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
             torch_version: "2.6.0"

@@ -372,7 +356,7 @@ jobs:
             pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
-            pypi_index: "https://download.pytorch.org/whl/test/cu129"
+            pypi_index: "https://download.pytorch.org/whl/cu129"
 
 
           # Linux L40S runners

@@ -391,7 +375,7 @@ jobs:
             gpu: T4
             runner: CUDA-Windows-x64
             cuda_version: "11.8.0"
-            torch_version: "2.2.0"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - os: windows-2025
             arch: x86_64

@@ -447,12 +431,6 @@ jobs:
           pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
           pip install -e ".[test]"
           pip install pytest-cov
-
-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
-      - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
-        run: pip install "numpy<2"
-
       - name: Show installed packages
         run: pip list
 
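The reworked NumPy step narrows an environment constraint: Windows wheels for torch older than 2.4.1 were built against NumPy 1.x (pytorch/pytorch#131668), so only the Windows + torch 2.3 cell of the matrix still needs the pin. A hypothetical Python mirror of that `if:` expression, useful when reproducing the CI environment locally:

import platform

def needs_numpy_pin(torch_version: str) -> bool:
    # Mirrors: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
    return platform.system() == "Windows" and torch_version.startswith("2.3.")

if needs_numpy_pin("2.3.1"):
    print('pip install "numpy<2"')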

CMakeLists.txt
Lines changed: 29 additions & 2 deletions

@@ -28,11 +28,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
+set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, xpu)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps xpu)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)

@@ -64,10 +65,18 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS ON)
+elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
+    if(APPLE)
+        message(FATAL_ERROR "XPU is not supported on macOS" )
+    endif()
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS OFF)
+    set(BUILD_XPU ON)
 else()
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
+    set(BUILD_XPU OFF)
 endif()
 
 

@@ -222,6 +231,15 @@ elseif(BUILD_MPS)
         COMMENT "Compiling Metal kernels"
         VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+elseif(BUILD_XPU)
+    list(APPEND SRC_FILES ${XPU_FILES})
+    string(APPEND BNB_OUTPUT_NAME "_xpu")
+    add_compile_definitions(BUILD_XPU)
+    set(CMAKE_C_COMPILER icx)
+    set(CMAKE_CXX_COMPILER icpx)
+    if(WIN32)
+        set(CMAKE_CXX_COMPILER icx)
+    endif()
 else()
     string(APPEND BNB_OUTPUT_NAME "_cpu")
     set(GPU_SOURCES)

@@ -290,6 +308,15 @@ if(BUILD_MPS)
     add_dependencies(bitsandbytes metallib)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 endif()
+if(BUILD_XPU)
+    set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
+    set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;")
+
+    set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
+    target_compile_options(bitsandbytes PRIVATE ${SYCL_COMPILE_FLAGS})
+    target_link_options(bitsandbytes PRIVATE ${SYCL_LINK_FLAGS})
+
+endif()
 
 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
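The new branch compiles the library with Intel's icx/icpx SYCL compilers and emits a `_xpu`-suffixed binary alongside the existing backends. A minimal runtime smoke test for such a build might look like the following; it assumes an XPU-enabled PyTorch that exposes the `torch.xpu` namespace (shipped with recent XPU wheels), and is a sketch rather than part of this commit:

import torch

if torch.xpu.is_available():
    x = torch.randn(8, 8, device="xpu")
    # Confirms that device allocation and a simple kernel launch work.
    print(torch.xpu.get_device_name(0), x.sum().item())
else:
    print("No XPU device visible; the CPU build would be used instead.")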

README.md
Lines changed: 5 additions & 5 deletions

@@ -20,7 +20,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 bitsandbytes has the following minimum requirements for all platforms:
 
 * Python 3.9+
-* [PyTorch](https://pytorch.org/get-started/locally/) 2.2+
+* [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
   * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
 
 #### Accelerator support:

@@ -61,7 +61,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   <tr>
     <td></td>
     <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-    <td>SM50+ minimum<br>SM75+ recommended</td>
+    <td>SM60+ minimum<br>SM75+ recommended</td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>

@@ -87,7 +87,7 @@ bitsandbytes has the following minimum requirements for all platforms:
     </td>
     <td>✅</td>
     <td>✅</td>
-    <td>🚧</td>
+    <td>〰️</td>
   </tr>
   <tr>
     <td></td>

@@ -127,7 +127,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   <tr>
     <td></td>
     <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-    <td>SM50+ minimum<br>SM75+ recommended</td>
+    <td>SM60+ minimum<br>SM75+ recommended</td>
     <td>✅</td>
     <td>✅</td>
     <td>✅</td>

@@ -141,7 +141,7 @@ bitsandbytes has the following minimum requirements for all platforms:
     </td>
     <td>✅</td>
     <td>✅</td>
-    <td>🚧</td>
+    <td>〰️</td>
   </tr>
   <tr>
     <td colspan="6">🍎 <strong>macOS 14+</strong></td>

_typos.toml
Lines changed: 7 additions & 0 deletions

@@ -1,4 +1,11 @@
 [files]
+# Skip these files in typo checks
+extend-exclude = [
+    "csrc/xpu_ops.h",
+    "csrc/xpu_ops.cpp",
+    "csrc/xpu_kernels.h",
+    "csrc/xpu_kernels.cpp"
+]
 
 [default]
 extend-ignore-re = [

bitsandbytes/_ops.py
Lines changed: 0 additions & 21 deletions

@@ -4,8 +4,6 @@
 
 import torch
 
-from .cextension import ipex_cpu, ipex_xpu
-
 _IS_TORCH_GTE_24 = False
 
 if hasattr(torch.library, "register_fake"):

@@ -331,25 +329,6 @@ def _(
     torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
 
 
-if ipex_cpu or ipex_xpu:
-    # Register the dequantize_nf4_ipex implementation
-    torch.library.define(
-        "bitsandbytes::dequantize_nf4_ipex",
-        "(Tensor A, Tensor absmax, int blocksize, int[] shape, ScalarType dtype) -> Tensor",
-    )
-
-    @register_fake("bitsandbytes::dequantize_nf4_ipex")
-    def _(
-        A: torch.Tensor,
-        absmax: torch.Tensor,
-        blocksize: int,
-        shape: Sequence[int],
-        dtype: torch.dtype,
-    ) -> torch.Tensor:
-        torch._check_is_size(blocksize)
-        return torch.empty(shape, dtype=dtype, device=A.device)
-
-
 torch.library.define(
     "bitsandbytes::optimizer_update_32bit",
     "(str optimizer_name, Tensor(a0!) g, Tensor(a1!) p, Tensor(a2!) state1, Tensor(a3!)? state2, Tensor(a4!)? unorm_vec, float max_unorm, float param_norm, float beta1, float beta2, float beta3, float alpha, float eps, float weight_decay, int step, float lr, float gnorm_scale, bool skip_zeros=False) -> ()",

bitsandbytes/autograd/_functions.py
Lines changed: 2 additions & 20 deletions

@@ -8,7 +8,6 @@
 from typing_extensions import deprecated
 
 import bitsandbytes.functional as F
-from bitsandbytes.functional import ipex_cpu, ipex_xpu
 
 # The inverse transformation for the colTuring and colAmpere format were contributed by Alex Borzunov:
 # https://github.com/bigscience-workshop/petals/blob/main/src/petals/utils/linear8bitlt_patch.py

@@ -85,11 +84,7 @@ def get_inverse_transform_indices(
     return permuted_tile_indices
 
 
-# torch.compiler.is_compiling() is available only in torch >= 2.3
-if hasattr(torch.compiler, "is_compiling"):
-    _is_compiling = torch.compiler.is_compiling
-else:
-    _is_compiling = torch._dynamo.is_compiling
+_is_compiling = torch.compiler.is_compiling
 
 
 @deprecated(

@@ -320,8 +315,6 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
 
         CB = state.CB.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
         output = torch.nn.functional.linear(A, CB, bias)
-        # to pass the test: tests/test_modules.py::test_linear8bitlt_no_fp16_weights[2.0-xpu]
-        state.idx = False
         ctx.state = state
         ctx.dtype_A = A.dtype
         ctx.grad_shape = A.shape

@@ -426,7 +419,7 @@ def matmul(
     state.threshold = threshold
     # MatMul8bitLt is slower because no fast kernel for quant/dequant 8bit in CPU/XPU
     if state.is_training:
-        if (A.device.type == "cpu" and ipex_cpu) or (A.device.type == "xpu" and ipex_xpu):
+        if A.device.type in ("cpu", "xpu"):
             return MatMul8bitFp.apply(A, B, out, bias, state)
     return MatMul8bitLt.apply(A, B, out, bias, state)
 

@@ -440,17 +433,6 @@ def matmul_4bit(
 ):
     assert quant_state is not None
 
-    if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
-        if getattr(quant_state, "ipex", False):
-            # IPEX CPU will change weight to 4D so don't need transpose
-            B = B.t() if B.dim() == 2 else B
-            out = F.gemv_4bit(A, B, out, state=quant_state)
-            if bias is not None:
-                out += bias
-            return out
-        else:
-            return MatMul4Bit.apply(A, B, out, bias, quant_state)
-
     if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
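The compatibility shim could be deleted because `torch.compiler.is_compiling()` has existed since torch 2.3, the new minimum version. For context, the typical use of this flag is to skip graph-unfriendly work while Dynamo is tracing; a small sketch with an illustrative helper (not from this commit):

import torch

def debug_log(msg: str) -> None:
    # Printing inside a traced region would force a graph break,
    # so only log when running eagerly.
    if not torch.compiler.is_compiling():
        print(msg)

@torch.compile
def double(x: torch.Tensor) -> torch.Tensor:
    debug_log("called double")  # silent during compilation
    return x * 2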
