Skip to content

Commit 61c2066

Browse files
authored
Merge branch 'main' into docs/quickstart-update
2 parents 3ee0e76 + 221b4b4 commit 61c2066

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+1113
-817
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.bat text eol=crlf

.github/scripts/build-rocm.sh

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,21 @@ declare build_os
44
declare rocm_version
55

66
set -xeuo pipefail
7-
bnb_rocm_arch="gfx90a;gfx942;gfx1100"
7+
bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
8+
9+
# ROCm 6.4+ - Add gfx1200/gfx1201. Note we assume >=6.4.1.
10+
[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
11+
12+
# ROCm 7.0+ - Add gfx950
13+
[[ "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
14+
815
if [ "${build_os:0:6}" == ubuntu ]; then
9-
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
10-
echo "Using image $image"
11-
docker run --rm --platform "linux/$build_arch" -i \
12-
-w /src -v "$PWD:/src" "$image" sh -c \
13-
"apt-get update \
14-
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
16+
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
17+
echo "Using image $image"
18+
docker run --rm --platform "linux/$build_arch" -i \
19+
-w /src -v "$PWD:/src" "$image" sh -c \
20+
"apt-get update \
21+
&& pip install cmake==3.31.6 \
1522
&& cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
1623
&& cmake --build ."
1724
fi
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
REM Build the bitsandbytes XPU backend on a Windows CI runner.
REM
REM Steps:
REM   1. Download and silently install Intel Deep Learning Essentials.
REM   2. Load the oneAPI environment and configure the project with CMake/Ninja.
REM   3. Build in Release mode.
REM   4. Copy the produced DLLs into output\%build_os%\x86_64.
REM
REM Environment variables read:
REM   RUNNER_TEMP - scratch directory for the installer payload and its log.
REM   build_os    - used as a path component of the output directory.
REM
REM The script exits with code 1 as soon as download, install, configure,
REM or build fails.

set INTEL_DLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set INTEL_DLE_TMP=%RUNNER_TEMP%\intel_dle
set INTEL_DLE_LOG=%RUNNER_TEMP%\intel_dle_log.txt

echo ::group::Intel Deep Learning Essentials Installation
REM --fail makes curl return a non-zero exit code on HTTP errors instead of
REM saving the server's error page as the installer executable.
curl --fail -o intel-dle-installer.exe %INTEL_DLE_URL%
if ERRORLEVEL 1 (
    echo Failed to download Intel Deep Learning Essentials
    exit /b 1
)
start /wait "Intel DLE Install" intel-dle-installer.exe -f %INTEL_DLE_TMP% -l %INTEL_DLE_LOG% --silent -a --eula=accept -p=NEED_VS2022_INTEGRATION=0
REM Capture the installer's exit code immediately: the "type" command below
REM sets ERRORLEVEL itself, which would otherwise hide an installer failure.
set INTEL_DLE_EXIT=%ERRORLEVEL%
REM Always print the installer log, even on failure, so it shows up in the CI output.
type %INTEL_DLE_LOG%
if not "%INTEL_DLE_EXIT%"=="0" (
    echo Failed to install Intel Deep Learning Essentials
    exit /b 1
)
echo ::endgroup::

echo ::group::Build Environment Setup
REM setvars.bat populates the oneAPI compiler/library environment for this shell.
call "%ProgramFiles(x86)%\Intel\oneAPI\setvars.bat"
cmake -G Ninja -DCOMPUTE_BACKEND=xpu -DCMAKE_BUILD_TYPE=Release .
REM This check covers the CMake configure step directly above.
if ERRORLEVEL 1 (
    echo Failed to setup environment
    exit /b 1
)
echo ::endgroup::

echo ::group::Building with XPU backend
cmake --build . --config Release
if ERRORLEVEL 1 (
    echo Build failed
    exit /b 1
)
echo ::endgroup::

set output_dir=output\%build_os%\x86_64
if not exist "%output_dir%" mkdir "%output_dir%"
REM Best-effort copy: error output is discarded, so a missing DLL does not fail the script here.
copy bitsandbytes\*.dll "%output_dir%\" 2>nul

.github/workflows/python-package.yml

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,21 @@ jobs:
110110
build-xpu:
111111
strategy:
112112
matrix:
113-
os: [ubuntu-22.04]
113+
os: [ubuntu-22.04, windows-2025]
114114
runs-on: ${{ matrix.os }}
115115
steps:
116116
- uses: actions/checkout@v4
117-
- name: Build C++
117+
- name: Build C++ (Linux)
118+
if: runner.os == 'Linux'
118119
run: bash .github/scripts/build-xpu.sh
119120
env:
120121
build_os: ${{ matrix.os }}
122+
- name: Build C++ (Windows)
123+
if: runner.os == 'Windows'
124+
run: .github/scripts/build-xpu-windows.bat
125+
shell: cmd
126+
env:
127+
build_os: ${{ matrix.os }}
121128
- name: Upload build artifact
122129
uses: actions/upload-artifact@v4
123130
with:
@@ -130,30 +137,26 @@ jobs:
130137
matrix:
131138
os: [ubuntu-22.04]
132139
arch: [x86_64]
133-
rocm_version:
134-
["6.1.2", "6.2.4", "6.3.4", "6.4.4", "7.0"]
140+
rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1"]
135141
runs-on: ${{ matrix.os }}
136142
steps:
137143
- uses: actions/checkout@v4
138144
- name: Clean up disk space
139145
run: |
146+
echo "Disk space before cleanup:"
147+
df -h
148+
149+
# These are the biggest disk space hogs.
140150
sudo rm -rf \
141-
/usr/share/dotnet \
142-
/opt/ghc \
143-
"/usr/local/share/boost" \
144-
"$AGENT_TOOLSDIRECTORY" \
145-
/opt/hostedtoolcache \
146-
/opt/google/chrome \
147-
/opt/microsoft/msedge \
148-
/opt/microsoft/powershell \
149-
/opt/pipx \
150-
/usr/lib/mono \
151-
/usr/local/julia* \
152-
/usr/local/lib/android \
153-
/usr/local/lib/node_modules \
154-
/usr/local/share/chromium \
155-
/usr/local/share/powershell \
156-
/usr/share/swift
151+
/opt/hostedtoolcache/CodeQL \
152+
/usr/lib/dotnet \
153+
/usr/lib/jvm \
154+
/usr/local/.ghcup \
155+
/usr/local/lib/android \
156+
/usr/share/swift
157+
158+
echo "Disk space after cleanup:"
159+
df -h
157160
- name: Build C++
158161
run: bash .github/scripts/build-rocm.sh
159162
env:
@@ -168,6 +171,9 @@ jobs:
168171
retention-days: 7
169172

170173
build-wheels:
174+
env:
175+
# Skip rebuilding the CPU library when building the wheels.
176+
BNB_SKIP_CMAKE: 1
171177
needs:
172178
- build-cpu
173179
- build-cuda

.github/workflows/tests.yml

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ concurrency:
1010
group: ${{ github.workflow }}-${{ github.ref }}
1111
cancel-in-progress: true
1212

13+
env:
14+
# Skip rebuilding the CPU library when installing the wheels.
15+
# We build the libraries in separate jobs and upload as artifacts.
16+
BNB_SKIP_CMAKE: 1
17+
1318
jobs:
1419

1520
build-cpu:
@@ -49,8 +54,7 @@ jobs:
4954
build-cuda:
5055
strategy:
5156
matrix:
52-
# TODO: Add 13.0.1 when we have runners with new enough drivers.
53-
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
57+
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.1"]
5458
os: [ubuntu-22.04, ubuntu-22.04-arm]
5559
include:
5660
- os: ubuntu-22.04
@@ -103,7 +107,7 @@ jobs:
103107
matrix:
104108
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
105109
# Test with the oldest supported torch version and the two newest stable/RC releases.
106-
torch_version: ["2.3.1", "2.7.1", "2.8.0"]
110+
torch_version: ["2.3.1", "2.8.0", "2.9.0"]
107111
include:
108112
- os: ubuntu-22.04
109113
arch: x86_64
@@ -146,7 +150,7 @@ jobs:
146150
- name: Install dependencies
147151
run: |
148152
pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
149-
pip install -e ".[test]"
153+
pip install -e ".[test]" -v
150154
pip install pytest-cov
151155
152156
# We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
@@ -188,7 +192,7 @@ jobs:
188192
- name: Install dependencies
189193
run: |
190194
pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
191-
pip install -e ".[test]"
195+
pip install -e ".[test]" -v
192196
pip install pytest-cov
193197
194198
- name: Show installed packages
@@ -263,7 +267,7 @@ jobs:
263267

264268
- name: Install dependencies
265269
run: |
266-
pip install -e ".[test]"
270+
pip install -e ".[test]" -v
267271
pip install pytest-cov
268272
269273
- name: Show installed packages
@@ -321,7 +325,7 @@ jobs:
321325

322326
- name: Install dependencies
323327
run: |
324-
pip install -e ".[test]"
328+
pip install -e ".[test]" -v
325329
pip install pytest-cov
326330
327331
- name: Show installed packages
@@ -344,26 +348,20 @@ jobs:
344348
os: [ubuntu-22.04, windows-2025]
345349
arch: [x86_64]
346350
gpu: [T4, L40S]
347-
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"]
351+
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.1"]
348352
include:
349353
- cuda_version: "11.8.0"
350354
torch_version: "2.3.1"
351355
pypi_index: "https://download.pytorch.org/whl/cu118"
352356
- cuda_version: "12.6.3"
353-
torch_version: "2.6.0"
357+
torch_version: "2.7.1"
354358
pypi_index: "https://download.pytorch.org/whl/cu126"
355-
- cuda_version: "12.9.1"
356-
torch_version: "2.8.0"
357-
pypi_index: "https://download.pytorch.org/whl/cu129"
358359
- cuda_version: "12.8.1"
359-
torch_version: "2.9.0"
360-
pypi_index: "https://download.pytorch.org/whl/test/cu128"
361-
362-
# Note: Currently our runners do not have new enough drivers for CUDA 13.
363-
# Add this when supported.
364-
# - cuda_version: "13.0.1"
365-
# torch_version: "2.9.0"
366-
# pypi_index: "https://download.pytorch.org/whl/test/cu130"
360+
torch_version: "2.8.0"
361+
pypi_index: "https://download.pytorch.org/whl/cu128"
362+
- cuda_version: "13.0.1"
363+
torch_version: "2.9.1"
364+
pypi_index: "https://download.pytorch.org/whl/cu130"
367365

368366

369367
# Linux L40S runners
@@ -438,7 +436,7 @@ jobs:
438436
- name: Install dependencies
439437
run: |
440438
pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
441-
pip install -e ".[test]"
439+
pip install -e ".[test]" -v
442440
pip install pytest-cov
443441
- name: Show installed packages
444442
run: pip list

.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
repos:
22
- repo: https://github.com/astral-sh/ruff-pre-commit
3-
rev: v0.11.2
3+
rev: v0.14.3
44
hooks:
55
- id: ruff
66
args:
@@ -17,6 +17,7 @@ repos:
1717
- id: mixed-line-ending
1818
args:
1919
- --fix=lf
20+
exclude: '\.bat$'
2021
- repo: https://github.com/crate-ci/typos
2122
rev: v1.26.0
2223
hooks:

CMakeLists.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,17 @@ else()
7878
set(BUILD_HIP OFF)
7979
set(BUILD_MPS OFF)
8080
set(BUILD_XPU OFF)
81+
set(BUILD_CPU ON)
8182
endif()
8283

8384

85+
if (BUILD_CPU)
86+
set(CMAKE_CXX_STANDARD 17)
87+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
88+
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" HOST_ARCH)
89+
find_package(OpenMP)
90+
endif()
91+
8492
if(BUILD_CUDA)
8593
# NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
8694
# Workaround: use --allow-unsupported-compiler
@@ -262,6 +270,34 @@ add_library(bitsandbytes SHARED ${SRC_FILES})
262270
target_compile_features(bitsandbytes PUBLIC cxx_std_17)
263271
target_include_directories(bitsandbytes PUBLIC csrc include)
264272

273+
if (BUILD_CPU)
274+
if (OpenMP_CXX_FOUND)
275+
target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
276+
add_definitions(-DHAS_OPENMP)
277+
endif()
278+
279+
if ((HOST_ARCH MATCHES "x86_64|amd64") AND (NOT MSVC))
280+
include(CheckCXXCompilerFlag)
281+
check_cxx_compiler_flag(-mavx512f HAS_AVX512F_FLAG)
282+
check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
283+
if (HAS_AVX512F_FLAG)
284+
target_compile_options(bitsandbytes PRIVATE -mavx512f)
285+
endif()
286+
if (HAS_AVX512BF16_FLAG)
287+
target_compile_options(bitsandbytes PRIVATE -mavx512bf16)
288+
endif()
289+
target_compile_options(
290+
bitsandbytes PRIVATE
291+
-mprefer-vector-width=256
292+
-mfma
293+
-mavx2
294+
-mlzcnt
295+
-mbmi
296+
-mbmi2
297+
)
298+
endif()
299+
endif()
300+
265301

266302
if(BUILD_CUDA)
267303
target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
1919
## System Requirements
2020
bitsandbytes has the following minimum requirements for all platforms:
2121

22-
* Python 3.9+
22+
* Python 3.10+
2323
* [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
2424
* _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._
2525

benchmarking/matmul_benchmark.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
3535
B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
3636
torch.nn.init.xavier_uniform_(B)
3737

38-
B_fp4, state = F.quantize_fp4(B)
39-
B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
38+
_B_fp4, _state = F.quantize_fp4(B)
39+
_B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)
4040

4141
B_nf4, state_nf4 = F.quantize_nf4(B)
4242
B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
117117
f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
118118
)
119119

120-
CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
121-
CB, SCB, _ = F.int8_vectorwise_quant(B)
120+
CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
121+
CB, _SCB, _ = F.int8_vectorwise_quant(B)
122122
torch.cuda.synchronize()
123123
t0 = time.time()
124124
for i in range(iters):

bitsandbytes/__init__.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,7 @@ def _import_backends():
5454
"""
5555
from importlib.metadata import entry_points
5656

57-
if sys.version_info < (3, 10):
58-
extensions = entry_points().get("bitsandbytes.backends", [])
59-
else:
60-
extensions = entry_points(group="bitsandbytes.backends")
57+
extensions = entry_points(group="bitsandbytes.backends")
6158

6259
for ext in extensions:
6360
try:
@@ -75,4 +72,4 @@ def _import_backends():
7572
"optim.optimizer.MockArgs": False,
7673
}
7774

78-
__version__ = "0.48.2.dev0"
75+
__version__ = "0.49.0.dev0"

0 commit comments

Comments
 (0)