Skip to content

Commit ca63206

Browse files
Merge branch 'main' into main
2 parents e9f0af3 + fad47f2 commit ca63206

File tree

12 files changed

+158
-50
lines changed

12 files changed

+158
-50
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.bat text eol=crlf

.github/scripts/build-rocm.sh

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,20 @@ declare build_os
44
declare rocm_version
55

66
set -xeuo pipefail
7-
bnb_rocm_arch="gfx90a;gfx942;gfx1100"
7+
bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
8+
9+
# ROCm 6.4+ - Add gfx1200/gfx1201. Note we assume >=6.4.1.
10+
[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
11+
12+
# ROCm 7.0+ - Add gfx950
13+
[[ "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
14+
815
if [ "${build_os:0:6}" == ubuntu ]; then
9-
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
10-
echo "Using image $image"
11-
docker run --rm --platform "linux/$build_arch" -i \
12-
-w /src -v "$PWD:/src" "$image" sh -c \
13-
"apt-get update \
16+
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
17+
echo "Using image $image"
18+
docker run --rm --platform "linux/$build_arch" -i \
19+
-w /src -v "$PWD:/src" "$image" sh -c \
20+
"apt-get update \
1421
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
1522
&& cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
1623
&& cmake --build ."

.github/scripts/build-xpu-windows.bat

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
rem Build the bitsandbytes XPU backend on a Windows GitHub Actions runner.
rem
rem Inputs (environment variables):
rem   RUNNER_TEMP - scratch directory used for the installer's extraction dir and log
rem   build_os    - OS label used to build the output path output\<build_os>\x86_64
rem
rem Exits with code 1 as soon as the download, install, configure, or build step fails.

set INTEL_DLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
set INTEL_DLE_TMP=%RUNNER_TEMP%\intel_dle
set INTEL_DLE_LOG=%RUNNER_TEMP%\intel_dle_log.txt

echo ::group::Intel Deep Learning Essentials Installation
rem --fail makes curl return non-zero on HTTP errors instead of saving the error page
rem as the installer; --location follows redirects.
curl --fail --location -o intel-dle-installer.exe %INTEL_DLE_URL%
if ERRORLEVEL 1 (
    echo Failed to download Intel Deep Learning Essentials
    exit /b 1
)
start /wait "Intel DLE Install" intel-dle-installer.exe -f %INTEL_DLE_TMP% -l %INTEL_DLE_LOG% --silent -a --eula=accept -p=NEED_VS2022_INTEGRATION=0
rem Capture the installer's exit code immediately: the "type" command below
rem overwrites ERRORLEVEL, which would otherwise hide an installation failure.
set INTEL_DLE_RC=%ERRORLEVEL%
rem Always print the installer log so it is visible in the job output.
type %INTEL_DLE_LOG%
if not "%INTEL_DLE_RC%"=="0" (
    echo Failed to install Intel Deep Learning Essentials
    exit /b 1
)
echo ::endgroup::

echo ::group::Build Environment Setup
rem Load the oneAPI environment into this shell, then configure the build with CMake.
call "%ProgramFiles(x86)%\Intel\oneAPI\setvars.bat"
cmake -G Ninja -DCOMPUTE_BACKEND=xpu -DCMAKE_BUILD_TYPE=Release .
if ERRORLEVEL 1 (
    echo Failed to setup environment
    exit /b 1
)
echo ::endgroup::

echo ::group::Building with XPU backend
cmake --build . --config Release
if ERRORLEVEL 1 (
    echo Build failed
    exit /b 1
)
echo ::endgroup::

rem Collect the built DLLs into the per-OS output directory.
set output_dir=output\%build_os%\x86_64
if not exist "%output_dir%" mkdir "%output_dir%"
copy bitsandbytes\*.dll "%output_dir%\" 2>nul

.github/workflows/python-package.yml

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,21 @@ jobs:
110110
build-xpu:
111111
strategy:
112112
matrix:
113-
os: [ubuntu-22.04]
113+
os: [ubuntu-22.04, windows-2025]
114114
runs-on: ${{ matrix.os }}
115115
steps:
116116
- uses: actions/checkout@v4
117-
- name: Build C++
117+
- name: Build C++ (Linux)
118+
if: runner.os == 'Linux'
118119
run: bash .github/scripts/build-xpu.sh
119120
env:
120121
build_os: ${{ matrix.os }}
122+
- name: Build C++ (Windows)
123+
if: runner.os == 'Windows'
124+
run: .github/scripts/build-xpu-windows.bat
125+
shell: cmd
126+
env:
127+
build_os: ${{ matrix.os }}
121128
- name: Upload build artifact
122129
uses: actions/upload-artifact@v4
123130
with:
@@ -130,30 +137,26 @@ jobs:
130137
matrix:
131138
os: [ubuntu-22.04]
132139
arch: [x86_64]
133-
rocm_version:
134-
["6.1.2", "6.2.4", "6.3.4", "6.4.4", "7.0"]
140+
rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2"]
135141
runs-on: ${{ matrix.os }}
136142
steps:
137143
- uses: actions/checkout@v4
138144
- name: Clean up disk space
139145
run: |
146+
echo "Disk space before cleanup:"
147+
df -h
148+
149+
# These are the biggest disk space hogs.
140150
sudo rm -rf \
141-
/usr/share/dotnet \
142-
/opt/ghc \
143-
"/usr/local/share/boost" \
144-
"$AGENT_TOOLSDIRECTORY" \
145-
/opt/hostedtoolcache \
146-
/opt/google/chrome \
147-
/opt/microsoft/msedge \
148-
/opt/microsoft/powershell \
149-
/opt/pipx \
150-
/usr/lib/mono \
151-
/usr/local/julia* \
152-
/usr/local/lib/android \
153-
/usr/local/lib/node_modules \
154-
/usr/local/share/chromium \
155-
/usr/local/share/powershell \
156-
/usr/share/swift
151+
/opt/hostedtoolcache/CodeQL \
152+
/usr/lib/dotnet \
153+
/usr/lib/jvm \
154+
/usr/local/.ghcup \
155+
/usr/local/lib/android \
156+
/usr/share/swift
157+
158+
echo "Disk space after cleanup:"
159+
df -h
157160
- name: Build C++
158161
run: bash .github/scripts/build-rocm.sh
159162
env:

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ repos:
1717
- id: mixed-line-ending
1818
args:
1919
- --fix=lf
20+
exclude: '\.bat$'
2021
- repo: https://github.com/crate-ci/typos
2122
rev: v1.26.0
2223
hooks:

bitsandbytes/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,4 @@ def _import_backends():
7575
"optim.optimizer.MockArgs": False,
7676
}
7777

78-
__version__ = "0.48.2.dev0"
78+
__version__ = "0.48.3.dev0"

bitsandbytes/backends/cuda/ops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def _(
326326
get_ptr(absmax),
327327
get_ptr(out),
328328
ct.c_int32(blocksize),
329-
ct.c_int(n),
329+
ct.c_int32(n),
330330
)
331331

332332
if A.dtype == torch.bfloat16:
@@ -403,7 +403,7 @@ def _dequantize_4bit_impl(
403403
get_ptr(absmax),
404404
get_ptr(out),
405405
ct.c_int(blocksize),
406-
ct.c_int(out.numel()),
406+
ct.c_int32(out.numel()),
407407
_get_tensor_stream(A),
408408
)
409409

bitsandbytes/nn/modules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,7 @@ def to(self, *args, **kwargs):
697697
if is_quantized:
698698
new_param.CB = new_param.data
699699

700-
if self.SCB is not None and device is not None:
700+
if device is not None and self.SCB is not None and self.SCB.device.type != "meta":
701701
new_param.SCB = self.SCB.to(device)
702702

703703
return new_param

csrc/kernels.cu

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -328,14 +328,16 @@ __global__ void kQuantizeBlockwise(
328328
float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
329329
const int rand_offset, const int n
330330
) {
331-
const int n_full = gridDim.x * BLOCK_SIZE;
331+
// This can overflow, so we clamp to INT32_MAX. We won't have more elements than this.
332+
const int n_full = min(gridDim.x * BLOCK_SIZE, INT32_MAX);
333+
334+
const int base_idx = blockIdx.x * BLOCK_SIZE;
332335
int valid_items = 0;
333-
const int base_idx = (blockIdx.x * BLOCK_SIZE);
334336

335337
T vals[NUM_PER_TH];
336338
float rand_vals[NUM_PER_TH];
337339
unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH];
338-
// float local_abs_max = -FLT_MAX;
340+
339341
float local_abs_max = 0.0f;
340342
int local_rand_idx = 0;
341343

@@ -358,8 +360,8 @@ __global__ void kQuantizeBlockwise(
358360
for (int i = threadIdx.x; i < 256; i += blockDim.x)
359361
smem_code[i] = code[i];
360362

361-
for (int i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
362-
valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
363+
for (int64_t i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
364+
valid_items = min(BLOCK_SIZE, static_cast<int>(n - i));
363365
local_abs_max = -FLT_MAX;
364366

365367
__syncthreads();
@@ -442,7 +444,8 @@ __global__ void
442444

443445
for (int i = base_idx; i < n_load; i += gridDim.x * TILE_SIZE) {
444446
if (DATA_TYPE > 0) {
445-
valid_items_load = min(TILE_SIZE, (n + 1) / 2 - i);
447+
// Cast n to int64_t to avoid overflow for large n
448+
valid_items_load = min(TILE_SIZE, static_cast<int>((static_cast<int64_t>(n) + 1) / 2) - i);
446449
valid_items_store = min(TILE_SIZE * 2, n - i * 2);
447450
} else {
448451
valid_items_load = min(TILE_SIZE, n - i);

csrc/ops.cu

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,17 @@ template <typename T, int DATA_TYPE>
6161
void dequantizeBlockwise(
6262
float* code, unsigned char* A, float* absmax, T* out, int blocksize, const int n, cudaStream_t stream
6363
) {
64-
// printf("stream==%d\n",stream);
65-
int num_blocks = n / blocksize;
66-
num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
67-
int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
64+
constexpr int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
65+
66+
// Upcast to int64 to avoid overflow for large n
67+
int grid_blocks = ((int64_t)n + tile_size - 1) / tile_size;
68+
6869
if (DATA_TYPE > 0)
6970
kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>
70-
<<<(n + tile_size - 1) / tile_size, 64, 0, stream>>>(code, A, absmax, out, blocksize / 2, n);
71+
<<<grid_blocks, 64, 0, stream>>>(code, A, absmax, out, blocksize / 2, n);
7172
else
7273
kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>
73-
<<<(n + tile_size - 1) / tile_size, 64, 0, stream>>>(code, A, absmax, out, blocksize, n);
74+
<<<grid_blocks, 64, 0, stream>>>(code, A, absmax, out, blocksize, n);
7475

7576
CUDA_CHECK_RETURN(cudaPeekAtLastError());
7677
}

0 commit comments

Comments
 (0)