Merge branch 'main' into main

matthewdouglas · web-flow · commit ca6320671365 · 2025-10-29T18:28:59.000-04:00
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.bat text eol=crlf
diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
@@ -4,13 +4,20 @@ declare build_os
 declare rocm_version
 
 set -xeuo pipefail
-bnb_rocm_arch="gfx90a;gfx942;gfx1100"
+bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
+
+# ROCm 6.4+ - Add gfx1200/gfx1201. Note we assume >=6.4.1.
+[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1200;gfx1201"
+
+# ROCm 7.0+ - Add gfx950
+[[ "${rocm_version}" == 7.*.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
+
 if [ "${build_os:0:6}" == ubuntu ]; then
-	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
-	echo "Using image $image"
-	docker run --rm --platform "linux/$build_arch" -i \
-		-w /src -v "$PWD:/src" "$image" sh -c \
-		"apt-get update \
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
+    echo "Using image $image"
+    docker run --rm --platform "linux/$build_arch" -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
       && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
diff --git a/.github/scripts/build-xpu-windows.bat b/.github/scripts/build-xpu-windows.bat
@@ -0,0 +1,43 @@
+rem Build the bitsandbytes XPU backend on a Windows CI runner.
+rem Reads RUNNER_TEMP, used as scratch space for the installer, and build_os, used as the
+rem output sub-directory name, from the environment of the calling workflow step.
+
+set INTEL_DLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set INTEL_DLE_TMP=%RUNNER_TEMP%\intel_dle
+set INTEL_DLE_LOG=%RUNNER_TEMP%\intel_dle_log.txt
+
+echo ::group::Intel Deep Learning Essentials Installation
+curl -o intel-dle-installer.exe %INTEL_DLE_URL%
+start /wait "Intel DLE Install" intel-dle-installer.exe -f %INTEL_DLE_TMP% -l %INTEL_DLE_LOG% --silent -a --eula=accept -p=NEED_VS2022_INTEGRATION=0
+rem Save the installer exit code right away: the TYPE command below sets ERRORLEVEL itself,
+rem so checking ERRORLEVEL after it would report on printing the log, not on the install.
+set INTEL_DLE_EXIT=%ERRORLEVEL%
+type %INTEL_DLE_LOG%
+if not "%INTEL_DLE_EXIT%"=="0" (
+    echo Failed to install Intel Deep Learning Essentials
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Build Environment Setup
+rem Load the oneAPI environment into this shell, then configure a Ninja release build.
+call "%ProgramFiles(x86)%\Intel\oneAPI\setvars.bat"
+cmake -G Ninja -DCOMPUTE_BACKEND=xpu -DCMAKE_BUILD_TYPE=Release .
+if ERRORLEVEL 1 (
+    echo Failed to setup environment
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Building with XPU backend
+cmake --build . --config Release
+if ERRORLEVEL 1 (
+    echo Build failed
+    exit /b 1
+)
+echo ::endgroup::
+
+rem Copy the built DLLs into the per-OS x86_64 output directory, creating it if needed.
+set output_dir=output\%build_os%\x86_64
+if not exist "%output_dir%" mkdir "%output_dir%"
+copy bitsandbytes\*.dll "%output_dir%\" 2>nul
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -110,14 +110,21 @@ jobs:
   build-xpu:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-22.04, windows-2025]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
-      - name: Build C++
+      - name: Build C++ (Linux)
+        if: runner.os == 'Linux'
         run: bash .github/scripts/build-xpu.sh
         env:
           build_os: ${{ matrix.os }}
+      - name: Build C++ (Windows)
+        if: runner.os == 'Windows'
+        run: .github/scripts/build-xpu-windows.bat
+        shell: cmd
+        env:
+          build_os: ${{ matrix.os }}
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:
@@ -130,30 +137,26 @@ jobs:
       matrix:
         os: [ubuntu-22.04]
         arch: [x86_64]
-        rocm_version:
-          ["6.1.2", "6.2.4", "6.3.4", "6.4.4", "7.0"]
+        rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - name: Clean up disk space
         run: |
+          echo "Disk space before cleanup:"
+          df -h
+
+          # These are the biggest disk space hogs.
           sudo rm -rf \
-              /usr/share/dotnet \
-              /opt/ghc \
-              "/usr/local/share/boost" \
-              "$AGENT_TOOLSDIRECTORY" \
-              /opt/hostedtoolcache \
-              /opt/google/chrome \
-              /opt/microsoft/msedge \
-              /opt/microsoft/powershell \
-              /opt/pipx \
-              /usr/lib/mono \
-              /usr/local/julia* \
-              /usr/local/lib/android \
-              /usr/local/lib/node_modules \
-              /usr/local/share/chromium \
-              /usr/local/share/powershell \
-              /usr/share/swift
+            /opt/hostedtoolcache/CodeQL \
+            /usr/lib/dotnet \
+            /usr/lib/jvm \
+            /usr/local/.ghcup \
+            /usr/local/lib/android \
+            /usr/share/swift
+
+          echo "Disk space after cleanup:"
+          df -h
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,6 +17,7 @@ repos:
       - id: mixed-line-ending
         args:
           - --fix=lf
+        exclude: '\.bat$'
   - repo: https://github.com/crate-ci/typos
     rev: v1.26.0
     hooks:
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
@@ -75,4 +75,4 @@ def _import_backends():
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.48.2.dev0"
+__version__ = "0.48.3.dev0"
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
@@ -326,7 +326,7 @@ def _(
             get_ptr(absmax),
             get_ptr(out),
             ct.c_int32(blocksize),
-            ct.c_int(n),
+            ct.c_int32(n),
         )
 
         if A.dtype == torch.bfloat16:
@@ -403,7 +403,7 @@ def _dequantize_4bit_impl(
             get_ptr(absmax),
             get_ptr(out),
             ct.c_int(blocksize),
-            ct.c_int(out.numel()),
+            ct.c_int32(out.numel()),
             _get_tensor_stream(A),
         )
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
@@ -697,7 +697,7 @@ def to(self, *args, **kwargs):
         if is_quantized:
             new_param.CB = new_param.data
 
-            if self.SCB is not None and device is not None:
+            if device is not None and self.SCB is not None and self.SCB.device.type != "meta":
                 new_param.SCB = self.SCB.to(device)
 
         return new_param
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -328,14 +328,16 @@ __global__ void kQuantizeBlockwise(
     float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
     const int rand_offset, const int n
 ) {
-    const int n_full = gridDim.x * BLOCK_SIZE;
+    // This can overflow, so we clamp to INT32_MAX. We won't have more elements than this.
+    const int n_full = min(gridDim.x * BLOCK_SIZE, INT32_MAX);
+
+    const int base_idx = blockIdx.x * BLOCK_SIZE;
     int valid_items = 0;
-    const int base_idx = (blockIdx.x * BLOCK_SIZE);
 
     T vals[NUM_PER_TH];
     float rand_vals[NUM_PER_TH];
     unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH];
-    // float local_abs_max = -FLT_MAX;
+
     float local_abs_max = 0.0f;
     int local_rand_idx = 0;
 
@@ -358,8 +360,8 @@ __global__ void kQuantizeBlockwise(
         for (int i = threadIdx.x; i < 256; i += blockDim.x)
             smem_code[i] = code[i];
 
-    for (int i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
-        valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
+    for (int64_t i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
+        valid_items = min(BLOCK_SIZE, static_cast<int>(n - i));
         local_abs_max = -FLT_MAX;
 
         __syncthreads();
@@ -442,7 +444,8 @@ __global__ void
 
     for (int i = base_idx; i < n_load; i += gridDim.x * TILE_SIZE) {
         if (DATA_TYPE > 0) {
-            valid_items_load = min(TILE_SIZE, (n + 1) / 2 - i);
+            // Cast n to int64_t to avoid overflow for large n
+            valid_items_load = min(TILE_SIZE, static_cast<int>((static_cast<int64_t>(n) + 1) / 2) - i);
             valid_items_store = min(TILE_SIZE * 2, n - i * 2);
         } else {
             valid_items_load = min(TILE_SIZE, n - i);
diff --git a/csrc/ops.cu b/csrc/ops.cu
@@ -61,16 +61,17 @@ template <typename T, int DATA_TYPE>
 void dequantizeBlockwise(
     float* code, unsigned char* A, float* absmax, T* out, int blocksize, const int n, cudaStream_t stream
 ) {
-    // printf("stream==%d\n",stream);
-    int num_blocks = n / blocksize;
-    num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
-    int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
+    constexpr int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
+
+    // Upcast to int64 to avoid overflow for large n
+    int grid_blocks = ((int64_t)n + tile_size - 1) / tile_size;
+
     if (DATA_TYPE > 0)
         kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>
-            <<<(n + tile_size - 1) / tile_size, 64, 0, stream>>>(code, A, absmax, out, blocksize / 2, n);
+            <<<grid_blocks, 64, 0, stream>>>(code, A, absmax, out, blocksize / 2, n);
     else
         kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>
-            <<<(n + tile_size - 1) / tile_size, 64, 0, stream>>>(code, A, absmax, out, blocksize, n);
+            <<<grid_blocks, 64, 0, stream>>>(code, A, absmax, out, blocksize, n);
 
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
diff --git a/setup.py b/setup.py
@@ -31,7 +31,7 @@ def run(self):
 
 
 setup(
-    version="0.48.2.dev0",
+    version="0.48.3.dev0",
     packages=find_packages(),
     distclass=BinaryDistribution,
     cmake_source_dir=".",
diff --git a/tests/test_functional.py b/tests/test_functional.py
@@ -151,6 +151,34 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
             assert relerr < 0.012
         assert A2.dtype == dtype
 
+    @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
+    @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No accelerator device")
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+    @pytest.mark.parametrize("blocksize", [256], ids=id_formatter("blocksize"))
+    def test_dynamic_blockwise_quantization_large(self, device, dtype, blocksize):
+        """
+        Smoke-test blockwise quantization and dequantization of a tensor with 2**31 - 1 elements.
+        - On CUDA/XPU/ROCm, int32 indexing in the C++ kernels caps the element count at 2**31 - 1.
+        - CPU is skipped: quantization there has a significantly higher memory overhead.
+        - Only dtypes and element counts are checked; verifying accuracy would need too much memory.
+        """
+        if device not in ("cuda", "xpu"):
+            pytest.skip("This test is only for CUDA and XPU devices due to memory constraints.")
+
+        source = torch.randn(2**31 - 1, device=device, dtype=dtype)
+        numel = source.numel()
+        quantized, quant_state = F.quantize_blockwise(source, blocksize=blocksize)
+        del source  # release the input before the dequantized copy is allocated
+
+        assert quantized is not None
+        assert quantized.dtype == torch.uint8
+        assert quantized.numel() == numel
+
+        restored = F.dequantize_blockwise(quantized, quant_state)
+
+        assert restored.dtype == dtype
+        assert restored.numel() == quantized.numel()
+
     @pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
     @pytest.mark.parametrize("hidden", [128])
     @pytest.mark.parametrize("blocksize", [4096, 16384])
@@ -1118,18 +1146,17 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
         A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
         qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
         A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
+        del qa, SA
+
+        assert A2.dtype == dtype
 
         err = (A1 - A2).abs().float()
+        del A2
+
         relerr = (err / (A1.abs().float() + 1e-8)).mean()
         err = err.mean()
 
-        assert A2.dtype == dtype
-
-        # With larger block sizes, we can expect this to blow up.
-        # At blocksize>=1024, don't even bother looking at relerr.
-        #
-        # Actually, the above is not true anymore after fixing the integer packing bug.
-        # The following values were taken from averaging 1k samples per test configuration after fixing the bug.
+        # The following values were taken from averaging 1k samples per test configuration.
         error_dict = dict()
         error_dict["fp4"] = dict()
         error_dict["nf4"] = dict()
@@ -1213,6 +1240,37 @@ def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
             assert err.item() < 0.11
             assert relerr.item() < 0.28
 
+    @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
+    @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No accelerator device")
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+    @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
+    @pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128], ids=id_formatter("blocksize"))
+    def test_4bit_quant_large(self, device, dtype, quant_type, blocksize):
+        """
+        Smoke-test 4-bit quantization and dequantization of a tensor with 2**31 - 1 elements.
+        - On CUDA/XPU/ROCm, int32 indexing in the C++ kernels caps the element count at 2**31 - 1.
+        - On CUDA, this test requires ~10GiB of memory for fp32.
+        - CPU is skipped: quantization there has a significantly higher memory overhead.
+        - Only dtypes and element counts are checked; verifying accuracy would need too much memory.
+        """
+        if device not in ("cuda", "xpu"):
+            pytest.skip("This test is only for CUDA and XPU devices due to memory constraints.")
+
+        numel = 2**31 - 1
+        source = torch.randn(numel, device=device, dtype=dtype)
+        packed, quant_state = F.quantize_4bit(source, blocksize=blocksize, quant_type=quant_type)
+        del source  # release the input before the dequantized copy is allocated
+
+        assert packed is not None
+        assert packed.dtype == torch.uint8
+        assert packed.numel() == (numel + 1) // 2  # each byte holds 2 quantized values
+
+        # Expand the packed bytes back to one value per original element.
+        restored = F.dequantize_4bit(packed, quant_state)
+
+        assert restored.dtype == dtype
+        assert restored.numel() == numel
+
     # @pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
     @pytest.mark.parametrize("quant_type", ["nf4"])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")

Original file line number	Diff line number	Diff line change
`@@ -75,4 +75,4 @@ def _import_backends():`
`75`	`75`	`"optim.optimizer.MockArgs": False,`
`76`	`76`	`}`
`77`	`77`
`78`		`-__version__ = "0.48.2.dev0"`
	`78`	`+__version__ = "0.48.3.dev0"`