
Commit fe44bb7

Merge branch 'main' into docs/quickstart-update
2 parents a329ba1 + 2adcb7a commit fe44bb7


44 files changed: +3599 −787 lines

.github/scripts/build-cuda.sh

Lines changed: 2 additions & 2 deletions
@@ -14,8 +14,8 @@ elif [ "${build_arch}" = "aarch64" ]; then
     # CUDA 12.8+: Add sm100/sm120
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
 else
-    # By default, target Maxwell through Hopper.
-    build_capability="50;60;70;75;80;86;89;90"
+    # By default, target Pascal through Hopper.
+    build_capability="60;70;75;80;86;89;90"

     # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"

.github/workflows/tests.yml

Lines changed: 10 additions & 32 deletions
@@ -102,7 +102,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         # Test with the oldest supported torch version, the newest two stable/RC.
-        torch_version: ["2.2.2", "2.7.1", "2.8.0"]
+        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -118,7 +118,7 @@ jobs:
             arch: arm64
         exclude:
           - os: ubuntu-22.04-arm
-            torch_version: "2.2.2"
+            torch_version: "2.3.1"

     runs-on: ${{ matrix.runner || matrix.os }}
     env:
@@ -144,13 +144,14 @@ jobs:

       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/${{ (matrix.torch_version == '2.8.0' && 'test/cpu') || 'cpu' }}
+          pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
           pip install -e ".[test]"
           pip install pytest-cov

-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+      # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
+      # See: https://github.com/pytorch/pytorch/issues/131668
       - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
+        if: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
         run: pip install "numpy<2"

       - name: Show installed packages
@@ -162,7 +163,7 @@ jobs:
       - name: Run tests
         run: pytest --durations=100

-  test-cpu-ipex:
+  test-cpu-intel:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     runs-on: banb-aws-general-8-plus-use1-public-80
@@ -186,7 +187,6 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
           pip install -e ".[test]"
           pip install pytest-cov

@@ -196,9 +196,6 @@ jobs:
       - name: Show environment information
         run: python -m torch.utils.collect_env

-      - name: IPEX smoke test
-        run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"
-
       - name: Run tests
         run: pytest --durations=100

@@ -286,15 +283,6 @@ jobs:
       fail-fast: false
       matrix:
         torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
-        ipex: [false]
-        # ipex: [true, false]
-        # include:
-        #   - torch_version: "2.6.0"
-        #     ipex: true
-        #     ipex_version: "2.6.10+xpu"
-        #   - torch_version: "2.7.1"
-        #     ipex: true
-        #     ipex_version: "2.7.10+xpu"
     runs-on:
       group: bandb-itac-bmsprpvc1550-8-1gpu
     env:
@@ -330,10 +318,6 @@ jobs:
       - name: Install PyTorch
         run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu

-      - name: Install IPEX
-        if: matrix.ipex == true
-        run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
       - name: Install dependencies
         run: |
           pip install -e ".[test]"
@@ -362,7 +346,7 @@ jobs:
         cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
         include:
           - cuda_version: "11.8.0"
-            torch_version: "2.2.2"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
             torch_version: "2.6.0"
@@ -372,7 +356,7 @@ jobs:
             pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
-            pypi_index: "https://download.pytorch.org/whl/test/cu129"
+            pypi_index: "https://download.pytorch.org/whl/cu129"


         # Linux L40S runners
@@ -391,7 +375,7 @@ jobs:
             gpu: T4
             runner: CUDA-Windows-x64
             cuda_version: "11.8.0"
-            torch_version: "2.2.0"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - os: windows-2025
             arch: x86_64
@@ -447,12 +431,6 @@ jobs:
           pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
           pip install -e ".[test]"
           pip install pytest-cov
-
-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
-      - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
-        run: pip install "numpy<2"
-
       - name: Show installed packages
         run: pip list
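For clarity, the NumPy pin now applies only to Windows runners on torch 2.3.x (per the linked pytorch/pytorch#131668). A small Python restatement of the workflow condition, with illustrative names (not part of the commit):

def needs_numpy_pin(os_name: str, torch_version: str) -> bool:
    # Mirrors: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
    return os_name.startswith("windows") and torch_version.startswith("2.3.")

assert needs_numpy_pin("windows-2025", "2.3.1")
assert not needs_numpy_pin("ubuntu-22.04", "2.3.1")
assert not needs_numpy_pin("windows-2025", "2.8.0")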

CMakeLists.txt

Lines changed: 29 additions & 2 deletions
@@ -28,11 +28,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
+set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})

-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, xpu)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps xpu)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

 if(APPLE)
@@ -64,10 +65,18 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS ON)
+elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
+    if(APPLE)
+        message(FATAL_ERROR "XPU is not supported on macOS" )
+    endif()
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS OFF)
+    set(BUILD_XPU ON)
 else()
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
+    set(BUILD_XPU OFF)
 endif()

@@ -217,6 +226,15 @@ elseif(BUILD_MPS)
         COMMENT "Compiling Metal kernels"
         VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+elseif(BUILD_XPU)
+    list(APPEND SRC_FILES ${XPU_FILES})
+    string(APPEND BNB_OUTPUT_NAME "_xpu")
+    add_compile_definitions(BUILD_XPU)
+    set(CMAKE_C_COMPILER icx)
+    set(CMAKE_CXX_COMPILER icpx)
+    if(WIN32)
+        set(CMAKE_CXX_COMPILER icx)
+    endif()
 else()
     string(APPEND BNB_OUTPUT_NAME "_cpu")
     set(GPU_SOURCES)
@@ -285,6 +303,15 @@ if(BUILD_MPS)
     add_dependencies(bitsandbytes metallib)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 endif()
+if(BUILD_XPU)
+    set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
+    set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;")
+
+    set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
+    target_compile_options(bitsandbytes PRIVATE ${SYCL_COMPILE_FLAGS})
+    target_link_options(bitsandbytes PRIVATE ${SYCL_LINK_FLAGS})
+
+endif()

 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
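The XPU branch appends "_xpu" to BNB_OUTPUT_NAME, so the built binary ends up named along the lines of libbitsandbytes_xpu.so. A hedged sketch of loading such a binary via ctypes; the path resolution below is illustrative only, not bitsandbytes' actual loader:

import ctypes
import platform
from pathlib import Path

def load_bnb_native(backend: str = "xpu") -> ctypes.CDLL:
    # CMake sets PREFIX "lib" on Windows too (see the WIN32 branch above),
    # so the "lib" prefix is used on every platform here.
    suffix = {"Windows": ".dll", "Darwin": ".dylib"}.get(platform.system(), ".so")
    path = Path("bitsandbytes") / f"libbitsandbytes_{backend}{suffix}"
    return ctypes.CDLL(str(path))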

README.md

Lines changed: 5 additions & 5 deletions
@@ -20,7 +20,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 bitsandbytes has the following minimum requirements for all platforms:

 * Python 3.9+
-* [PyTorch](https://pytorch.org/get-started/locally/) 2.2+
+* [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
   * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._

 #### Accelerator support:
@@ -61,7 +61,7 @@ bitsandbytes has the following minimum requirements for all platforms:
 <tr>
   <td></td>
   <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-  <td>SM50+ minimum<br>SM75+ recommended</td>
+  <td>SM60+ minimum<br>SM75+ recommended</td>
   <td>✅</td>
   <td>✅</td>
   <td>✅</td>
@@ -87,7 +87,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   </td>
   <td>✅</td>
   <td>✅</td>
-  <td>🚧</td>
+  <td>〰️</td>
 </tr>
 <tr>
   <td></td>
@@ -127,7 +127,7 @@ bitsandbytes has the following minimum requirements for all platforms:
 <tr>
   <td></td>
   <td>🟩 NVIDIA GPU <br><code>cuda</code></td>
-  <td>SM50+ minimum<br>SM75+ recommended</td>
+  <td>SM60+ minimum<br>SM75+ recommended</td>
   <td>✅</td>
   <td>✅</td>
   <td>✅</td>
@@ -141,7 +141,7 @@ bitsandbytes has the following minimum requirements for all platforms:
   </td>
   <td>✅</td>
   <td>✅</td>
-  <td>🚧</td>
+  <td>〰️</td>
 </tr>
 <tr>
 <td colspan="6">🍎 <strong>macOS 14+</strong></td>
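A quick sanity check for the bumped floors the README now documents (Python 3.9+, PyTorch 2.3+); a sketch using standard APIs only, not part of the commit:

import sys
import torch

assert sys.version_info >= (3, 9), "bitsandbytes requires Python 3.9+"
# torch.__version__ may carry a local suffix like "2.3.1+cu121"; the first
# two components are enough for the comparison.
major, minor = (int(p) for p in torch.__version__.split(".")[:2])
assert (major, minor) >= (2, 3), "bitsandbytes now requires PyTorch 2.3+"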

_typos.toml

Lines changed: 7 additions & 0 deletions
@@ -1,4 +1,11 @@
 [files]
+# Skip these files in typo checks
+extend-exclude = [
+    "csrc/xpu_ops.h",
+    "csrc/xpu_ops.cpp",
+    "csrc/xpu_kernels.h",
+    "csrc/xpu_kernels.cpp"
+]

 [default]
 extend-ignore-re = [

benchmarking/inference_benchmark.py

Lines changed: 71 additions & 30 deletions
@@ -21,6 +21,9 @@
 --batches BATCHES [BATCHES ...]
 --input-length INPUT_LENGTH
 --out-dir OUT_DIR
+--iterations ITERATIONS
+--warmup-runs WARMUP_RUNS
+--output-length OUTPUT_LENGTH
 """

 import argparse
@@ -30,6 +33,9 @@
 from optimum_benchmark.logging_utils import setup_logging
 import torch

+torch.backends.cudnn.benchmark = False
+torch.backends.cudnn.deterministic = True
+
 BFLOAT16_SUPPORT = torch.cuda.get_device_capability()[0] >= 8

 WEIGHTS_CONFIGS = {
@@ -73,9 +79,8 @@
     },
 }

-if __name__ == "__main__":
-    setup_logging(level="INFO")

+def parse_args():
     parser = argparse.ArgumentParser(description="bitsandbytes inference benchmark tool")

     parser.add_argument("model_id", type=str, help="The model checkpoint to use.")
@@ -98,37 +103,73 @@

     parser.add_argument("--out-dir", type=str, default="reports")

-    args = parser.parse_args()
+    parser.add_argument("--iterations", type=int, default=10, help="Number of iterations for each benchmark run")
+    parser.add_argument(
+        "--warmup-runs", type=int, default=10, help="Number of warmup runs to discard before measurement"
+    )
+    parser.add_argument(
+        "--output-length",
+        type=int,
+        default=64,
+        help="If set, `max_new_tokens` and `min_new_tokens` will be set to this value.",
+    )
+
+    return parser.parse_args()
+
+
+def run_benchmark(args, config, batch_size):
+    launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn", start_method="spawn")
+    scenario_config = InferenceConfig(
+        latency=True,
+        memory=True,
+        input_shapes={"batch_size": batch_size, "sequence_length": args.input_length},
+        iterations=args.iterations,
+        warmup_runs=args.warmup_runs,
+        # set duration to 0 to disable the duration-based stopping criterion
+        # this is IMPORTANT to ensure that all benchmarks run the same number of operations, regardless of hardware speed/bottlenecks
+        duration=0,
+        # for consistent results, set a fixed min and max for output tokens
+        generate_kwargs={"min_new_tokens": args.output_length, "max_new_tokens": args.output_length},
+        forward_kwargs={"min_new_tokens": args.output_length, "max_new_tokens": args.output_length},
+    )
+
+    backend_config = PyTorchConfig(
+        device="cuda",
+        device_ids="0",
+        device_map="auto",
+        no_weights=False,
+        model=args.model_id,
+        **WEIGHTS_CONFIGS[config],
+    )
+
+    test_name = (
+        f"benchmark-{config}"
+        f"-bsz-{batch_size}"
+        f"-isz-{args.input_length}"
+        f"-osz-{args.output_length}"
+        f"-iter-{args.iterations}"
+        f"-wrmup-{args.warmup_runs}"
+    )
+    benchmark_config = BenchmarkConfig(
+        name=test_name,
+        scenario=scenario_config,
+        launcher=launcher_config,
+        backend=backend_config,
+    )
+
+    out_path = out_dir / (test_name + ".json")
+    print(f"[{test_name}] Starting:")
+    benchmark_report = Benchmark.launch(benchmark_config)
+    benchmark_report.save_json(out_path)
+
+
+if __name__ == "__main__":
+    setup_logging(level="INFO")
+    args = parse_args()

     out_dir = Path(args.out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)

     for batch_size in args.batches:
-        print(f"Benchmarking batch size: {batch_size}")
         for config in args.configs:
-            launcher_config = ProcessConfig(device_isolation=True, start_method="spawn")
-            scenario_config = InferenceConfig(
-                latency=True,
-                memory=True,
-                input_shapes={"batch_size": batch_size, "sequence_length": args.input_length},
-            )
-            backend_config = PyTorchConfig(
-                device="cuda",
-                device_ids="0",
-                device_map="auto",
-                no_weights=False,
-                model=args.model_id,
-                **WEIGHTS_CONFIGS[config],
-            )
-            benchmark_config = BenchmarkConfig(
-                name=f"benchmark-{config}-bsz{batch_size}",
-                scenario=scenario_config,
-                launcher=launcher_config,
-                backend=backend_config,
-            )
-
-            out_path = out_dir / f"benchmark_{config}_bsz{batch_size}.json"
-
-            benchmark_report = Benchmark.launch(benchmark_config)
-            benchmark_report.log()
-            benchmark_report.save_json(out_path)
+            run_benchmark(args, config, batch_size)
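A hedged usage sketch for the refactored entry points. The checkpoint and the "nf4" config key are hypothetical (valid keys come from WEIGHTS_CONFIGS, whose entries are not shown in this hunk), and note that run_benchmark() resolves out_path against the module-level out_dir that __main__ sets:

from argparse import Namespace
from pathlib import Path

# Importing the script probes CUDA at module level (BFLOAT16_SUPPORT), so this
# sketch assumes a CUDA-capable environment.
import benchmarking.inference_benchmark as bench

bench.out_dir = Path("reports")   # run_benchmark() reads this module-level global
bench.out_dir.mkdir(parents=True, exist_ok=True)

args = Namespace(
    model_id="facebook/opt-350m",  # hypothetical checkpoint
    input_length=64,
    output_length=64,              # pins min_new_tokens == max_new_tokens
    iterations=10,                 # duration=0 in the scenario disables time-based stopping
    warmup_runs=10,
)
bench.run_benchmark(args, config="nf4", batch_size=1)  # "nf4" is an assumed WEIGHTS_CONFIGS key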
