
Commit be5cecb: Merge branch 'main' into main

2 parents: 8724c99 + f0ec93d


41 files changed: +746 -631 lines

Many of the hunks below remove only trailing whitespace or reorder imports, so some removed/added line pairs look identical; the merge is largely a formatting and import-ordering cleanup.

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
```diff
@@ -49,7 +49,7 @@ Features:
 Bug fixes:
 - Fixed a bug where weight decay was incorrectly applied to 32-bit Adam. #13
 - Fixed an unsafe use of eval. #8
-- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15
+- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15

 Docs:
 - Added instructions how to solve "\_\_fatbinwrap_" errors.
@@ -149,3 +149,9 @@ Bug fixes:

 Bug fixes:
 - Fixed a bug in the CUDA Setup which led to an incomprehensible error if no GPU was detected.
+
+### 0.35.4
+
+Bug fixes:
+- Fixed a bug in the CUDA Setup failed with the cuda runtime was found, but not the cuda library.
+- Fixed a bug where not finding the cuda runtime led to an incomprehensible error.
```
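For context on the StableEmbedding entry above, the override API it references is used roughly as follows. This is a minimal sketch assuming the API of this era: `register_parameters` is quoted from the changelog entry itself, while `override_config` and the toy model layout are illustrative and may differ between versions.

```python
import torch
import bitsandbytes as bnb

# Toy model: an embedding whose optimizer state we want kept in 32-bit.
model = torch.nn.Sequential(torch.nn.Embedding(1024, 64), torch.nn.Linear(64, 2))

# The fixed bug (#13, #15): this registration step was required before any
# per-parameter override would take effect.
manager = bnb.optim.GlobalOptimManager.get_instance()
manager.register_parameters(model.parameters())

# Keep the embedding weight in 32-bit Adam while everything else runs 8-bit.
manager.override_config(model[0].weight, "optim_bits", 32)

optimizer = bnb.optim.Adam8bit(model.parameters())
```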

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -28,4 +28,4 @@ outlined on that page and do not file a public issue.

 ## License
 By contributing to bitsandbytes, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.
+under the LICENSE file in the root directory of this source tree.
```

Makefile

Lines changed: 12 additions & 12 deletions
```diff
@@ -26,14 +26,14 @@ INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib

 # NVIDIA NVCC compilation flags
-COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
-COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
+COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
+COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
 COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
+COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta

 # CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
 CC_CUDA92 := -gencode arch=compute_30,code=sm_30
@@ -58,38 +58,38 @@ CC_cublasLt111 += -gencode arch=compute_86,code=sm_86


 all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

 cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda10x_nomatmul: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda110_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda11x_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda110: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

 cuda11x: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

 cpuonly: $(BUILD_DIR) env
@@ -117,7 +117,7 @@ $(ROOT_DIR)/dependencies/cub:
 	cd dependencies/cub; git checkout 1.11.0

 clean:
-	rm build/*
+	rm build/*

 cleaneggs:
 	rm -rf *.egg*
```
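The `-gencode arch=compute_XX,code=sm_XX` pairs in the Makefile tell `nvcc` which GPU generations to emit code for. As a minimal sketch (not part of this commit, and assuming a CUDA-enabled PyTorch install), you can check which of these targets matches your own GPU before picking a build target:

```python
import torch

# Query the compute capability of the first visible GPU, e.g. (7, 0) for
# Volta (sm_70) or (8, 6) for Ampere (sm_86), and map it onto the -gencode
# targets listed in the Makefile above.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"GPU 0 is sm_{major}{minor}; needs "
          f"-gencode arch=compute_{major}{minor},code=sm_{major}{minor}")
else:
    print("No CUDA device detected; use the cpuonly target instead.")
```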

README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -1,6 +1,6 @@
 # bitsandbytes

-The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
+The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.



@@ -48,7 +48,7 @@ out = linear(x.to(torch.float16))

 Requirements: anaconda, cudatoolkit, pytorch

-Hardware requirements:
+Hardware requirements:
 - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or older).
 - 8-bit optimizers and quantization: NVIDIA Maxwell GPU or newer (>=GTX 9XX).

@@ -87,7 +87,7 @@ Note that by default all parameter tensors with less than 4096 elements are kept
 ```
 # parameter tensors with less than 16384 values are optimized in 32-bit
 # it is recommended to use multiplies of 4096
-adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
+adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
 ```

 ### Change Bits and other Hyperparameters for Individual Parameters
````
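The hunk context above (`out = linear(x.to(torch.float16))`) refers to the README's LLM.int8() usage snippet. A hedged reconstruction of that usage follows; the `Linear8bitLt` parameter names match the project README of this era, but defaults may differ between versions:

```python
import torch
import bitsandbytes as bnb

# 8-bit linear layer with mixed-precision decomposition (LLM.int8()).
# threshold=6.0 routes outlier features through fp16, the rest through int8.
linear = bnb.nn.Linear8bitLt(
    1024, 1024, bias=True, has_fp16_weights=False, threshold=6.0
).cuda()

x = torch.randn(8, 1024, device="cuda")
out = linear(x.to(torch.float16))  # inputs are cast to fp16, as in the README
```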

bitsandbytes/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

+from . import cuda_setup, utils
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
@@ -12,7 +13,6 @@
 )
 from .cextension import COMPILED_WITH_CUDA
 from .nn import modules
-from . import cuda_setup, utils

 if COMPILED_WITH_CUDA:
     from .optim import adam
```

bitsandbytes/__main__.py

Lines changed: 1 addition & 4 deletions
```diff
@@ -1,6 +1,3 @@
-# from bitsandbytes.debug_cli import cli
-
-# cli()
 import os
 import sys
 from warnings import warn
@@ -31,8 +28,8 @@ def print_debug_info() -> None:


 from . import COMPILED_WITH_CUDA, PACKAGE_GITHUB_URL
-from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
 from .cuda_setup.env_vars import to_be_ignored
+from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle

 print_header("POTENTIALLY LIBRARY-PATH-LIKE ENV VARS")
 for k, v in os.environ.items():
```

bitsandbytes/autograd/_functions.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -1,11 +1,12 @@
 import operator
 import warnings
+from dataclasses import dataclass
+from functools import reduce  # Required in Python 3

 import torch
+
 import bitsandbytes.functional as F

-from dataclasses import dataclass
-from functools import reduce  # Required in Python 3

 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -15,10 +16,10 @@ def prod(iterable):

 """
 This class pools outlier dimensions across layers.
-This is particularly important for small models where outlier features
+This is particularly important for small models where outlier features
 are less systematic and occur with low frequency.
 """
-class GlobalOutlierPooler(object):
+class GlobalOutlierPooler:
     _instance = None

     def __init__(self):
@@ -49,8 +50,9 @@ def get_current_outlier_idx(self):

 class MatMul8bit(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, A, B, out=None, quant_type="vector", precision=[8, 8, 8]):
-
+    def forward(ctx, A, B, out=None, quant_type="vector", precision=None):
+        if precision is None:
+            precision = [8, 8, 8]
         if precision[0] != 8:
             with torch.no_grad():
                 output = torch.matmul(A, B)
```
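The `forward` change above replaces a mutable default argument (`precision=[8, 8, 8]`) with a `None` sentinel. In Python, a default value is evaluated once at function-definition time, so a mutable default is shared across all calls. A minimal standalone sketch of the pitfall and the sentinel pattern (illustrative only, not code from the repo):

```python
def broken(values=[]):       # one list is created at definition time and shared
    values.append(1)
    return values

def fixed(values=None):      # sentinel pattern, as used in MatMul8bit.forward
    if values is None:
        values = []          # fresh list on every call
    values.append(1)
    return values

print(broken())  # [1]
print(fixed())   # [1]
print(broken())  # [1, 1] -- the default list persisted across calls
print(fixed())   # [1]    -- the sentinel version starts fresh each time
```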

bitsandbytes/cextension.py

Lines changed: 14 additions & 10 deletions
```diff
@@ -1,11 +1,11 @@
 import ctypes as ct
-import torch
-
 from pathlib import Path
 from warnings import warn

+import torch
+

-class CUDASetup(object):
+class CUDASetup:
     _instance = None

     def __init__(self):
@@ -52,8 +52,13 @@ def generate_instructions(self):
         self.add_log_entry('python setup.py install')

     def initialize(self):
-        self.cuda_setup_log = []
+        self.has_printed = False
         self.lib = None
+        self.run_cuda_setup()
+
+    def run_cuda_setup(self):
+        self.initialized = True
+        self.cuda_setup_log = []

         from .cuda_setup.main import evaluate_cuda_setup
         binary_name, cudart_path, cuda, cc, cuda_version_string = evaluate_cuda_setup()
@@ -89,7 +94,8 @@ def initialize(self):
             else:
                 self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
                 self.lib = ct.cdll.LoadLibrary(binary_path)
-        except:
+        except Exception as ex:
+            self.add_log_entry(str(ex))
             self.print_log_stack()

     def add_log_entry(self, msg, is_warning=False):
@@ -116,16 +122,14 @@ def get_instance(cls):
         CUDASetup.get_instance().generate_instructions()
         CUDASetup.get_instance().print_log_stack()
         raise RuntimeError('''
-CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs to fix your environment!
+CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment!
 If you cannot find any issues and suspect a bug, please open an issue with detals about your environment:
 https://github.com/TimDettmers/bitsandbytes/issues''')
     lib.cadam32bit_g32
     lib.get_context.restype = ct.c_void_p
     lib.get_cusparse.restype = ct.c_void_p
     COMPILED_WITH_CUDA = True
 except AttributeError:
-    warn(
-        "The installed version of bitsandbytes was compiled without GPU support. "
-        "8-bit optimizers and GPU quantization are unavailable."
-    )
+    warn("The installed version of bitsandbytes was compiled without GPU support. "
+        "8-bit optimizers and GPU quantization are unavailable.")
     COMPILED_WITH_CUDA = False
```
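Two small patterns recur in the `CUDASetup` changes above: a classic singleton (`_instance` plus `get_instance()`) and the replacement of a bare `except:` with `except Exception as ex:` so the failure is logged rather than silently swallowed. A condensed, self-contained sketch of both (illustrative; not the repo's full class, and the `load` failure is simulated):

```python
class SetupSingleton:
    """Minimal singleton mirroring the CUDASetup pattern above."""
    _instance = None

    def __init__(self):
        raise RuntimeError("Call get_instance() instead")

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls.__new__(cls)  # bypass __init__, like CUDASetup
            cls._instance.log = []
        return cls._instance

    def load(self, path):
        try:
            raise OSError(f"cannot load {path}")  # stand-in for ct.cdll.LoadLibrary
        except Exception as ex:  # named exception: the error text is preserved
            self.log.append(str(ex))

s = SetupSingleton.get_instance()
s.load("libbitsandbytes_cuda117.so")
assert s is SetupSingleton.get_instance()  # same object on every call
print(s.log)  # ['cannot load libbitsandbytes_cuda117.so']
```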
bitsandbytes/cuda_setup/__init__.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,2 +1,6 @@
-from .paths import CUDA_RUNTIME_LIB, extract_candidate_paths, determine_cuda_runtime_lib_path
 from .main import evaluate_cuda_setup
+from .paths import (
+    CUDA_RUNTIME_LIB,
+    determine_cuda_runtime_lib_path,
+    extract_candidate_paths,
+)
```
