
Commit f0ec93d

Merge pull request #76 from tomaarsen/cleanup
Cleanup involving a handful of failures, some optimization and a lot of code quality improvements
2 parents c059bd2 + c91f592


41 files changed: +279 / -476 lines

Note: many of the paired removed/added lines in the diff below differ only in trailing whitespace, so the two sides can render identically.

CHANGELOG.md
Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ Features:
 Bug fixes:
 - Fixed a bug where weight decay was incorrectly applied to 32-bit Adam. #13
 - Fixed an unsafe use of eval. #8
-- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15
+- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15
 
 Docs:
 - Added instructions how to solve "\_\_fatbinwrap_" errors.
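For context, a minimal sketch of the registration workaround referenced in the StableEmbedding entry above, which the fix makes unnecessary. The model is a hypothetical stand-in; only the `register_parameters` call comes from the changelog.

```python
import torch
import bitsandbytes as bnb

# Hypothetical model; StableEmbedding keeps its optimizer state in 32-bit.
model = torch.nn.Sequential(
    bnb.nn.StableEmbedding(1024, 64),
    torch.nn.Linear(64, 2),
)

# The old workaround: register every parameter with the global optimizer
# manager before building the optimizer so the embedding override applied.
bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())
```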

CONTRIBUTING.md
Lines changed: 1 addition & 1 deletion

@@ -28,4 +28,4 @@ outlined on that page and do not file a public issue.
 
 ## License
 By contributing to bitsandbytes, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.
+under the LICENSE file in the root directory of this source tree.

Makefile
Lines changed: 12 additions & 12 deletions

@@ -26,14 +26,14 @@ INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
 
 # NVIDIA NVCC compilation flags
-COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
-COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
+COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
+COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
 COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
+COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
 
 # CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
 CC_CUDA92 := -gencode arch=compute_30,code=sm_30

@@ -58,38 +58,38 @@ CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
 
 
 all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
 cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
 
 cuda10x_nomatmul: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
 
 cuda110_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
 
 cuda11x_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
 
 cuda110: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
 cuda11x: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
 cpuonly: $(BUILD_DIR) env

@@ -117,7 +117,7 @@ $(ROOT_DIR)/dependencies/cub:
 	cd dependencies/cub; git checkout 1.11.0
 
 clean:
-	rm build/*
+	rm build/*
 
 cleaneggs:
 	rm -rf *.egg*

README.md
Lines changed: 3 additions & 3 deletions

@@ -1,6 +1,6 @@
 # bitsandbytes
 
-The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
+The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
 
 
 

@@ -48,7 +48,7 @@ out = linear(x.to(torch.float16))
 
 Requirements: anaconda, cudatoolkit, pytorch
 
-Hardware requirements:
+Hardware requirements:
 - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or older).
 - 8-bit optimizers and quantization: NVIDIA Maxwell GPU or newer (>=GTX 9XX).
 

@@ -87,7 +87,7 @@ Note that by default all parameter tensors with less than 4096 elements are kept
 ```
 # parameter tensors with less than 16384 values are optimized in 32-bit
 # it is recommended to use multiplies of 4096
-adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
+adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
 ```
 
 ### Change Bits and other Hyperparameters for Individual Parameters
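Since the hunk above is whitespace-only, the snippet it touches is worth reading in full. A runnable sketch of the `min_8bit_size` knob, assuming a hypothetical model (only the `Adam8bit` call is from the README):

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096)  # hypothetical stand-in model

# Parameter tensors with fewer than 16384 elements keep 32-bit optimizer
# state; larger tensors use 8-bit state. Multiples of 4096 are recommended.
adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
```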

bitsandbytes/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+from . import cuda_setup, utils
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,

@@ -12,7 +13,6 @@
 )
 from .cextension import COMPILED_WITH_CUDA
 from .nn import modules
-from . import cuda_setup, utils
 
 if COMPILED_WITH_CUDA:
     from .optim import adam
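The import moves in this file (and in `__main__.py`, `autograd/_functions.py`, `cextension.py`, and `cuda_setup/main.py` below) appear to follow isort's default ordering: grouped as standard library, third-party, then package-local, alphabetized within each group. A schematic sketch of that grouping (illustrative only, not code from the repository):

```python
import os          # 1. standard library, alphabetized
import sys

import torch       # 2. third-party packages

from . import cuda_setup, utils             # 3. package-local imports,
from .cextension import COMPILED_WITH_CUDA  #    alphabetized by module path
```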

bitsandbytes/__main__.py
Lines changed: 1 addition & 4 deletions

@@ -1,6 +1,3 @@
-# from bitsandbytes.debug_cli import cli
-
-# cli()
 import os
 import sys
 from warnings import warn

@@ -31,8 +28,8 @@ def print_debug_info() -> None:
 
 
 from . import COMPILED_WITH_CUDA, PACKAGE_GITHUB_URL
-from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
 from .cuda_setup.env_vars import to_be_ignored
+from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
 
 print_header("POTENTIALLY LIBRARY-PATH-LIKE ENV VARS")
 for k, v in os.environ.items():

bitsandbytes/autograd/_functions.py
Lines changed: 8 additions & 6 deletions

@@ -1,11 +1,12 @@
 import operator
 import warnings
+from dataclasses import dataclass
+from functools import reduce  # Required in Python 3
 
 import torch
+
 import bitsandbytes.functional as F
 
-from dataclasses import dataclass
-from functools import reduce  # Required in Python 3
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):

@@ -15,10 +16,10 @@ def prod(iterable):
 
 """
 This class pools outlier dimensions across layers.
-This is particularly important for small models where outlier features
+This is particularly important for small models where outlier features
 are less systematic and occur with low frequency.
 """
-class GlobalOutlierPooler(object):
+class GlobalOutlierPooler:
     _instance = None
 
     def __init__(self):

@@ -49,8 +50,9 @@ def get_current_outlier_idx(self):
 
 class MatMul8bit(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, A, B, out=None, quant_type="vector", precision=[8, 8, 8]):
-
+    def forward(ctx, A, B, out=None, quant_type="vector", precision=None):
+        if precision is None:
+            precision = [8, 8, 8]
         if precision[0] != 8:
             with torch.no_grad():
                 output = torch.matmul(A, B)
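The `forward()` change above is the standard fix for Python's mutable-default-argument pitfall: a default like `precision=[8, 8, 8]` is built once, when the function is defined, and shared by every call. As far as this hunk shows, `precision` is only read, so the shared list was a latent hazard rather than an active bug, but the None-sentinel idiom removes it. A minimal standalone illustration:

```python
# Default arguments are evaluated once, at definition time, so a mutable
# default is shared across calls.
def buggy(value, acc=[]):
    acc.append(value)
    return acc

print(buggy(1))  # [1]
print(buggy(2))  # [1, 2] -- the same list object persisted

# The None-sentinel idiom from the hunk creates a fresh list per call.
def fixed(value, acc=None):
    if acc is None:
        acc = []
    acc.append(value)
    return acc

print(fixed(1))  # [1]
print(fixed(2))  # [2]
```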

bitsandbytes/cextension.py
Lines changed: 4 additions & 4 deletions

@@ -1,11 +1,11 @@
 import ctypes as ct
-import torch
-
 from pathlib import Path
 from warnings import warn
 
+import torch
+
 
-class CUDASetup(object):
+class CUDASetup:
     _instance = None
 
     def __init__(self):

@@ -122,7 +122,7 @@ def get_instance(cls):
     CUDASetup.get_instance().generate_instructions()
     CUDASetup.get_instance().print_log_stack()
     raise RuntimeError('''
-    CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs aboveto fix your environment!
+    CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment!
     If you cannot find any issues and suspect a bug, please open an issue with detals about your environment:
     https://github.com/TimDettmers/bitsandbytes/issues''')
 lib.cadam32bit_g32
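`class CUDASetup(object)` becomes `class CUDASetup:` because every Python 3 class already inherits from `object`; the explicit base is a Python 2 leftover. The surrounding context also shows the `get_instance()` singleton pattern this class (and `GlobalOutlierPooler` above) uses. A sketch of that pattern under stated assumptions: only `_instance` and `get_instance` come from the diff; the `initialize()` hook and the `__init__` guard are guesses at the shape of the real class.

```python
class Singleton:
    _instance = None

    def __init__(self):
        # Guard against direct construction (assumed behavior).
        raise RuntimeError("Call get_instance() instead")

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls.__new__(cls)  # bypass __init__
            cls._instance.initialize()        # hypothetical setup hook
        return cls._instance

    def initialize(self):
        self.log = []  # e.g. the log entries CUDASetup accumulates
```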
bitsandbytes/cuda_setup/__init__.py
Lines changed: 5 additions & 1 deletion

@@ -1,2 +1,6 @@
-from .paths import CUDA_RUNTIME_LIB, extract_candidate_paths, determine_cuda_runtime_lib_path
 from .main import evaluate_cuda_setup
+from .paths import (
+    CUDA_RUNTIME_LIB,
+    determine_cuda_runtime_lib_path,
+    extract_candidate_paths,
+)

bitsandbytes/cuda_setup/main.py
Lines changed: 5 additions & 3 deletions

@@ -17,11 +17,13 @@
 """
 
 import ctypes
+
 import torch
 
-from .paths import determine_cuda_runtime_lib_path
 from bitsandbytes.cextension import CUDASetup
 
+from .paths import determine_cuda_runtime_lib_path
+
 
 def check_cuda_result(cuda, result_val):
     # 3. Check for CUDA errors

@@ -48,7 +50,7 @@ def get_cuda_version(cuda, cudart_path):
     minor = (version-(major*1000))//10
 
     if major < 11:
-        CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currenlty not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
+        CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
 
     return f'{major}{minor}'
 

@@ -129,7 +131,7 @@ def evaluate_cuda_setup():
         failure = True
         cuda_setup.add_log_entry("WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!", is_warning=True)
     else:
-        cuda_setup.add_log_entry((f"CUDA SETUP: CUDA runtime path found: {cudart_path}"))
+        cuda_setup.add_log_entry(f"CUDA SETUP: CUDA runtime path found: {cudart_path}")
 
     if cc == '' or cc is None:
         failure = True
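The arithmetic in `get_cuda_version()` (visible in the second hunk above) decodes the integer that `cudaRuntimeGetVersion()` returns, which packs the version as 1000*major + 10*minor. A worked example with a hard-coded value:

```python
version = 11020  # what cudaRuntimeGetVersion() reports for CUDA 11.2
major = version // 1000                    # 11
minor = (version - (major * 1000)) // 10   # 2
assert f'{major}{minor}' == '112'          # the string get_cuda_version returns
```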
