
Commit be5cecb: Merge branch 'main' into main

2 parents: 8724c99 + f0ec93d


41 files changed: +746 -631 lines

Many of the hunks below remove only trailing whitespace or reorder imports, so some removed/added line pairs look identical; the merge is largely a formatting and import-ordering cleanup.

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
```diff
@@ -49,7 +49,7 @@ Features:
 Bug fixes:
 - Fixed a bug where weight decay was incorrectly applied to 32-bit Adam. #13
 - Fixed an unsafe use of eval. #8
-- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15
+- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15

 Docs:
 - Added instructions how to solve "\_\_fatbinwrap_" errors.
@@ -149,3 +149,9 @@ Bug fixes:

 Bug fixes:
 - Fixed a bug in the CUDA Setup which led to an incomprehensible error if no GPU was detected.
+
+### 0.35.4
+
+Bug fixes:
+- Fixed a bug in the CUDA Setup failed with the cuda runtime was found, but not the cuda library.
+- Fixed a bug where not finding the cuda runtime led to an incomprehensible error.
```
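For context on the StableEmbedding entry above, the override API it references is used roughly as follows. This is a minimal sketch assuming the API of this era: `register_parameters` is quoted from the changelog entry itself, while `override_config` and the toy model layout are illustrative and may differ between versions.

```python
import torch
import bitsandbytes as bnb

# Toy model: an embedding whose optimizer state we want kept in 32-bit.
model = torch.nn.Sequential(torch.nn.Embedding(1024, 64), torch.nn.Linear(64, 2))

# The fixed bug (#13, #15): this registration step was required before any
# per-parameter override would take effect.
manager = bnb.optim.GlobalOptimManager.get_instance()
manager.register_parameters(model.parameters())

# Keep the embedding weight in 32-bit Adam while everything else runs 8-bit.
manager.override_config(model[0].weight, "optim_bits", 32)

optimizer = bnb.optim.Adam8bit(model.parameters())
```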

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -28,4 +28,4 @@ outlined on that page and do not file a public issue.

 ## License
 By contributing to bitsandbytes, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.
+under the LICENSE file in the root directory of this source tree.
```

Makefile

Lines changed: 12 additions & 12 deletions
```diff
@@ -26,14 +26,14 @@ INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib

 # NVIDIA NVCC compilation flags
-COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
-COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
+COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
+COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
 COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
 COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
+COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta

 # CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
 CC_CUDA92 := -gencode arch=compute_30,code=sm_30
@@ -58,38 +58,38 @@ CC_cublasLt111 += -gencode arch=compute_86,code=sm_86


 all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

 cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda10x_nomatmul: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda110_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda11x_nomatmul: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)

 cuda110: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

 cuda11x: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
+	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

 cpuonly: $(BUILD_DIR) env
@@ -117,7 +117,7 @@ $(ROOT_DIR)/dependencies/cub:
 	cd dependencies/cub; git checkout 1.11.0

 clean:
-	rm build/*
+	rm build/*

 cleaneggs:
 	rm -rf *.egg*
```
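The `-gencode arch=compute_XX,code=sm_XX` pairs in the Makefile tell `nvcc` which GPU generations to emit code for. As a minimal sketch (not part of this commit, and assuming a CUDA-enabled PyTorch install), you can check which of these targets matches your own GPU before picking a build target:

```python
import torch

# Query the compute capability of the first visible GPU, e.g. (7, 0) for
# Volta (sm_70) or (8, 6) for Ampere (sm_86), and map it onto the -gencode
# targets listed in the Makefile above.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"GPU 0 is sm_{major}{minor}; needs "
          f"-gencode arch=compute_{major}{minor},code=sm_{major}{minor}")
else:
    print("No CUDA device detected; use the cpuonly target instead.")
```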

README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -1,6 +1,6 @@
 # bitsandbytes

-The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
+The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.



@@ -48,7 +48,7 @@ out = linear(x.to(torch.float16))

 Requirements: anaconda, cudatoolkit, pytorch

-Hardware requirements:
+Hardware requirements:
 - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or older).
 - 8-bit optimizers and quantization: NVIDIA Maxwell GPU or newer (>=GTX 9XX).

@@ -87,7 +87,7 @@ Note that by default all parameter tensors with less than 4096 elements are kept
 ```
 # parameter tensors with less than 16384 values are optimized in 32-bit
 # it is recommended to use multiplies of 4096
-adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
+adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
 ```

 ### Change Bits and other Hyperparameters for Individual Parameters
````
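The hunk context above (`out = linear(x.to(torch.float16))`) refers to the README's LLM.int8() usage snippet. A hedged reconstruction of that usage follows; the `Linear8bitLt` parameter names match the project README of this era, but defaults may differ between versions:

```python
import torch
import bitsandbytes as bnb

# 8-bit linear layer with mixed-precision decomposition (LLM.int8()).
# threshold=6.0 routes outlier features through fp16, the rest through int8.
linear = bnb.nn.Linear8bitLt(
    1024, 1024, bias=True, has_fp16_weights=False, threshold=6.0
).cuda()

x = torch.randn(8, 1024, device="cuda")
out = linear(x.to(torch.float16))  # inputs are cast to fp16, as in the README
```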

bitsandbytes/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

+from . import cuda_setup, utils
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
@@ -12,7 +13,6 @@
 )
 from .cextension import COMPILED_WITH_CUDA
 from .nn import modules
-from . import cuda_setup, utils

 if COMPILED_WITH_CUDA:
     from .optim import adam
```

bitsandbytes/__main__.py

Lines changed: 1 addition & 4 deletions
```diff
@@ -1,6 +1,3 @@
-# from bitsandbytes.debug_cli import cli
-
-# cli()
 import os
 import sys
 from warnings import warn
@@ -31,8 +28,8 @@ def print_debug_info() -> None:


 from . import COMPILED_WITH_CUDA, PACKAGE_GITHUB_URL
-from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
 from .cuda_setup.env_vars import to_be_ignored
+from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle

 print_header("POTENTIALLY LIBRARY-PATH-LIKE ENV VARS")
 for k, v in os.environ.items():
```

bitsandbytes/autograd/_functions.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -1,11 +1,12 @@
 import operator
 import warnings
+from dataclasses import dataclass
+from functools import reduce  # Required in Python 3

 import torch
+
 import bitsandbytes.functional as F

-from dataclasses import dataclass
-from functools import reduce  # Required in Python 3

 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -15,10 +16,10 @@ def prod(iterable):

 """
 This class pools outlier dimensions across layers.
-This is particularly important for small models where outlier features
+This is particularly important for small models where outlier features
 are less systematic and occur with low frequency.
 """
-class GlobalOutlierPooler(object):
+class GlobalOutlierPooler:
     _instance = None

     def __init__(self):
@@ -49,8 +50,9 @@ def get_current_outlier_idx(self):

 class MatMul8bit(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, A, B, out=None, quant_type="vector", precision=[8, 8, 8]):
-
+    def forward(ctx, A, B, out=None, quant_type="vector", precision=None):
+        if precision is None:
+            precision = [8, 8, 8]
         if precision[0] != 8:
             with torch.no_grad():
                 output = torch.matmul(A, B)
```
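The `forward` change above replaces a mutable default argument (`precision=[8, 8, 8]`) with a `None` sentinel. In Python, a default value is evaluated once at function-definition time, so a mutable default is shared across all calls. A minimal standalone sketch of the pitfall and the sentinel pattern (illustrative only, not code from the repo):

```python
def broken(values=[]):       # one list is created at definition time and shared
    values.append(1)
    return values

def fixed(values=None):      # sentinel pattern, as used in MatMul8bit.forward
    if values is None:
        values = []          # fresh list on every call
    values.append(1)
    return values

print(broken())  # [1]
print(fixed())   # [1]
print(broken())  # [1, 1] -- the default list persisted across calls
print(fixed())   # [1]    -- the sentinel version starts fresh each time
```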

bitsandbytes/cextension.py

Lines changed: 14 additions & 10 deletions
```diff
@@ -1,11 +1,11 @@
 import ctypes as ct
-import torch
-
 from pathlib import Path
 from warnings import warn

+import torch
+

-class CUDASetup(object):
+class CUDASetup:
     _instance = None

     def __init__(self):
@@ -52,8 +52,13 @@ def generate_instructions(self):
         self.add_log_entry('python setup.py install')

     def initialize(self):
-        self.cuda_setup_log = []
+        self.has_printed = False
         self.lib = None
+        self.run_cuda_setup()
+
+    def run_cuda_setup(self):
+        self.initialized = True
+        self.cuda_setup_log = []

         from .cuda_setup.main import evaluate_cuda_setup
         binary_name, cudart_path, cuda, cc, cuda_version_string = evaluate_cuda_setup()
@@ -89,7 +94,8 @@ def initialize(self):
             else:
                 self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
                 self.lib = ct.cdll.LoadLibrary(binary_path)
-        except:
+        except Exception as ex:
+            self.add_log_entry(str(ex))
             self.print_log_stack()

     def add_log_entry(self, msg, is_warning=False):
@@ -116,16 +122,14 @@ def get_instance(cls):
         CUDASetup.get_instance().generate_instructions()
         CUDASetup.get_instance().print_log_stack()
         raise RuntimeError('''
-CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs to fix your environment!
+CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment!
 If you cannot find any issues and suspect a bug, please open an issue with detals about your environment:
 https://github.com/TimDettmers/bitsandbytes/issues''')
     lib.cadam32bit_g32
     lib.get_context.restype = ct.c_void_p
     lib.get_cusparse.restype = ct.c_void_p
     COMPILED_WITH_CUDA = True
 except AttributeError:
-    warn(
-        "The installed version of bitsandbytes was compiled without GPU support. "
-        "8-bit optimizers and GPU quantization are unavailable."
-    )
+    warn("The installed version of bitsandbytes was compiled without GPU support. "
+        "8-bit optimizers and GPU quantization are unavailable.")
     COMPILED_WITH_CUDA = False
```
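Two small patterns recur in the `CUDASetup` changes above: a classic singleton (`_instance` plus `get_instance()`) and the replacement of a bare `except:` with `except Exception as ex:` so the failure is logged rather than silently swallowed. A condensed, self-contained sketch of both (illustrative; not the repo's full class, and the `load` failure is simulated):

```python
class SetupSingleton:
    """Minimal singleton mirroring the CUDASetup pattern above."""
    _instance = None

    def __init__(self):
        raise RuntimeError("Call get_instance() instead")

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls.__new__(cls)  # bypass __init__, like CUDASetup
            cls._instance.log = []
        return cls._instance

    def load(self, path):
        try:
            raise OSError(f"cannot load {path}")  # stand-in for ct.cdll.LoadLibrary
        except Exception as ex:  # named exception: the error text is preserved
            self.log.append(str(ex))

s = SetupSingleton.get_instance()
s.load("libbitsandbytes_cuda117.so")
assert s is SetupSingleton.get_instance()  # same object on every call
print(s.log)  # ['cannot load libbitsandbytes_cuda117.so']
```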
bitsandbytes/cuda_setup/__init__.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,2 +1,6 @@
-from .paths import CUDA_RUNTIME_LIB, extract_candidate_paths, determine_cuda_runtime_lib_path
 from .main import evaluate_cuda_setup
+from .paths import (
+    CUDA_RUNTIME_LIB,
+    determine_cuda_runtime_lib_path,
+    extract_candidate_paths,
+)
```
