
Commit c85733c

Merge branch 'main' into patch-2

2 parents d76b6ca + 9c63202
27 files changed (+928, -248 lines)
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+name: "\U0001F41B Bug Report"
+description: Submit a bug report to help us improve bitsandbytes
+body:
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your relevant system information with us
+      placeholder: platform, python version, hardware, ...
+    validations:
+      required: true
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
+        Please provide the simplest reproducer possible so that we can quickly fix the issue.
+
+      placeholder: |
+        Reproducer:
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "A clear and concise description of what you would expect to happen."
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+name: "\U0001F680 Feature request"
+description: Submit a proposal/request for a new feature
+labels: [ "feature" ]
+body:
+  - type: textarea
+    id: feature-request
+    validations:
+      required: true
+    attributes:
+      label: Feature request
+      description: |
+        A clear and concise description of the feature proposal.
+
+  - type: textarea
+    id: motivation
+    validations:
+      required: true
+    attributes:
+      label: Motivation
+      description: |
+        Please outline the motivation for the proposal. Is your feature request related to a problem?
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: true
+    attributes:
+      label: Your contribution
+      description: |
+        Is there any way that you could help, e.g. by submitting a PR?

.github/workflows/stale.yml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+name: Stale Bot
+
+on:
+  schedule:
+    - cron: "0 15 * * *"
+
+jobs:
+  close_stale_issues:
+    name: Close Stale Issues
+    if: github.repository == 'TimDettmers/bitsandbytes'
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+
+      - name: Install requirements
+        run: |
+          pip install PyGithub
+      - name: Close stale issues
+        run: |
+          python scripts/stale.py
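The workflow installs PyGithub and then runs scripts/stale.py, which is not part of this diff. For orientation, below is a minimal sketch of what such a stale-issue script could look like; the inactivity threshold, the `close_stale_issues` helper name, and the comment text are assumptions, not taken from the repository's actual script.

```python
# Hypothetical sketch of a stale-issue script (scripts/stale.py itself is not shown in this diff).
import os
from datetime import datetime, timedelta, timezone

from github import Github  # provided by the PyGithub package installed above


def close_stale_issues(repo_name: str = "TimDettmers/bitsandbytes", days: int = 30) -> None:
    gh = Github(os.environ["GITHUB_TOKEN"])  # the workflow exposes this token via env
    repo = gh.get_repo(repo_name)
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    for issue in repo.get_issues(state="open"):
        if issue.pull_request is not None:
            continue  # the Issues API also returns pull requests; skip them
        updated = issue.updated_at
        if updated.tzinfo is None:
            updated = updated.replace(tzinfo=timezone.utc)
        if updated < cutoff:
            issue.create_comment(
                "This issue has been automatically marked as stale because it has not had "
                "recent activity. It will be closed if no further activity occurs."
            )
            issue.edit(state="closed")


if __name__ == "__main__":
    close_stale_issues()
```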

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -133,3 +133,4 @@ dmypy.json
 
 dependencies
 cuda_build
+.vscode/*

.style.yapf

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+[style]
+ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = True
+ALLOW_MULTILINE_LAMBDAS = True
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = True
+COLUMN_LIMIT = 88
+COALESCE_BRACKETS = True
+SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = True
+SPACES_BEFORE_COMMENT = 2
+SPLIT_BEFORE_BITWISE_OPERATOR = True
+SPLIT_BEFORE_FIRST_ARGUMENT = True
+SPLIT_BEFORE_LOGICAL_OPERATOR = True
+SPLIT_BEFORE_NAMED_ASSIGNS = True
+SPLIT_COMPLEX_COMPREHENSION = True
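With this file in the repository root, yapf picks the style up automatically; you can also point at it explicitly. A small sketch using yapf's public FormatCode API follows (the snippet being formatted is only an example, not code from this commit):

```python
# Sketch: reformat a snippet against the new .style.yapf (requires `pip install yapf`).
from yapf.yapflib.yapf_api import FormatCode

snippet = (
    "adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384, "
    "lr=0.001, betas=(0.9, 0.995))\n"
)
formatted, changed = FormatCode(snippet, style_config=".style.yapf")
print(formatted)  # the call reflowed to the 88-column limit and the split rules above
print(changed)    # True if yapf had to modify the input
```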

CHANGELOG.md

Lines changed: 44 additions & 0 deletions
@@ -283,3 +283,47 @@ Bug fixes:
 - Removed outdated get_cuda_lib_handle calls that lead to errors. #595 Thank you @ihsanturk
 - Fixed bug where read-permission was assumed for a file. #497
 - Fixed a bug where prefetchAsync lead to errors on GPUs that do not support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro
+
+
+### 0.41.0
+
+Features:
+- Added precompiled CUDA 11.8 binaries to support H100 GPUs without compilation #571
+- CUDA SETUP now no longer looks for libcuda and libcudart and relies on PyTorch's CUDA libraries. To manually override this behavior see: how_to_use_nonpytorch_cuda.md. Thank you @rapsealk
+
+Bug fixes:
+- Fixed a bug where the default type of absmax was undefined, which leads to errors if the default type is different from torch.float32. #553
+- Fixed a missing scipy dependency in requirements.txt. #544
+- Fixed a bug where a view operation could cause an error in 8-bit layers.
+- Fixed a bug where CPU bitsandbytes would fail during the import. #593 Thank you @bilelomrani
+- Fixed a bug where a non-existent LD_LIBRARY_PATH variable led to a failure in python -m bitsandbytes #588
+- Removed outdated get_cuda_lib_handle calls that lead to errors. #595 Thank you @ihsanturk
+- Fixed bug where read-permission was assumed for a file. #497
+- Fixed a bug where prefetchAsync led to errors on GPUs that do not support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro
+
+Documentation:
+- Improved documentation for GPUs that do not support 8-bit matmul. #529
+- Added description and pointers for the NF4 data type. #543
+
+User experience:
+- Improved handling of default compute_dtype for Linear4bit layers, so that compute_dtype = input_dtype if the input data type is stable enough (float32, bfloat16, but not float16).
+
+Performance:
+- Improved 4-bit inference performance for A100 GPUs. This slightly degraded performance for A40/RTX 3090 and RTX 4090 GPUs.
+
+### 0.41.1
+
+Bug fixes:
+- Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152
+
+### 0.41.2
+
+Feature:
+- 4-bit serialization now supported. This enables 4-bit load/store. Thank you @poedator #753
+
+### 0.41.3
+
+Bug fixes:
+- Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator
+- Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. Thank you, @jph00
+

README.md

Lines changed: 6 additions & 6 deletions
@@ -38,7 +38,7 @@ python setup.py install
 ```python
 from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(
-  'decapoda-research/llama-7b-hf,
+  'decapoda-research/llama-7b-hf',
   device_map='auto',
   load_in_8bit=True,
   max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB')
@@ -119,7 +119,7 @@ torch.nn.Embedding(...) -> bnb.nn.StableEmbedding(...) # recommended for NLP mo
 ```
 
 Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so:
-```
+```python
 # parameter tensors with less than 16384 values are optimized in 32-bit
 # it is recommended to use multiplies of 4096
 adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
@@ -146,13 +146,13 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m
 To compile from source, you need an installation of CUDA. If `nvcc` is not installed, you can install the CUDA Toolkit with nvcc through the following commands.
 
 ```bash
-wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh
+wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
 # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
-# For example, the following installs CUDA 11.8 to ~/local/cuda-11.8 and exports the path to your .bashrc
-bash cuda install 118 ~/local 1
+# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
+bash install_cuda.sh 117 ~/local 1
 ```
 
 To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:

bitsandbytes/autograd/_functions.py

Lines changed: 7 additions & 9 deletions
@@ -496,15 +496,15 @@ class MatMul4Bit(torch.autograd.Function):
     # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
 
     @staticmethod
-    def forward(ctx, A, B, out=None, bias=None, state=None):
+    def forward(ctx, A, B, out=None, bias=None, quant_state: F.QuantState = None):
         # default of pytorch behavior if inputs are empty
         ctx.is_empty = False
         if prod(A.shape) == 0:
            ctx.is_empty = True
            ctx.A = A
            ctx.B = B
            ctx.bias = bias
-           B_shape = state[1]
+           B_shape = quant_state.shape
            if A.shape[-1] == B_shape[0]:
                return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device)
            else:
@@ -513,10 +513,10 @@ def forward(ctx, A, B, out=None, bias=None, state=None):
 
        # 1. Dequantize
        # 2. MatmulnN
-       output = torch.nn.functional.linear(A, F.dequantize_4bit(B, state).to(A.dtype).t(), bias)
+       output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
 
        # 3. Save state
-       ctx.state = state
+       ctx.state = quant_state
        ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype
 
        if any(ctx.needs_input_grad[:2]):
@@ -534,7 +534,6 @@ def backward(ctx, grad_output):
 
        req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad
        A, B = ctx.tensors
-       state = ctx.state
 
        grad_A, grad_B, grad_bias = None, None, None
 
@@ -563,12 +562,11 @@ def matmul(
    return MatMul8bitLt.apply(A, B, out, bias, state)
 
 
-def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None):
+def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None):
    assert quant_state is not None
    if A.numel() == A.shape[-1] and A.requires_grad == False:
-       absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = quant_state
-       if A.shape[-1] % blocksize != 0:
-           warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
+       if A.shape[-1] % quant_state.blocksize != 0:
+           warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
        return MatMul4Bit.apply(A, B, out, bias, quant_state)
    else:
        out = F.gemv_4bit(A, B.t(), out, state=quant_state)
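The change here is that the 4-bit quantization metadata is no longer passed around as a positional list but as an F.QuantState object with named attributes such as shape and blocksize. A minimal stand-in to illustrate the call-site difference follows; QuantStateSketch and its field values are hypothetical, and the real QuantState carries more fields than shown.

```python
# Hypothetical stand-in illustrating the move from a positional quant_state
# sequence to an object with named attributes (as F.QuantState is used above).
from dataclasses import dataclass


@dataclass
class QuantStateSketch:
    absmax: object
    shape: tuple
    dtype: str
    blocksize: int
    quant_type: str = "nf4"


# Old style: callers unpacked the state positionally and had to know the field order.
legacy_state = ("absmax-tensor", (4096, 4096), "float16", 64, None, "nf4", "data-type")
_, B_shape, _, blocksize, *_ = legacy_state

# New style: callers read named attributes, as MatMul4Bit.forward and matmul_4bit now do.
state = QuantStateSketch(absmax="absmax-tensor", shape=(4096, 4096), dtype="float16", blocksize=64)
assert state.shape == B_shape and state.blocksize == blocksize
```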

bitsandbytes/cuda_setup/env_vars.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@ def to_be_ignored(env_var: str, value: str) -> bool:
         "OLDPWD",
         "SSH_AUTH_SOCK", # SSH stuff, therefore unrelated
         "SSH_TTY",
+        "GOOGLE_VM_CONFIG_LOCK_FILE", # on GCP setups, requires elevated permissions, causing problems in Jupyter notebooks
         "HOME", # Linux shell default
         "TMUX", # Terminal Multiplexer
         "XDG_DATA_DIRS", # XDG: Desktop environment stuff
@@ -19,6 +20,7 @@ def to_be_ignored(env_var: str, value: str) -> bool:
         "PATH", # this is for finding binaries, not libraries
         "LESSOPEN", # related to the `less` command
         "LESSCLOSE",
+        "GOOGLE_VM_CONFIG_LOCK_FILE", # Google Cloud stuff, contains root only paths
         "_", # current Python interpreter
     }
     return env_var in ignorable
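to_be_ignored() is consulted when bitsandbytes scans environment variables for directories that might contain the CUDA runtime; variables like GOOGLE_VM_CONFIG_LOCK_FILE point at root-only GCP paths and would otherwise trip up the search. A small sketch of how such a filter is typically applied (the surrounding loop is illustrative, not the library's exact code):

```python
# Sketch: filter the environment with to_be_ignored() before searching for CUDA libraries.
import os

from bitsandbytes.cuda_setup.env_vars import to_be_ignored

candidate_vars = {
    name: value
    for name, value in os.environ.items()
    if not to_be_ignored(name, value)
}
print(sorted(candidate_vars))  # e.g. LD_LIBRARY_PATH and other path-like variables survive
```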

bitsandbytes/cuda_setup/main.py

Lines changed: 8 additions & 4 deletions
@@ -64,9 +64,10 @@ def generate_instructions(self):
        self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a')
        self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc')
        self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.')
-       self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://github.com/TimDettmers/bitsandbytes/blob/main/cuda_install.sh')
+       self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh')
        self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
        self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
+
        return
 
        make_cmd = f'CUDA_VERSION={self.cuda_version_string}'
@@ -214,8 +215,11 @@ def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]:
    paths = set()
    for libname in CUDA_RUNTIME_LIBS:
        for path in candidate_paths:
-           if (path / libname).is_file():
-               paths.add(path / libname)
+           try:
+               if (path / libname).is_file():
+                   paths.add(path / libname)
+           except PermissionError:
+               pass
    return paths
 
 
@@ -361,4 +365,4 @@ def evaluate_cuda_setup():
        "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
        binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so"
 
-   return binary_name, cudart_path, cc, cuda_version_string
+   return binary_name, cudart_path, cc, cuda_version_string
