@@ -46,7 +46,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 Uses the normalized float 4 (nf4) data type. This is recommended over "fp4" based on the paper's experimental results and theoretical analysis.

 ``` bash
-pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes

 litgpt generate base --quantize bnb.nf4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
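The 4-bit data types above (nf4, fp4) both work the same way mechanically: each block of weights is scaled by its absolute maximum, and each scaled value is stored as the index of the nearest entry in a 16-entry codebook. The sketch below illustrates that mechanism with a hypothetical uniform codebook; the actual nf4/fp4 codebook values live inside bitsandbytes and are not reproduced here.

```python
import numpy as np

# Illustrative codebook-based 4-bit quantization. The uniform codebook below
# is a stand-in; the real nf4 codebook is non-uniform (quantiles of a normal
# distribution, per the QLoRA paper).
codebook = np.linspace(-1.0, 1.0, 16).astype(np.float32)  # 16 = 2**4 levels

def quantize_4bit(x: np.ndarray, absmax: float) -> np.ndarray:
    # Normalize to [-1, 1], then store the index of the nearest codebook entry.
    idx = np.abs(codebook[None, :] - (x / absmax)[:, None]).argmin(axis=1)
    return idx.astype(np.uint8)

def dequantize_4bit(idx: np.ndarray, absmax: float) -> np.ndarray:
    return codebook[idx] * absmax

x = np.array([0.5, -0.25, 1.0, 0.0], dtype=np.float32)
absmax = float(np.abs(x).max())  # assumes x is not all zeros
x_hat = dequantize_4bit(quantize_4bit(x, absmax), absmax)
```

With 16 levels, the reconstruction error per value is bounded by half a codebook step times the block's absmax, which is why smaller quantization blocks give better accuracy at the cost of storing more scaling constants.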
@@ -62,7 +62,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 On average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model).

 ``` bash
-pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes

 litgpt generate base --quantize bnb.nf4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
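The "0.37 bits per parameter ≈ 3 GB for a 65B model" figure quoted above is easy to verify with back-of-the-envelope arithmetic. The helper name below is hypothetical, purely for illustration:

```python
# Sanity check for the double-quantization saving quoted above
# (an illustrative calculation; not part of litgpt or bitsandbytes).
def dq_saving_gb(num_params: float, bits_per_param: float = 0.37) -> float:
    """Memory saved by double quantization, in gigabytes (8 bits per byte)."""
    return num_params * bits_per_param / 8 / 1e9

print(f"{dq_saving_gb(65e9):.2f} GB")  # ~3 GB for a 65B model
```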
@@ -77,7 +77,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 Uses pure FP4 quantization.

 ``` bash
-pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes

 litgpt generate base --quantize bnb.fp4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
@@ -93,7 +93,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 On average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model).

 ``` bash
-pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes

 litgpt generate base --quantize bnb.fp4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
@@ -106,7 +106,7 @@ Memory used: 5.38 GB
 Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check out the [paper](https://arxiv.org/abs/2110.02861) to learn more about how it works.

 ``` bash
-pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes

 litgpt generate base --quantize bnb.int8 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision 16-true --max_new_tokens 256
 ...
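At the core of 8-bit quantization is absmax scaling: scale the tensor so its largest magnitude maps to 127, then round to int8. The sketch below shows only that core idea; bitsandbytes' actual LLM.int8() scheme additionally keeps outlier feature dimensions in fp16, which is what preserves accuracy at scale.

```python
import numpy as np

# Minimal absmax int8 quantization sketch (illustrative only; bitsandbytes
# also applies per-block scaling and fp16 outlier handling).
def quantize_absmax_int8(x: np.ndarray):
    scale = float(np.abs(x).max()) / 127.0  # assumes x is not all zeros
    q = np.round(x / scale).astype(np.int8)
    return q, scale

def dequantize_int8(q: np.ndarray, scale: float) -> np.ndarray:
    return q.astype(np.float32) * scale

x = np.array([0.1, -0.5, 1.27, -1.27], dtype=np.float32)
q, s = quantize_absmax_int8(x)
x_hat = dequantize_int8(q, s)
```

Each int8 weight costs 1 byte instead of 2 for bf16/fp16, which matches the roughly halved "Memory used" figures reported for `bnb.int8` compared to unquantized runs.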