
Commit 6dfa5cc

Support 4bit BNB layers meta-device materialization (#19150)
1 parent 1284713 commit 6dfa5cc

File tree

7 files changed: +330 additions, -32 deletions


requirements/pytorch/extra.txt

Lines changed: 1 addition & 1 deletion
@@ -8,4 +8,4 @@ hydra-core >=1.0.5, <1.4.0
 jsonargparse[signatures] >=4.26.1, <4.27.0
 rich >=12.3.0, <13.6.0
 tensorboardX >=2.2, <2.7.0  # min version is set by torch.onnx missing attribute
-bitsandbytes <=0.41.1
+bitsandbytes ==0.41.1  # strict

src/lightning/fabric/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `lightning.fabric.utilities.AttributeDict` for convenient dict-attribute access to represent state in script ([#18943](https://github.com/Lightning-AI/lightning/pull/18943))
 
 
+- Added support for meta-device initialization and materialization of 4-bit Bitsandbytes layers ([#19150](https://github.com/Lightning-AI/lightning/pull/19150))
+
+
 - Added `TransformerEnginePrecision(fallback_compute_dtype=)` to control the dtype of operations that don't support fp8 ([#19082](https://github.com/Lightning-AI/lightning/pull/19082))
 

src/lightning/fabric/plugins/precision/bitsandbytes.py

Lines changed: 168 additions & 26 deletions
@@ -13,19 +13,21 @@
 # limitations under the License.
 import functools
 import logging
+import math
 import os
 import warnings
 from contextlib import ExitStack
 from functools import partial
 from types import ModuleType
-from typing import Any, Callable, ContextManager, Literal, Optional, OrderedDict, Set, Type
+from typing import Any, Callable, ContextManager, Literal, Optional, OrderedDict, Set, Tuple, Type, cast
 
 import torch
 from lightning_utilities import apply_to_collection
 from lightning_utilities.core.imports import RequirementCache
 from torch import Tensor
+from torch.nn import init
 from torch.nn.modules.module import _IncompatibleKeys
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from lightning.fabric.plugins.precision.precision import Precision
 from lightning.fabric.plugins.precision.utils import (
@@ -37,7 +39,8 @@
 
 log = logging.getLogger(__name__)
 
-_BITSANDBYTES_AVAILABLE = RequirementCache("bitsandbytes>=0.41.0")
+# TODO: unpin after resolving the `quant_state` format breaking changes
+_BITSANDBYTES_AVAILABLE = RequirementCache("bitsandbytes==0.41.0")
 
 
 class BitsandbytesPrecision(Precision):
@@ -109,6 +112,7 @@ def convert_module(self, module: torch.nn.Module) -> torch.nn.Module:
         # convert modules if they haven't been converted already
         bnb = _import_bitsandbytes()
         if not any(isinstance(m, (bnb.nn.Linear8bitLt, bnb.nn.Linear4bit)) for m in module.modules()):
+            # this will not quantize the model but only replace the layer classes
             _convert_layers(module, self._linear_cls, self.ignore_modules)
 
         # set the compute dtype if necessary
@@ -164,11 +168,36 @@ def _quantize_on_load_hook(quantize_fn: Callable[[torch.Tensor], None], state_di
 
 
 def _ignore_missing_weights_hook(module: torch.nn.Module, incompatible_keys: _IncompatibleKeys) -> None:
+    # since we manually loaded the weight in the `_quantize_on_load_hook` hook, we need to avoid this missing key false
+    # positive
     for key in reversed(incompatible_keys.missing_keys):
         if key.endswith("weight"):
             incompatible_keys.missing_keys.remove(key)
 
 
+def _replace_param(
+    param: torch.nn.Parameter, data: torch.Tensor, quant_state: Optional[Tuple] = None
+) -> torch.nn.Parameter:
+    bnb = _import_bitsandbytes()
+
+    # doing `param.data = weight` raises a RuntimeError if param.data was on meta-device, so
+    # we need to re-create the parameters instead of overwriting the data
+    if param.device.type == "meta":
+        if isinstance(param, bnb.nn.Params4bit):
+            return bnb.nn.Params4bit(
+                data,
+                requires_grad=data.requires_grad,
+                quant_state=quant_state,
+                compress_statistics=param.compress_statistics,
+                quant_type=param.quant_type,
+            )
+        return torch.nn.Parameter(data, requires_grad=data.requires_grad)
+    param.data = data
+    if isinstance(param, bnb.nn.Params4bit):
+        param.quant_state = quant_state
+    return param
+
+
 @functools.lru_cache(maxsize=1)
 def _import_bitsandbytes() -> ModuleType:
     if not _BITSANDBYTES_AVAILABLE:
@@ -192,51 +221,160 @@ class _Linear8bitLt(bnb.nn.Linear8bitLt):
 
         def __init__(self, *args: Any, device: Optional[_DEVICE] = None, threshold: float = 6.0, **kwargs: Any) -> None:
             super().__init__(*args, device=device, threshold=threshold, **kwargs)
+            self.weight = cast(bnb.nn.Int8Params, self.weight)  # type: ignore[has-type]
+            self.bias = cast(Optional[torch.nn.Parameter], self.bias)  # type: ignore[has-type]
             # if the device is CUDA or we are under a CUDA context manager, quantize the weight here, so we don't end up
             # filling the device memory with float32 weights which could lead to OOM
             if torch.tensor(0, device=device).device.type == "cuda":
-                self._quantize_weight(self.weight.data)
-            self._register_load_state_dict_pre_hook(partial(_quantize_on_load_hook, self._quantize_weight))
+                self.quantize_()
+            self._register_load_state_dict_pre_hook(partial(_quantize_on_load_hook, self.quantize_))
             self.register_load_state_dict_post_hook(_ignore_missing_weights_hook)
 
-        def _quantize_weight(self, weight: torch.Tensor) -> None:
+        def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torch.device] = None) -> None:
+            """Inplace quantize."""
+            if weight is None:
+                weight = self.weight.data
+            if weight.data.dtype == torch.int8:
+                # already quantized
+                return
+            assert isinstance(self.weight, bnb.nn.Int8Params)
+            self.weight = self.quantize(self.weight, weight, device)
+
+        @staticmethod
+        def quantize(
+            int8params: bnb.nn.Int8Params, weight: torch.Tensor, device: Optional[torch.device]
+        ) -> bnb.nn.Int8Params:
+            device = device or torch.device("cuda")
+            if device.type != "cuda":
+                raise RuntimeError(f"Unexpected device type: {device.type}")
             # https://github.com/TimDettmers/bitsandbytes/blob/0.41.0/bitsandbytes/nn/modules.py#L291-L302
-            B = weight.contiguous().to(device="cuda", dtype=torch.float16)
-            if self.state.has_fp16_weights:
-                self.weight.data = B
+            B = weight.contiguous().to(device=device, dtype=torch.float16)
+            if int8params.has_fp16_weights:
+                int8params.data = B
             else:
                 CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
                 del CBt
                 del SCBt
-                self.weight.data = CB
-                setattr(self.weight, "CB", CB)
-                setattr(self.weight, "SCB", SCB)
+                int8params.data = CB
+                setattr(int8params, "CB", CB)
+                setattr(int8params, "SCB", SCB)
+            return int8params
+
+        def to_empty(self, *, device: _DEVICE, recurse: bool = True) -> Self:
+            if self.weight.device.type == "meta":
+                # need custom logic if int8params is on meta device
+                raise NotImplementedError
+            if self.weight.dtype == torch.uint8:  # was quantized
+                # need the original shape here
+                raise NotImplementedError
+            device = torch.device(device)
+            weight = torch.empty_like(self.weight.data, device=device)
+            if device.type == "cuda":  # re-quantize
+                self.quantize_(weight, device)
+            else:
+                self.weight = _replace_param(self.weight, weight)
+            if self.bias is not None:
+                self.bias = _replace_param(self.bias, torch.empty_like(self.bias, device=device))
+            return self
+
+        def reset_parameters(self) -> None:
+            # from `torch.nn.Linear.reset_parameters`
+            if self.bias is not None:
+                fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                init.uniform_(self.bias, -bound, bound)
+
+            linear_init_finished = isinstance(self.weight, bnb.nn.Params4bit)
+            if linear_init_finished and self.weight.dtype == torch.uint8:  # was quantized
+                # need the original shape here
+                raise NotImplementedError
+            weight = self.weight.data
+            torch.nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
+            if linear_init_finished:
+                if self.weight.device.type == "meta":
+                    # need custom logic if int8params is on meta device
+                    raise NotImplementedError
+                if self.weight.device.type == "cuda":  # re-quantize
+                    self.quantize_(weight)
+                else:
+                    self.weight = _replace_param(self.weight, weight)
 
     class _Linear4bit(bnb.nn.Linear4bit):
-        """Wraps `bnb.nn.Linear4bit` and enables instantiation directly on the device and re-quantizaton when loading
-        the state dict."""
+        """Wraps `bnb.nn.Linear4bit` to enable: instantiation directly on the device, re-quantizaton when loading the
+        state dict, meta-device initialization, and materialization."""
 
         def __init__(self, *args: Any, device: Optional[_DEVICE] = None, **kwargs: Any) -> None:
             super().__init__(*args, device=device, **kwargs)
+            self.weight = cast(bnb.nn.Params4bit, self.weight)  # type: ignore[has-type]
+            self.bias = cast(Optional[torch.nn.Parameter], self.bias)  # type: ignore[has-type]
             # if the device is CUDA or we are under a CUDA context manager, quantize the weight here, so we don't end up
             # filling the device memory with float32 weights which could lead to OOM
             if torch.tensor(0, device=device).device.type == "cuda":
-                self._quantize_weight(self.weight.data)
-            self._register_load_state_dict_pre_hook(partial(_quantize_on_load_hook, self._quantize_weight))
+                self.quantize_()
+            self._register_load_state_dict_pre_hook(partial(_quantize_on_load_hook, self.quantize_))
             self.register_load_state_dict_post_hook(_ignore_missing_weights_hook)
 
-        def _quantize_weight(self, weight: torch.Tensor) -> None:
+        def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torch.device] = None) -> None:
+            """Inplace quantize."""
+            if weight is None:
+                weight = self.weight.data
+            if weight.data.dtype == torch.uint8:
+                # already quantized
+                return
+            assert isinstance(self.weight, bnb.nn.Params4bit)
+            self.weight = self.quantize(self.weight, weight, device)
+
+        @staticmethod
+        def quantize(
+            params4bit: bnb.nn.Params4bit, weight: torch.Tensor, device: Optional[torch.device]
+        ) -> bnb.nn.Params4bit:
+            device = device or torch.device("cuda")
+            if device.type != "cuda":
+                raise RuntimeError(f"Unexpected device type: {device.type}")
             # https://github.com/TimDettmers/bitsandbytes/blob/0.41.0/bitsandbytes/nn/modules.py#L156-L159
-            params4bit = self.weight
-            w = weight.contiguous().to(device="cuda", dtype=torch.half)
+            w = weight.contiguous().to(device=device, dtype=torch.half)
             w_4bit, quant_state = bnb.functional.quantize_4bit(
                 w,
                 blocksize=params4bit.blocksize,
                 compress_statistics=params4bit.compress_statistics,
                 quant_type=params4bit.quant_type,
             )
-            params4bit.data = w_4bit
-            params4bit.quant_state = quant_state
+            return _replace_param(params4bit, w_4bit, quant_state)
+
+        def to_empty(self, *, device: _DEVICE, recurse: bool = True) -> Self:
+            if self.weight.dtype == torch.uint8:  # was quantized
+                # cannot init the quantized params directly
+                weight = torch.empty(self.weight.quant_state[1], device=device, dtype=torch.half)  # type: ignore[arg-type]
+            else:
+                weight = torch.empty_like(self.weight.data, device=device)  # type: ignore[arg-type]
+            device = torch.device(device)
+            if device.type == "cuda":  # re-quantize
+                self.quantize_(weight, device)
+            else:
+                self.weight = _replace_param(self.weight, weight)
+            if self.bias is not None:
+                self.bias = _replace_param(self.bias, torch.empty_like(self.bias, device=device))
+            return self
+
+        def reset_parameters(self) -> None:
+            # from `torch.nn.Linear.reset_parameters`
+            if self.bias is not None:
+                fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                init.uniform_(self.bias, -bound, bound)
+
+            linear_init_finished = isinstance(self.weight, bnb.nn.Params4bit)
+            if linear_init_finished and self.weight.dtype == torch.uint8:  # was quantized
+                # cannot init the quantized params directly
+                weight = torch.empty(self.weight.quant_state[1], device=self.weight.device, dtype=torch.half)
+            else:
+                weight = self.weight.data
+            torch.nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
+            if linear_init_finished:
+                if self.weight.device.type == "cuda":  # re-quantize
+                    self.quantize_(weight)
+                else:
+                    self.weight = _replace_param(self.weight, weight)
 
     # Use a class instead `functools.partial` to respect `isinstance` checks and attribute accesses
     class _Int8LinearInference(_Linear8bitLt):
@@ -281,17 +419,21 @@ def _convert_layers(module: torch.nn.Module, linear_cls: Type, ignore_modules: S
         if isinstance(child, torch.nn.Linear) and not any(fullname.startswith(s) for s in ignore_modules):
             log.debug(f"Replacing layer {fullname!r} with bitsandbytes equivalent")
             has_bias = child.bias is not None
+            # since we are going to copy over the child's data, the device doesn't matter. I chose CPU
+            # to avoid spiking CUDA memory even though initialization is slower
+            # 4bit layers support quantizing from meta-device params so this is only relevant for 8-bit
+            _Linear4bit = globals()["_Linear4bit"]
+            device = torch.device("meta" if issubclass(linear_cls, _Linear4bit) else "cpu")
             replacement = linear_cls(
-                # since we are going to copy over the child's data, the device doesn't matter. I chose CPU
-                # to avoid spiking CUDA memory even though initialization is slower
                 child.in_features,
                 child.out_features,
                 bias=has_bias,
-                device=torch.device("cpu"),
+                device=device,
             )
             if has_bias:
-                replacement.bias.data = child.bias.data.clone()
-            replacement._quantize_weight(child.weight.data.clone())
+                replacement.bias = _replace_param(replacement.bias, child.bias.data.clone())
+            state = {"quant_state": replacement.weight.quant_state if issubclass(linear_cls, _Linear4bit) else None}
+            replacement.weight = _replace_param(replacement.weight, child.weight.data.clone(), **state)
             module.__setattr__(name, replacement)
         else:
             _convert_layers(child, linear_cls, ignore_modules, prefix=fullname)
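
Together with the materialization utilities added below in src/lightning/fabric/utilities/init.py, these changes let a model be instantiated on the meta device, converted to 4-bit layers without allocating any memory, and quantized only once it is materialized. A rough usage sketch, not part of this commit (it assumes a CUDA machine, torch >= 2.1 and bitsandbytes 0.41.x; the toy `Sequential` model is only for illustration):

import torch
from lightning.fabric.plugins.precision.bitsandbytes import BitsandbytesPrecision
from lightning.fabric.utilities.init import _materialize_meta_tensors

precision = BitsandbytesPrecision(mode="nf4", dtype=torch.float16)

# instantiate on the meta device: parameters have shapes but no storage, so nothing is quantized yet
with torch.device("meta"):
    model = torch.nn.Sequential(torch.nn.Linear(4096, 4096, bias=False))

# swaps `torch.nn.Linear` for the patched 4-bit class; the replacement is also created on the
# meta device, so the conversion no longer allocates CPU memory or spikes CUDA memory
model = precision.convert_module(model)

# materialization runs `to_empty()` + `reset_parameters()` on every meta-device submodule,
# which quantizes the freshly initialized weight directly in 4 bits on the GPU
_materialize_meta_tensors(model, torch.device("cuda"))

# in practice a checkpoint would be loaded next; the load-state-dict pre-hook registered above
# re-quantizes the incoming full-precision weight in place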

src/lightning/fabric/utilities/init.py

Lines changed: 25 additions & 1 deletion
@@ -11,9 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import itertools
 from typing import Any, Callable, Dict, Optional, Sequence
 
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_13
+import torch
+
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_13, _TORCH_GREATER_EQUAL_2_1
+from lightning.fabric.utilities.types import _DEVICE
 
 if _TORCH_GREATER_EQUAL_1_13:
     from torch.overrides import TorchFunctionMode
@@ -54,3 +58,23 @@ def __torch_function__(
                 return kwargs["tensor"]
             return args[0]
         return func(*args, **kwargs)
+
+
+def _materialize(module: torch.nn.Module, device: _DEVICE) -> None:
+    """Materialize a module."""
+    if not _TORCH_GREATER_EQUAL_2_1:
+        raise RuntimeError("recurse=False requires torch 2.1")
+    module.to_empty(device=device, recurse=False)  # type: ignore[arg-type]
+    if not hasattr(module, "reset_parameters"):
+        raise TypeError(
+            f"Materialization requires that the `{type(module).__name__}.reset_parameters` method is implemented."
+            " This method is used to initialize any children parameters or buffers in this module."
+        )
+    module.reset_parameters()
+
+
+def _materialize_meta_tensors(module: torch.nn.Module, device: _DEVICE) -> None:
+    """Materialize all tensors in a given module."""
+    for module in module.modules():
+        if any(t.is_meta for t in itertools.chain(module.parameters(recurse=False), module.buffers(recurse=False))):
+            _materialize(module, device)
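
For reference, a small sketch (also not from this commit) of how these helpers behave on an ordinary module, assuming torch >= 2.1:

import torch
from lightning.fabric.utilities.init import _materialize_meta_tensors

# build a toy model on the meta device: parameters have shapes but no storage
with torch.device("meta"):
    model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 8))
assert all(p.is_meta for p in model.parameters())

# every submodule that owns meta parameters gets `to_empty()` followed by `reset_parameters()`;
# a submodule without a `reset_parameters` method would raise the TypeError defined above
_materialize_meta_tensors(model, torch.device("cpu"))
assert not any(p.is_meta for p in model.parameters())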

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - The Trainer now restores the training mode set through `.train()` or `.eval()` on a submodule-level when switching from validation to training ([#18951](https://github.com/Lightning-AI/lightning/pull/18951))
 
 
+- Added support for meta-device initialization and materialization of 4-bit Bitsandbytes layers ([#19150](https://github.com/Lightning-AI/lightning/pull/19150))
+
+
 - Added `TransformerEnginePrecision(fallback_compute_dtype=)` to control the dtype of operations that don't support fp8 ([#19082](https://github.com/Lightning-AI/lightning/pull/19082))
 
 