Skip to content

Commit 781fcd5

Browse files
committed
partially reverted 76b40a5
1 parent c6d0a84 commit 781fcd5

File tree

3 files changed

+64
-47
lines changed

3 files changed

+64
-47
lines changed

bitsandbytes/functional.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -571,9 +571,9 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n
571571
class QuantState:
572572
"""container for quantization state components to work with Params4bit and similar classes"""
573573
valid_quant_types = ('fp4', 'nf4')
574-
valid_qs_type_keys = [f"quant_state.bitsandbytes__{x}" for x in valid_quant_types]
575-
valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state',
576-
'quant_type', 'blocksize', 'dtype', 'shape', 'nested_blocksize', 'nested_dtype', 'nested_offset']
574+
valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
575+
valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
576+
'blocksize', 'dtype', 'shape', 'nested_blocksize', 'nested_dtype', 'nested_offset']
577577

578578
def __init__(self, absmax, shape=None, code=None, blocksize=None, quant_type=None, dtype=None, offset=None, state2=None):
579579
self.absmax = absmax
@@ -611,16 +611,19 @@ def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState
611611
"""
612612

613613
# unpacking tensor with non-tensor components
614-
qs_key = [k for k, v in qs_dict.items() if k in cls.valid_qs_type_keys and isinstance(v, torch.Tensor)]
614+
qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)]
615615
if not len(qs_key) and 'quant_type' not in qs_dict:
616-
raise ValueError("Expected packed or unpacked quant_state items, found neither")
617-
elif len(qs_key) != 1:
618-
raise ValueError(f"There should be exaclly one quant_state item with key from {cls.valid_qs_type_keys}. Detected {len(qs_key)} such items")
616+
raise ValueError("Expected packed or unpacked quant_state items, found neither")
617+
elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys:
618+
raise ValueError(f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.")
619619

620620
# unpacking minor and non-tensor quant state items if necessary
621621
if len(qs_key) == 1:
622622
qs_key = qs_key[0]
623-
qs_dict |= unpack_tensor_to_dict(qs_dict.pop(qs_key))
623+
qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(qs_key)))
624+
625+
qs_dict = {k.split('.')[-1]: v for k, v in qs_dict.items()} # strip prefixes
626+
assert set(qs_dict.keys()).issubset(cls.valid_qs_keys)
624627

625628
if 'nested_absmax' in qs_dict:
626629
offset = torch.tensor(float(qs_dict['nested_offset'])).to(device)
@@ -654,7 +657,7 @@ def as_dict(self, packed=False):
654657
'quant_type': self.quant_type,
655658
'absmax': self.absmax,
656659
'blocksize': self.blocksize,
657-
'quant_map': self.code,
660+
'quant_map': self.code,
658661
'dtype': str(self.dtype).strip('torch.'),
659662
'shape': tuple(self.shape) if self.nested else None,
660663
}
@@ -677,6 +680,7 @@ def as_dict(self, packed=False):
677680
def to(self, device):
678681
# make sure the quantization state is on the right device
679682
self.absmax = self.absmax.to(device)
683+
self.offset = self.offset.to(device)
680684
if self.nested:
681685
self.offset = self.offset.to(device)
682686
self.state2.absmax = self.state2.absmax.to(device)

bitsandbytes/nn/modules.py

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -155,28 +155,38 @@ def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64,
155155
return self
156156

157157
@classmethod
158-
def from_state_dict(cls, state_dict, prefix="", requires_grad=False):
159-
data = state_dict.pop(prefix.rstrip('.'))
160-
161-
# extracting components for QuantState from state_dict
162-
qs_dict = {}
163-
for k, v in state_dict.items():
164-
if k.replace(prefix, '').split('.')[0] in QuantState.valid_qs_keys:
165-
qs_dict[k] = v
166-
state_dict = {k: v for k, v in state_dict.items() if k not in qs_dict}
167-
qs_dict = {k.replace(prefix, ''): v for k, v in qs_dict.items()}
168-
169-
if data.device.type != "cuda":
170-
raise ValueError(f"`data.device.type` must be 'cuda', detected {data.device.type}")
171-
172-
cls.requires_grad = requires_grad
173-
cls.quant_state = QuantState.from_dict(qs_dict=qs_dict, device=data.device)
174-
cls.blocksize = cls.quant_state.blocksize # this attribute can be deprecated - it duplicates same one in quant_state
175-
cls.compress_statistics = cls.quant_state.nested # this attribute can be deprecated - it duplicates quant_state.nested
176-
cls.quant_type = cls.quant_state.quant_type # this attribute can be deprecated - it duplicates same one in quant_state
177-
178-
self = torch.Tensor._make_subclass(cls, data=data.to(data.device))
179-
return self, state_dict
158+
def from_prequantized(cls, data, quantized_stats, requires_grad=False, device='cuda', **kwargs):
159+
self = torch.Tensor._make_subclass(cls, data.to(device))
160+
self.requires_grad = requires_grad
161+
self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device)
162+
self.blocksize = self.quant_state.blocksize
163+
self.compress_statistics = self.quant_state.nested
164+
self.quant_type = self.quant_state.quant_type
165+
return self
166+
167+
# @classmethod
168+
# def from_state_dict(cls, state_dict, prefix="", requires_grad=False):
169+
# data = state_dict.pop(prefix.rstrip('.'))
170+
171+
# # extracting components for QuantState from state_dict
172+
# qs_dict = {}
173+
# for k, v in state_dict.items():
174+
# if k.replace(prefix, '').split('.')[0] in QuantState.valid_qs_keys:
175+
# qs_dict[k] = v
176+
# state_dict = {k: v for k, v in state_dict.items() if k not in qs_dict}
177+
# qs_dict = {k.replace(prefix, ''): v for k, v in qs_dict.items()}
178+
179+
# if data.device.type != "cuda":
180+
# raise ValueError(f"`data.device.type` must be 'cuda', detected {data.device.type}")
181+
182+
# cls.requires_grad = requires_grad
183+
# cls.quant_state = QuantState.from_dict(qs_dict=qs_dict, device=data.device)
184+
# cls.blocksize = cls.quant_state.blocksize # this attribute can be deprecated - it duplicates same one in quant_state
185+
# cls.compress_statistics = cls.quant_state.nested # this attribute can be deprecated - it duplicates quant_state.nested
186+
# cls.quant_type = cls.quant_state.quant_type # this attribute can be deprecated - it duplicates same one in quant_state
187+
188+
# self = torch.Tensor._make_subclass(cls, data=data.to(data.device))
189+
# return self, state_dict
180190

181191
def cuda(self, device):
182192
w = self.data.contiguous().half().cuda(device)
@@ -251,17 +261,17 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
251261
for k, v in self.weight.quant_state.as_dict(packed=True).items():
252262
destination[prefix + "weight." + k] = v if keep_vars else v.detach()
253263

254-
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
255-
missing_keys, unexpected_keys, error_msgs):
256-
# Note: super()._load_from_state_dict() is not called here intentionally.
257-
if self.bias is not None:
258-
bias_data = state_dict.pop(prefix + "bias", None)
259-
self.bias.data = bias_data.to(self.bias.data.device)
260-
261-
self.weight, state_dict = bnb.nn.Params4bit.from_state_dict(
262-
state_dict, prefix=prefix + "weight" + ".", requires_grad=False
263-
)
264-
unexpected_keys.extend(state_dict.keys())
264+
# def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
265+
# missing_keys, unexpected_keys, error_msgs):
266+
# # Note: super()._load_from_state_dict() is not called here intentionally.
267+
# if self.bias is not None:
268+
# bias_data = state_dict.pop(prefix + "bias", None)
269+
# self.bias.data = bias_data.to(self.bias.data.device)
270+
271+
# self.weight, state_dict = bnb.nn.Params4bit.from_state_dict(
272+
# state_dict, prefix=prefix + "weight" + ".", requires_grad=False
273+
# )
274+
# unexpected_keys.extend(state_dict.keys())
265275

266276
def forward(self, x: torch.Tensor):
267277
# weights are cast automatically as Int8Params, but the bias has to be cast manually

tests/test_linear4bit.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import torch
88

99
import bitsandbytes as bnb
10-
from bitsandbytes import functional as F
11-
from bitsandbytes.nn.modules import Linear4bit
1210

1311

1412
@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
@@ -41,7 +39,10 @@ def test_linear_serialization(quant_type, compress_statistics, bias):
4139

4240
# saving to state_dict:
4341
sd = linear_q.state_dict()
44-
42+
# restoring from state_dict:
43+
bias_data2 = sd.pop("bias", None)
44+
weight_data2 = sd.pop("weight")
45+
weight2 = bnb.nn.Params4bit.from_prequantized(quantized_stats=sd, data=weight_data2)
4546
# creating new layer with same params:
4647
linear_q2 = bnb.nn.Linear4bit(
4748
linear.in_features,
@@ -53,15 +54,17 @@ def test_linear_serialization(quant_type, compress_statistics, bias):
5354
device=device, # TODO create on meta device to save loading time
5455
)
5556
# loading weights from state_dict:
56-
linear_q2.load_state_dict(sd)
57+
linear_q2.weight = weight2.to(device)
58+
if bias:
59+
linear_q2.bias = torch.nn.Parameter(bias_data2)
5760

5861
# MATCHING
5962
a, b = linear_q.weight, linear_q2.weight
6063

6164
assert a.device == b.device
6265
assert a.dtype == b.dtype
6366
assert torch.equal(a, b)
64-
67+
6568
q0 = a.quant_state
6669
q1 = b.quant_state
6770
for attr in ('code', 'dtype', 'blocksize', 'absmax'):

0 commit comments

Comments
 (0)