
Commit 3e66bcb

Merge branch 'main' into stable
2 parents: ad82d2d + d3e14ef

1 file changed: dequant.py (+54 −1)
@@ -23,7 +23,7 @@ def dequantize_tensor(tensor, dtype=None, dequant_dtype=None):
         return dequantize(tensor.data, qtype, oshape, dtype=dequant_dtype).to(dtype)
     else:
         # this is incredibly slow
-        tqdm.write(f"Falling back to numpy dequant for qtype: {qtype}")
+        tqdm.write(f"Falling back to numpy dequant for qtype: {getattr(qtype, 'name', repr(qtype))}")
         new = gguf.quants.dequantize(tensor.cpu().numpy(), qtype)
         return torch.from_numpy(new).to(tensor.device, dtype=dtype)
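Context for the one-line change above: `gguf.GGMLQuantizationType` is an `IntEnum`, and on Python 3.11+ an `IntEnum` formats in f-strings as its bare integer value, so the old message printed an opaque number. Using `.name` restores the readable label, and the `getattr` fallback covers a hypothetical unknown type id arriving as a plain `int`. A minimal sketch of the difference:

```python
import gguf

qtype = gguf.GGMLQuantizationType.IQ4_NL
print(f"{qtype}")                                # "20" on Python 3.11+ (IntEnum formats as its value)
print(f"{getattr(qtype, 'name', repr(qtype))}")  # "IQ4_NL"
print(f"{getattr(999, 'name', repr(999))}")      # "999" - plain ints have no .name, so repr() is used
```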

@@ -48,6 +48,10 @@ def to_uint32(x):
     x = x.view(torch.uint8).to(torch.int32)
     return (x[:, 0] | x[:, 1] << 8 | x[:, 2] << 16 | x[:, 3] << 24).unsqueeze(1)

+def to_uint16(x):
+    x = x.view(torch.uint8).to(torch.int32)
+    return (x[:, 0] | x[:, 1] << 8).unsqueeze(1)
+
 def split_block_dims(blocks, *args):
     n_max = blocks.shape[1]
     dims = list(args) + [n_max - sum(args)]
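The new `to_uint16` mirrors the existing `to_uint32` directly above it: it reinterprets each row as little-endian bytes, widens to `int32` so the shift cannot overflow `uint8`, and recombines the byte pair. It exists so `dequantize_blocks_IQ4_XS` below can decode the packed `scales_h` field. A quick self-contained check, assuming the `(n, 2)` uint8 input shape implied by how `split_block_dims` slices each block:

```python
import torch

def to_uint16(x):
    x = x.view(torch.uint8).to(torch.int32)
    return (x[:, 0] | x[:, 1] << 8).unsqueeze(1)

# 0x1234 stored little-endian as the byte pair [0x34, 0x12]
raw = torch.tensor([[0x34, 0x12]], dtype=torch.uint8)
print(to_uint16(raw))  # tensor([[4660]]), and 4660 == 0x1234
```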
@@ -233,6 +237,53 @@ def dequantize_blocks_Q2_K(blocks, block_size, type_size, dtype=None):

     return qs.reshape((n_blocks, -1))

+# IQ quants
+KVALUES = torch.tensor([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=torch.int8)
+
+def dequantize_blocks_IQ4_NL(blocks, block_size, type_size, dtype=None):
+    n_blocks = blocks.shape[0]
+
+    d, qs = split_block_dims(blocks, 2)
+    d = d.view(torch.float16).to(dtype)
+
+    qs = qs.reshape((n_blocks, -1, 1, block_size//2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
+    qs = (qs & 0x0F).reshape((n_blocks, -1, 1)).to(torch.int32)
+
+    kvalues = KVALUES.to(qs.device).expand(*qs.shape[:-1], 16)
+    qs = torch.gather(kvalues, dim=-1, index=qs).reshape((n_blocks, -1))
+    del kvalues # should still be view, but just to be safe
+
+    return (d * qs)
+
+def dequantize_blocks_IQ4_XS(blocks, block_size, type_size, dtype=None):
+    n_blocks = blocks.shape[0]
+    d, scales_h, scales_l, qs = split_block_dims(blocks, 2, 2, QK_K // 64)
+    d = d.view(torch.float16).to(dtype)
+    scales_h = to_uint16(scales_h)
+
+    shift_a = torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2))
+    shift_b = torch.tensor([2 * i for i in range(QK_K // 32)], device=d.device, dtype=torch.uint8).reshape((1, -1, 1))
+
+    scales_l = scales_l.reshape((n_blocks, -1, 1)) >> shift_a.reshape((1, 1, 2))
+    scales_h = scales_h.reshape((n_blocks, -1, 1)) >> shift_b.reshape((1, -1, 1))
+
+    scales_l = scales_l.reshape((n_blocks, -1)) & 0x0F
+    scales_h = scales_h.reshape((n_blocks, -1)).to(torch.uint8) & 0x03
+
+    scales = (scales_l | (scales_h << 4)).to(torch.int8) - 32
+    dl = (d * scales.to(dtype)).reshape((n_blocks, -1, 1))
+
+    qs = qs.reshape((n_blocks, -1, 1, 16)) >> shift_a.reshape((1, 1, 2, 1))
+    qs = qs.reshape((n_blocks, -1, 32, 1)) & 0x0F
+
+    kvalues = KVALUES.to(qs.device).expand(*qs.shape[:-1], 16)
+    qs = torch.gather(kvalues, dim=-1, index=qs.to(torch.int32)).reshape((n_blocks, -1, 32))
+    del kvalues # see IQ4_NL
+    del shift_a
+    del shift_b
+
+    return (dl * qs).reshape((n_blocks, -1))
+
 dequantize_functions = {
     gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16,
     gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0,
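The key idea in both IQ4 dequantizers above: unlike the Q4_0-style quants, a 4-bit IQ4 value is not itself a scaled weight but an index into the 16-entry non-linear codebook `KVALUES` (the table llama.cpp calls `kvalues_iq4nl`), and `torch.gather` performs all lookups at once. A minimal sketch of a single lookup on a fabricated packed byte, not a real GGUF block:

```python
import torch

KVALUES = torch.tensor([-127, -104, -83, -65, -49, -35, -22, -10,
                        1, 13, 25, 38, 53, 69, 89, 113], dtype=torch.int8)

byte = torch.tensor([0xF0], dtype=torch.uint8)  # one byte packs two 4-bit indices
lo = (byte & 0x0F).to(torch.int64)              # low nibble  -> index 0
hi = (byte >> 4).to(torch.int64)                # high nibble -> index 15
print(KVALUES[lo], KVALUES[hi])                 # tensor([-127], ...), tensor([113], ...)
# scaling by the block's float16 `d` (per-sub-block `dl` for IQ4_XS) then
# yields the dequantized weights
```

IQ4_XS layers one refinement on top: each 32-weight sub-block gets a 6-bit scale, reconstructed by stitching the low 4 bits from `scales_l` and the high 2 bits from `scales_h`, then subtracting 32 to re-center it around zero.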
@@ -245,4 +296,6 @@ def dequantize_blocks_Q2_K(blocks, block_size, type_size, dtype=None):
     gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K,
     gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K,
     gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K,
+    gguf.GGMLQuantizationType.IQ4_NL: dequantize_blocks_IQ4_NL,
+    gguf.GGMLQuantizationType.IQ4_XS: dequantize_blocks_IQ4_XS,
 }
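Registering both functions in `dequantize_functions` is what routes IQ4 tensors through the fast torch path instead of the numpy fallback logged in the first hunk. A hedged sketch of how a caller can resolve the per-type geometry and handler; `gguf.GGML_QUANT_SIZES` is the gguf-py table of `(block_size, type_size)` pairs, though whether this file's `dequantize` helper uses exactly this lookup is not shown in the diff:

```python
import gguf

qtype = gguf.GGMLQuantizationType.IQ4_NL
block_size, type_size = gguf.GGML_QUANT_SIZES[qtype]  # (32, 18): 32 weights per 18-byte block
dequant_fn = dequantize_functions[qtype]              # -> dequantize_blocks_IQ4_NL
# raw bytes are viewed as (n_blocks, type_size) rows before being handed to dequant_fn
```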
