
Commit f46686b

Revert "DEMO repack"
This reverts commit caf0e42.
1 parent caf0e42 commit f46686b

File tree: 1 file changed (+8, -44 lines)


convert_hf_to_gguf.py

Lines changed: 8 additions & 44 deletions
```diff
@@ -489,7 +489,7 @@ def prepare_tensors(self):
             old_dtype = data_torch.dtype

             # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32, torch.int32):
+            if data_torch.dtype not in (torch.float16, torch.float32):
                 data_torch = data_torch.to(torch.float32)

             # use the first number-like part of the tensor name as the block id
```
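
The restored check can be exercised on its own. The sketch below is illustrative only: the helper name `coerce_dtype` is invented for this example and is not part of the converter. It shows that, with the revert, int32 tensors (which the DEMO change had whitelisted so packed expert weights could survive) are once again cast to float32 like any other unsupported dtype.

```python
# Standalone sketch of the restored dtype rule (helper name is hypothetical).
import torch

def coerce_dtype(data_torch: torch.Tensor) -> torch.Tensor:
    # convert any unsupported data types to float32
    if data_torch.dtype not in (torch.float16, torch.float32):
        data_torch = data_torch.to(torch.float32)
    return data_torch

print(coerce_dtype(torch.zeros(2, dtype=torch.float16)).dtype)   # torch.float16 (kept)
print(coerce_dtype(torch.zeros(2, dtype=torch.bfloat16)).dtype)  # torch.float32 (converted)
print(coerce_dtype(torch.zeros(2, dtype=torch.int32)).dtype)     # torch.float32 (no longer exempt)
```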
```diff
@@ -7093,7 +7093,6 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])

     _experts: list[dict[str, Tensor]] | None = None
-    _experts_s: list[dict[str, Tensor]] | None = None # scale (for quantized experts)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
```
```diff
@@ -7121,42 +7120,28 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]

-            if self._experts_s is None:
-                self._experts_s = [{} for _ in range(self.block_count)]
-
-            if name.endswith(".weight_packed"):
-                self._experts[bid][name] = data_torch
+            self._experts[bid][name] = data_torch

-            if name.endswith(".weight_scale"):
-                self._experts_s[bid][name] = data_torch
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []

-            # TODO @ngxson : this is demo, won't compat with other models
-            if len(self._experts[bid]) + len(self._experts_s[bid]) >= n_experts * 3 * 2:
                 # merge the experts into a single 3d tensor
                 for w_name in ["down_proj", "gate_proj", "up_proj"]:
                     datas: list[Tensor] = []
-                    datas_s: list[Tensor] = []

                     for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight_packed"
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]

-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight_scale"
-                        datas_s.append(self._experts_s[bid][ename])
-                        del self._experts_s[bid][ename]
-
-                    data_packed = torch.stack(datas, dim=0)
-                    data_scale = torch.stack(datas_s, dim=0)
+                    data_torch = torch.stack(datas, dim=0)

                     merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"

                     new_name = self.map_tensor_name(merged_name)

-                    target_shape = (n_experts, data_packed.shape[1], data_packed.shape[2] * 32)
-                    self.repack_compressed_tensor(new_name, data_packed, data_scale, target_shape)
-                    #tensors.append((new_name, data_torch))
-                    return []
+                    tensors.append((new_name, data_torch))
+                return tensors
             else:
                 return []
```
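
To see what the restored merge path produces, here is a minimal standalone sketch of the stacking step: once all `n_experts * 3` per-expert weights of a layer have been collected, the per-expert 2D matrices for each projection are stacked along a new leading dimension into a single 3D tensor. The toy sizes (`n_experts=4`, `n_ff=16`, `n_embd=8`) and the single-layer dict are assumptions for illustration, not values from any model.

```python
# Toy version of the expert-merge step restored above (sizes are made up).
import torch

n_experts, n_ff, n_embd = 4, 16, 8
bid = 0  # layer index

# stand-in for self._experts[bid]: one 2D weight per expert
experts = {
    f"model.layers.{bid}.mlp.experts.{xid}.up_proj.weight": torch.randn(n_ff, n_embd)
    for xid in range(n_experts)
}

datas = [experts[f"model.layers.{bid}.mlp.experts.{xid}.up_proj.weight"]
         for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)  # one 3D tensor for the whole layer
print(merged.shape)  # torch.Size([4, 16, 8]) = (n_experts, n_ff, n_embd)
```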

```diff
@@ -7191,27 +7176,6 @@ def prepare_tensors(self):
         if len(experts) > 0:
             raise ValueError(f"Unprocessed experts: {experts}")

-    def repack_compressed_tensor(self, new_name: str, blocks: Tensor, scales: Tensor, shape: Sequence[int]):
-        assert blocks.dtype == torch.int32
-        assert len(blocks.shape) == 3
-        assert len(scales.shape) == 3
-        logger.info(f"Repacking compressed_tensor {new_name} with shape {shape}")
-        # flatten the first two dimensions
-        blocks = blocks.reshape(-1, blocks.shape[2])
-        scales = scales.reshape(-1, scales.shape[2])
-        # TODO: for kimi-k2, this will cast bf16 to f16, this may reduce the accuracy of the model
-        # we have to do this because Q4_0 in GGUF only supports f16 scales
-        scales = scales.to(torch.float16)
-        scales = scales.to(torch.float16).view(torch.uint16).reshape(-1, 1)
-        repacked = blocks.reshape((blocks.shape[0] * blocks.shape[1]) // 4, 4)
-        repacked = repacked.view(torch.uint16)
-        assert repacked.shape[0] == scales.shape[0] # should have the same number of blocks
-        repacked = torch.concat([scales, repacked], dim=1)
-        repacked = repacked.view(torch.uint8)
-        shape_list = list(shape)
-        shape_list[-1] = (shape_list[-1] // 32) * 18 # block * 18 bytes for Q4_0 block size
-        self.gguf_writer.add_tensor(new_name, repacked.numpy(), raw_dtype=gguf.GGMLQuantizationType.Q4_0, raw_shape=shape_list)
-

 @ModelBase.register("MiniMaxM2ForCausalLM")
 class MiniMaxM2Model(TextModel):
```
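The deleted `repack_compressed_tensor` helper depended on the Q4_0 block layout: each block of 32 weights occupies 18 bytes, a 2-byte f16 scale followed by 16 bytes holding the 32 packed 4-bit quants, which is where the `(shape_list[-1] // 32) * 18` arithmetic came from. Below is a minimal sketch of just that byte-layout bookkeeping, with random stand-in data and made-up sizes; it does not call the converter's API.

```python
# Byte-layout check for Q4_0 blocks, mirroring the removed repack arithmetic.
# All sizes here are illustrative assumptions.
import torch

QK4_0 = 32                    # weights per Q4_0 block
BLOCK_BYTES = 2 + QK4_0 // 2  # 2-byte f16 scale + 16 bytes of packed nibbles = 18

n_blocks = 6
# f16 scales, one per block, viewed as raw 16-bit words
scales = torch.randn(n_blocks, 1).to(torch.float16).view(torch.uint16)
# 4 int32 words = 16 bytes of packed 4-bit quants per block
quants = torch.randint(0, 2**31 - 1, (n_blocks, 4), dtype=torch.int32).view(torch.uint16)

repacked = torch.concat([scales, quants], dim=1).view(torch.uint8)
assert repacked.shape == (n_blocks, BLOCK_BYTES)  # 18 bytes per block, as in the diff
print(repacked.shape)  # torch.Size([6, 18])
```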
