Skip to content

Commit 09e3df4

Browse files
committed
[fix] fix weight conversion for Ling mini 2.0 with half (partial) rotary embeddings 🐛
1 parent e078a63 commit 09e3df4

File tree

1 file changed

+35
-6
lines changed

1 file changed

+35
-6
lines changed

convert_hf_to_gguf.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7828,6 +7828,28 @@ def prepare_tensors(self):
78287828
class BailingMoeV2Model(TextModel):
78297829
model_arch = gguf.MODEL_ARCH.BAILINGMOE_V2
78307830

7831+
@staticmethod
7832+
def permute(
7833+
weights: Tensor, n_head: int, n_head_kv: int | None, rope_dim: int | None
7834+
):
7835+
if n_head_kv is not None and n_head != n_head_kv:
7836+
n_head = n_head_kv
7837+
if rope_dim is None:
7838+
rope_dim = weights.shape[0] // n_head
7839+
weights_rope, weights_nope = weights.reshape(
7840+
n_head, weights.shape[0] // n_head, *weights.shape[1:]
7841+
).split([rope_dim, weights.shape[0] // n_head - rope_dim], dim=1)
7842+
return torch.cat(
7843+
[
7844+
weights_rope.reshape(
7845+
n_head, 2, rope_dim // 2, *weights_rope.shape[2:]
7846+
)
7847+
.swapaxes(1, 2)
7848+
.reshape(weights_rope.shape),
7849+
weights_nope,
7850+
], dim=1
7851+
).reshape(weights.shape)
7852+
78317853
def set_vocab(self):
78327854
self._set_vocab_gpt2()
78337855

@@ -7867,6 +7889,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
78677889
if match and int(match.group(1)) >= block_count:
78687890
return []
78697891

7892+
rope_dim = int(self.hparams['partial_rotary_factor'] * self.hparams['head_dim'])
78707893
if name.endswith("query_key_value.weight"):
78717894
n_head = self.hparams["num_attention_heads"]
78727895
n_kv_head = self.hparams.get("num_key_value_heads")
@@ -7876,10 +7899,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
78767899
q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
78777900

78787901
return [
7879-
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
7880-
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
7902+
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeV2Model.permute(q, n_head, n_head, rope_dim)),
7903+
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeV2Model.permute(k, n_head, n_kv_head, rope_dim)),
78817904
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
78827905
]
7906+
elif "attention.key_layernorm" in name or "attention.query_layernorm" in name:
7907+
mapping = {
7908+
"attention.key_layernorm": "self_attn.key_layernorm",
7909+
"attention.query_layernorm": "self_attn.query_layernorm",
7910+
}
7911+
for k, v in mapping.items():
7912+
name = name.replace(k, v)
7913+
return [(self.map_tensor_name(name), BailingMoeV2Model.permute(data_torch, 1, 1, rope_dim))]
78837914
elif name.find("mlp.experts") != -1:
78847915
n_experts = self.hparams["num_experts"]
78857916
assert bid is not None
@@ -7912,10 +7943,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
79127943
return tensors
79137944

79147945
pre_tensor_name_mapping = {
7915-
'attention.dense': 'self_attn.dense',
7916-
'attention.key_layernorm': 'self_attn.key_layernorm',
7917-
'attention.query_layernorm': 'self_attn.query_layernorm',
7918-
'mlp.gate.expert_bias': 'mlp.gate.e_score_correction.bias',
7946+
"attention.dense": "self_attn.dense",
7947+
"mlp.gate.expert_bias": "mlp.gate.e_score_correction.bias",
79197948
}
79207949
for k, v in pre_tensor_name_mapping.items():
79217950
name = name.replace(k, v)

0 commit comments

Comments
 (0)