Skip to content

Commit 8edece8

Browse files
authored
fix expert tensor mapping and spaces in vocab
1 parent 711ab17 commit 8edece8

File tree

1 file changed

+9 -5 lines changed

1 file changed

+9 -5 lines changed

convert_hf_to_gguf.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2656,11 +2656,13 @@ def set_vocab(self):
26562656
def decode_grok_token(token: dict, toktype: gguf.TokenType) -> tuple[gguf.TokenType, int, str]:
26572657
tokid: int = token["token"]
26582658
tokb: list[int] = token["bytes"]
2659+
if tokb == [32]:
2660+
tokb = [0xe2, 0x96, 0x81]
26592661
if len(tokb) == 1:
26602662
return gguf.TokenType.BYTE, tokid, "<0x{:02X}>".format(tokb[0])
26612663
else:
26622664
try:
2663-
tokc = bytes(tokb).decode("utf-8")
2665+
tokc = bytes(tokb).decode("utf-8").replace(" ", "▁")
26642666
except Exception:
26652667
tokc = None
26662668
if tokc is None or not all(tokb):
@@ -2722,7 +2724,7 @@ def set_gguf_parameters(self):
27222724

27232725
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
27242726
# process the experts separately
2725-
if name.find(".moe.") != -1:
2727+
if name.find(".moe.") != -1 or name.find(".block_sparse_moe.") != -1:
27262728
n_experts = self.hparams["num_local_experts"]
27272729

27282730
assert bid is not None
@@ -2736,17 +2738,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27362738
tensors: list[tuple[str, Tensor]] = []
27372739

27382740
# merge the experts into a single 3d tensor
2739-
for wid in ["linear", "linear_1", "linear_v"]:
2741+
for wid in [("linear", "w1"), ("linear_1", "w2"), ("linear_v", "w3")]:
27402742
datas: list[Tensor] = []
27412743

27422744
for xid in range(n_experts):
2743-
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
2745+
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
2746+
if ename not in self._experts[bid]:
2747+
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
27442748
datas.append(self._experts[bid][ename])
27452749
del self._experts[bid][ename]
27462750

27472751
data_torch = torch.stack(datas, dim=0)
27482752

2749-
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
2753+
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
27502754

27512755
new_name = self.map_tensor_name(merged_name)
27522756

0 commit comments

Comments (0)