@@ -2656,11 +2656,13 @@ def set_vocab(self):
26562656 def decode_grok_token (token : dict , toktype : gguf .TokenType ) -> tuple [gguf .TokenType , int , str ]:
26572657 tokid : int = token ["token" ]
26582658 tokb : list [int ] = token ["bytes" ]
2659+ if tokb == [32 ]:
2660+ tokb = [0xe2 , 0x96 , 0x81 ]
26592661 if len (tokb ) == 1 :
26602662 return gguf .TokenType .BYTE , tokid , "<0x{:02X}>" .format (tokb [0 ])
26612663 else :
26622664 try :
2663- tokc = bytes (tokb ).decode ("utf-8" )
2665+ tokc = bytes (tokb ).decode ("utf-8" ). replace ( " " , "▁" )
26642666 except Exception :
26652667 tokc = None
26662668 if tokc is None or not all (tokb ):
@@ -2722,7 +2724,7 @@ def set_gguf_parameters(self):
27222724
27232725 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
27242726 # process the experts separately
2725- if name .find (".moe." ) != - 1 :
2727+ if name .find (".moe." ) != - 1 or name . find ( ".block_sparse_moe." ) != - 1 :
27262728 n_experts = self .hparams ["num_local_experts" ]
27272729
27282730 assert bid is not None
@@ -2736,17 +2738,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27362738 tensors : list [tuple [str , Tensor ]] = []
27372739
27382740 # merge the experts into a single 3d tensor
2739- for wid in ["linear" , "linear_1" , "linear_v" ]:
2741+ for wid in [( "linear" , "w1" ), ( " linear_1" , "w2" ), ( " linear_v", "w3" ) ]:
27402742 datas : list [Tensor ] = []
27412743
27422744 for xid in range (n_experts ):
2743- ename = f"transformer.decoder_layer.{ bid } .moe.{ xid } .{ wid } .weight"
2745+ ename = f"transformer.decoder_layer.{ bid } .moe.{ xid } .{ wid [0 ]} .weight"
2746+ if ename not in self ._experts [bid ]:
2747+ ename = f"model.layers.{ bid } .block_sparse_moe.experts.{ xid } .{ wid [1 ]} .weight"
27442748 datas .append (self ._experts [bid ][ename ])
27452749 del self ._experts [bid ][ename ]
27462750
27472751 data_torch = torch .stack (datas , dim = 0 )
27482752
2749- merged_name = f"transformer.decoder_layer.{ bid } .moe.{ wid } .weight"
2753+ merged_name = f"transformer.decoder_layer.{ bid } .moe.{ wid [ 0 ] } .weight"
27502754
27512755 new_name = self .map_tensor_name (merged_name )
27522756
0 commit comments