@@ -555,6 +555,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
+        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
+            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
+            res = "grok-2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
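
For reference, each chkhsh above fingerprints a tokenizer's pre-tokenization behavior: convert_hf_to_gguf_update.py encodes a fixed probe string and hashes the resulting token IDs, and get_vocab_base_pre() looks the digest up in this table. Below is a minimal sketch of that scheme, assuming sha256 over the stringified token IDs; the probe text is a stand-in, so its digest will only match the table if the script's real probe string is substituted.

# Sketch: reproduce a chkhsh-style fingerprint for the Grok-2 tokenizer.
# Assumptions: the fingerprint is sha256 over str() of the encoded token IDs
# (as in convert_hf_to_gguf_update.py); "probe" is a placeholder string.
from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("alvarobartt/grok-2-tokenizer")
probe = "placeholder probe text"  # the real script uses a longer, emoji-heavy string
chkhsh = sha256(str(tokenizer.encode(probe)).encode()).hexdigest()
print(chkhsh)  # compare against the hard-coded hashes above
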
@@ -1905,57 +1908,109 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return tensors


-@Model.register("GrokForCausalLM")
+@Model.register("GrokForCausalLM", "Grok1ForCausalLM")
 class GrokModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK

     def set_vocab(self):
-        self._set_vocab_sentencepiece()
+        if (self.dir_model / 'tokenizer.model').is_file():
+            self._set_vocab_sentencepiece()
+            return
+
+        if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
+            logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
+            sys.exit(1)
+
+        self._set_vocab_gpt2()

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

     def set_gguf_parameters(self):
         super().set_gguf_parameters()

-    _experts: list[dict[str, Tensor]] | None = None
+        self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
+        self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
+        if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
+            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
+
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+        # Treat "original" as "yarn", seems to have been a mistake
+        if self.hparams.get("rope_type") in ("yarn", "original"):
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
+            self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
+            self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
+
+        if temp_len := self.hparams.get("attn_temperature_len"):
+            self.gguf_writer.add_attn_temperature_length(temp_len)
+
+        self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
+        self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
+        self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
+
+    _experts: list[dict[str, list[Tensor]]] | None = None
+    _cur_expert = ""

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors: list[tuple[str, Tensor]] = []
+        is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
+
+        if not is_expert:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
         # process the experts separately
-        if name.find(".moe.") != -1:
+        if is_expert or self._cur_expert:
             n_experts = self.hparams["num_local_experts"]

             assert bid is not None

             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]

-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
+            # concatenate split tensors
+            if name in self._experts[bid]:
+                self._cur_expert = name
+                self._experts[bid][name].append(data_torch)
+                return []
+            elif is_expert:
+                self._cur_expert = name
+                self._experts[bid][name] = [data_torch]
+                return []
+            else:
+                self._cur_expert = ""

-                # merge the experts into a single 3d tensor
-                for wid in ["linear", "linear_1", "linear_v"]:
-                    datas: list[Tensor] = []
+            for bid in range(self.block_count):
+                if len(self._experts[bid]) >= n_experts * 3:
+                    # merge the experts into a single 3d tensor
+                    for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
+                        datas: list[Tensor] = []

-                    for xid in range(n_experts):
-                        ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
+                        for xid in range(n_experts):
+                            ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
+                            if ename not in self._experts[bid]:
+                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
+                            tensor_list = self._experts[bid][ename]
+                            datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
+                            del self._experts[bid][ename]

-                    data_torch = torch.stack(datas, dim=0)
+                        data_torch = torch.stack(datas, dim=0)

-                    merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+                        merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"

-                    new_name = self.map_tensor_name(merged_name)
+                        new_name = self.map_tensor_name(merged_name)

-                    tensors.append((new_name, data_torch))
-            return tensors
-        else:
-            return []
+                        yield (new_name, data_torch)

-        return [(self.map_tensor_name(name), data_torch)]
+        yield from tensors


 @Model.register("DbrxForCausalLM")
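
The soft-capping keys written in set_gguf_parameters() (attn_logit_softcapping, router_logit_softcapping, final_logit_softcapping) configure tanh soft-capping at inference time; assuming the same scheme llama.cpp uses for Gemma-2, a value x is squashed into (-cap, cap) as cap * tanh(x / cap). A minimal sketch:

# Sketch of tanh soft-capping: near-identity for |x| much smaller than cap,
# saturating toward +/-cap as |x| grows.
import torch

def softcap(x: torch.Tensor, cap: float = 30.0) -> torch.Tensor:
    return cap * torch.tanh(x / cap)

print(softcap(torch.tensor([-100.0, -10.0, 0.0, 10.0, 100.0])))
# ~ [-29.92, -9.65, 0.00, 9.65, 29.92]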
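
In the expert-merging loop of modify_tensors(), per-expert weights that arrive as multiple shards are first rejoined with torch.cat along each projection's split dimension (dim 0 for linear/w1 and linear_v/w3, dim 1 for linear_1/w2, whose weight is transposed relative to the other two), then all experts are stacked into one 3D tensor. A standalone sketch with illustrative shapes, not Grok-2's real dimensions:

# Sketch of the merge: shards -> per-expert 2D weights -> one 3D tensor
# of shape (n_experts, rows, cols). All sizes here are made up.
import torch

n_experts, n_ff, n_embd = 4, 32, 16

# pretend each expert's w1 weight arrived as two shards split along dim 0
shards = [[torch.randn(n_ff // 2, n_embd) for _ in range(2)] for _ in range(n_experts)]

datas = [torch.cat(s, dim=0) for s in shards]   # rejoin shards -> (n_ff, n_embd)
merged = torch.stack(datas, dim=0)              # stack experts -> (n_experts, n_ff, n_embd)
assert merged.shape == (n_experts, n_ff, n_embd)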