@@ -3430,40 +3430,52 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@Model.register("DeepseekV2ForCausalLM")
-class DeepseekV2Model(Model):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+@Model.register("DeepseekForCausalLM")
+class DeepseekModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
-            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["v_head_dim"])
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
-
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
-                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
 
     _experts: list[dict[str, Tensor]] | None = None
 
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
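
Note on the permute() helper introduced above: HF's modelling code applies
rotary embeddings with the rotate_half convention (each head's rotary
dimensions stored as split halves), while llama.cpp's standard RoPE rotates
adjacent row pairs, so the converter reorders the rows of q_proj/k_proj once
at conversion time. A minimal standalone sketch of the same reshape/swapaxes
trick on toy data (sizes are illustrative only, not taken from a real
checkpoint):

    import torch

    n_head, head_dim = 2, 4
    # one toy "projection" with 8 output rows and a single input column
    w = torch.arange(n_head * head_dim, dtype=torch.float32).unsqueeze(1)

    # identical to DeepseekModel.permute(w, n_head, n_head)
    permuted = (w.reshape(n_head, 2, w.shape[0] // n_head // 2, *w.shape[1:])
                 .swapaxes(1, 2)
                 .reshape(w.shape))

    print(w.flatten().tolist())         # [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    print(permuted.flatten().tolist())  # [0.0, 2.0, 1.0, 3.0, 4.0, 6.0, 5.0, 7.0]

Within each head, row i and row i + head_dim/2 end up adjacent, so the
pairwise rotation applied at inference acts on exactly the pairs HF rotated.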
@@ -3509,52 +3521,40 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@Model.register("DeepseekForCausalLM")
-class DeepseekModel(Model):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK
+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
+        self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
-        self.gguf_writer.add_rope_dimension_count(rope_dim)
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_weights_scale(1.0)
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
-    _experts: list[dict[str, Tensor]] | None = None
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
 
-    @staticmethod
-    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
+    _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
-
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
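
Note on the YaRN branch restored to DeepseekV2Model above: rope-scaling
metadata is only written when config.json declares rope_scaling.type ==
"yarn", and the one non-obvious value is the log-multiplier. DeepSeek-V2's
modelling code computes its YaRN attention scale as
1 + 0.1 * mscale_all_dim * ln(factor), so persisting 0.1 * mscale_all_dim
lets the runtime recover that scale from the stored scaling factor alone.
A quick sketch of the arithmetic, with hypothetical hparam values chosen
only for illustration:

    import math

    factor = 40.0          # rope_scaling["factor"]
    mscale_all_dim = 1.0   # rope_scaling["mscale_all_dim"]

    yarn_log_mul = 0.1 * mscale_all_dim             # value the converter writes
    mscale = 1.0 + yarn_log_mul * math.log(factor)  # scale recovered at runtime
    print(f"yarn_log_mul={yarn_log_mul:.2f}  mscale={mscale:.4f}")
    # yarn_log_mul=0.10  mscale=1.3689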