@@ -556,11 +556,8 @@ def set_gguf_parameters(self):
         logger.info(f"gguf: experts used count = {n_experts_used}")

         if (head_dim := self.hparams.get("head_dim")) is not None:
-            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
-            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
-            if self.hparams.get("model_type") != "deepseek_v3":
-                self.gguf_writer.add_key_length(head_dim)
-                self.gguf_writer.add_value_length(head_dim)
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)

         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
@@ -1901,9 +1898,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

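An aside on the rewrite above (illustrative, not part of the diff): both `"head_dim" in hparams` and `hparams.get("head_dim", default)` pick up the stored value even when a config carries an explicit `"head_dim": null`, whereas the walrus form falls back in that case too. A minimal sketch with made-up hparams values:

```python
# Hypothetical hparams with an explicit null head_dim.
hparams = {"hidden_size": 4096, "num_attention_heads": 32, "head_dim": None}

# Old pattern: the key exists, so rope_dim ends up None.
rope_dim = hparams["head_dim"] if "head_dim" in hparams else 0
assert rope_dim is None

# dict.get with a default is no better: the default only applies
# when the key is absent, not when its value is None.
assert hparams.get("head_dim", 128) is None

# New pattern: falls back whether the key is missing or explicitly null.
if (rope_dim := hparams.get("head_dim")) is None:
    rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
assert rope_dim == 128
```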
@@ -1985,7 +1980,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
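For context on why `dim` must be a concrete integer at this point: it feeds straight into `torch.arange`, which rejects `None`. A small sketch of the inverse-frequency computation from the hunk above, with illustrative `base` and `dim` values (not taken from any real model config):

```python
import torch

base, dim = 10000.0, 8  # illustrative values only

# Same expression as in the hunk: one inverse frequency per pair of
# rotated dimensions, decaying geometrically from 1.0 toward 1/base.
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
assert freqs.shape == (dim // 2,)
assert freqs[0] == 1.0
```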
@@ -2321,9 +2317,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -2363,7 +2357,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -3681,9 +3676,7 @@ def set_gguf_parameters(self):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])

-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -5098,9 +5091,7 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5990,7 +5981,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -6102,7 +6094,8 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

         self.gguf_writer.add_rope_dimension_count(rope_dim)
         rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6134,7 +6127,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head

         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)