@@ -2949,6 +2949,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     def set_vocab(self):
         self._set_vocab_gpt2()
 
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self._try_set_pooling_type()
@@ -2974,14 +2977,6 @@ def set_gguf_parameters(self):
         feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
         self.gguf_writer.add_feed_forward_length(feed_forward_length)
 
-        # Set RoPE parameters
-        if "rope_theta" in self.hparams:
-            self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-
-        # Set RMS norm epsilon
-        if "rms_norm_eps" in self.hparams:
-            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-
         # LLaDA models use non-causal attention for diffusion, similar to Dream
         self.gguf_writer.add_causal_attention(False)
         # Handle RoPE scaling similar to LlamaModel and Dream
@@ -2992,11 +2987,6 @@ def set_gguf_parameters(self):
         if mask_token_id is not None:
             self.gguf_writer.add_mask_token_id(mask_token_id)
 
-        self.gguf_writer.add_add_bos_token(True)
-
-        logging.info("Adding diffusion shift logits to False")
-        self.gguf_writer.add_diffusion_shift_logits(False)
-
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
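
A minimal verification sketch (not part of this change): assuming the gguf-py package is installed and the converter wrote the model to an illustrative path such as `./llada-f16.gguf`, this lists the `tokenizer.*` and `diffusion.*` metadata fields so the relocated `add_add_bos_token` / `add_diffusion_shift_logits` key-values can be spot-checked after conversion. The path and key prefixes are assumptions.

```python
# Hypothetical spot-check of the GGUF metadata written above; the file path is illustrative.
from gguf import GGUFReader  # reader from the gguf-py package

reader = GGUFReader("./llada-f16.gguf")  # assumed output of convert_hf_to_gguf.py
for name, field in reader.fields.items():
    # Show only the tokenizer/diffusion metadata touched by the writer calls in this diff.
    if name.startswith(("tokenizer.", "diffusion.")):
        print(name, [t.name for t in field.types])
```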