@@ -1173,6 +1173,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+            res = "exaone4"
 
         if res is None:
             logger.warning("\n")
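For context, a `chkhsh` value like the one registered above is a digest of the token IDs the tokenizer emits for a fixed probe string, so it fingerprints the pre-tokenization scheme rather than the checkpoint. A minimal sketch of the idea follows; the probe text and helper name are illustrative, not the exact ones used by `convert_hf_to_gguf.py`:

```python
from hashlib import sha256

from transformers import AutoTokenizer

# Illustrative probe; the real script uses a longer, tokenizer-stressing string.
PROBE_TEXT = "Hello World! 123 \u00e9\u00e0 ..."

def pre_tokenizer_hash(model_dir: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Two tokenizers that split the probe identically produce the same digest,
    # which is why one hash can cover a whole family of models.
    return sha256(str(tokenizer.encode(PROBE_TEXT)).encode()).hexdigest()
```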
@@ -3240,11 +3243,12 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
         self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
         self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
         if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-        if (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
-            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+        if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
+            self.gguf_writer.add_expert_shared_count(shared_expert_count)
+            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
+                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Modify correction bias name as in DeepseekV2
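A standalone sketch of the guarded shared-expert bookkeeping added in this hunk; the hparams values are made up for illustration, not taken from a real config:

```python
# Illustrative hparams only; real values come from the model's config.json.
hparams = {
    "moe_num_shared_experts": 2,
    "intermediate_size": 8192,
    "num_key_value_heads": 4,
}

if (shared_expert_count := hparams.get("moe_num_shared_experts")) is not None:
    print("shared expert count:", shared_expert_count)  # 2
    if shared_expert_count > 0:
        # Per the hunk above: 8192 // 4 = 2048
        ffn_len = hparams["intermediate_size"] // hparams["num_key_value_heads"]
        print("shared expert feed-forward length:", ffn_len)
```

Nesting the `shared_expert_count > 0` check inside the outer `if` matters: a config without `moe_num_shared_experts` binds `None` via the walrus and skips the whole block, so the comparison can never raise a `TypeError` on `None > 0`.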
@@ -7109,6 +7113,75 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
 
+@ModelBase.register("Exaone4ForCausalLM")
+class Exaone4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+            if "layer_types" in hparams:
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
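Aside: a quick sanity check of the two `sliding_window_pattern` encodings handled just above, with made-up values. A string like "LLLG" is tiled across the layers with "L" marking sliding layers; an integer n marks every layer as sliding except each n-th one. Note that the two example spellings from the comments describe the same 3-sliding-to-1-global layout:

```python
num_hidden_layers = 8

# String form, e.g. "LLLG": tile the pattern across layers; "L" = sliding.
pattern_str = "LLLG"
as_str = [pattern_str[i % len(pattern_str)] == "L" for i in range(num_hidden_layers)]

# Integer form, e.g. 4: every layer slides except each 4th one.
pattern_int = 4
as_int = [(i + 1) % pattern_int != 0 for i in range(num_hidden_layers)]

assert as_str == as_int == [True, True, True, False] * 2
```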
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
 @ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""