@@ -858,11 +858,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_GEMMA2:
             {
                 hparams.n_swa = 4096; // default value of gemma 2
+                hparams.n_swa_pattern = 2;
+                hparams.attn_soft_cap = true;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
-                hparams.attn_soft_cap = true;
 
                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_2B; break;
@@ -873,6 +875,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
+                hparams.n_swa_pattern = 6;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -952,6 +956,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_COHERE2:
             {
+                hparams.n_swa_pattern = 4;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
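Note: the hunks below replace a hard-coded per-layer check with the new hparams.is_sliding(il) helper. A minimal sketch of that helper, assuming it simply lifts the inline check il % pattern < (pattern - 1) that gets deleted; the exact signature and bounds handling are assumptions, not shown in this diff:

    // Sketch only: every n_swa_pattern-th layer uses global attention,
    // the preceding (n_swa_pattern - 1) layers use sliding-window attention.
    bool llama_hparams::is_sliding(uint32_t il) const {
        if (n_swa_pattern == 0) {
            return false; // assumption: no pattern configured means all layers use global attention
        }
        return il % n_swa_pattern < (n_swa_pattern - 1);
    }
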
@@ -7374,12 +7380,8 @@ struct llm_build_gemma3 : public llm_graph_context {
         // TODO: is causal == true correct? might need some changes
         auto * inp_attn = build_attn_inp_kv_unified(true, true);
 
-        // "5-to-1 interleaved attention"
-        // 5 layers of local attention followed by 1 layer of global attention
-        static const int sliding_window_pattern = 6;
-
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            const bool is_sliding = hparams.is_sliding(il);
 
             const float freq_base_l = is_sliding ? 10000.0f : freq_base;
             const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
@@ -7970,13 +7972,8 @@ struct llm_build_cohere2 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified(true, true);
 
-        // sliding window switch pattern
-        const int32_t sliding_window_pattern = 4;
-
         for (int il = 0; il < n_layer; ++il) {
-            // three layers sliding window attention (window size 4096) and ROPE
-            // fourth layer uses global attention without positional embeddings
-            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            const bool is_sliding = hparams.is_sliding(il);
 
             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
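With the pattern values set above, the layer schedules match what the deleted comments described: Gemma 3 (n_swa_pattern = 6) interleaves 5 local-attention layers with 1 global-attention layer, and Cohere 2 (n_swa_pattern = 4) runs three sliding-window layers (window size 4096, with RoPE) followed by one global-attention layer without positional embeddings. Gemma 2 (n_swa_pattern = 2) would then alternate one sliding layer with one global layer, which is an inference from the pattern value rather than something this diff states.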