@@ -859,11 +859,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_GEMMA2:
             {
                 hparams.n_swa = 4096; // default value of gemma 2
+                hparams.n_swa_pattern = 2;
+                hparams.attn_soft_cap = true;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
-                hparams.attn_soft_cap = true;

                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_2B; break;
@@ -874,6 +876,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
+                hparams.n_swa_pattern = 6;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -953,6 +957,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_COHERE2:
            {
+                hparams.n_swa_pattern = 4;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -7413,12 +7419,8 @@ struct llm_build_gemma3 : public llm_graph_context {
         // TODO: is causal == true correct? might need some changes
         auto * inp_attn = build_attn_inp_kv_unified(true, true);

-        // "5-to-1 interleaved attention"
-        // 5 layers of local attention followed by 1 layer of global attention
-        static const int sliding_window_pattern = 6;
-
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            const bool is_sliding = hparams.is_sliding(il);

             const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
             const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
@@ -8009,13 +8011,8 @@ struct llm_build_cohere2 : public llm_graph_context {
80098011
80108012 auto * inp_attn = build_attn_inp_kv_unified (true , true );
80118013
8012- // sliding window switch pattern
8013- const int32_t sliding_window_pattern = 4 ;
8014-
80158014 for (int il = 0 ; il < n_layer; ++il) {
8016- // three layers sliding window attention (window size 4096) and ROPE
8017- // fourth layer uses global attention without positional embeddings
8018- const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1 );
8015+ const bool is_sliding = hparams.is_sliding (il);
80198016
80208017 // norm
80218018 cur = build_norm (inpL, model.layers [il].attn_norm , NULL , LLM_NORM, il);
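The builder hunks above replace the per-model inline pattern checks with a single hparams.is_sliding(il) call, driven by the new n_swa_pattern value set in load_hparams. Below is a minimal sketch of what that helper presumably does, inferred only from the removed inline logic; the exact member name, signature, and default handling in llama_hparams are assumptions, not taken from this commit. A layer uses sliding-window attention unless it is the last layer of each group of n_swa_pattern layers, so a pattern of 6 reproduces Gemma 3's "5 local : 1 global" interleaving, 4 reproduces Cohere2's "3 local : 1 global", and 2 gives Gemma 2's alternating local/global layers.

// Sketch (assumed member of llama_hparams), mirroring the removed inline checks:
// layers 0 .. n_swa_pattern-2 of each group use sliding-window attention,
// the last layer of the group (il % n_swa_pattern == n_swa_pattern - 1) is global.
bool llama_hparams::is_sliding(uint32_t il) const {
    if (n_swa_pattern == 0) {
        return false; // assumed guard: no interleaving configured means no SWA layers
    }
    return (il % n_swa_pattern) < (n_swa_pattern - 1);
}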