@@ -858,11 +858,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_GEMMA2:
             {
                 hparams.n_swa = 4096; // default value of gemma 2
+                hparams.n_swa_pattern = 2;
+                hparams.attn_soft_cap = true;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
                 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
-                hparams.attn_soft_cap = true;

                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_2B; break;
@@ -873,6 +875,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
+                hparams.n_swa_pattern = 6;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -952,6 +956,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_COHERE2:
             {
+                hparams.n_swa_pattern = 4;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
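
Note: the three hparams hunks above only set hparams.n_swa_pattern; the sketch below is a guess at the llama_hparams members they rely on (field names come from the diff, defaults and comments are assumptions). A pattern of N means the last layer of every group of N layers uses global attention while the other N-1 layers use the sliding window, matching the comments removed in the builder hunks below (Gemma 2: alternating, Gemma 3: 5-to-1, Cohere 2: 3-to-1).

    // sketch only -- not the exact upstream declaration
    struct llama_hparams {
        // ... existing fields ...
        uint32_t n_swa         = 0;     // sliding window size (0 = disabled)     [assumed default]
        uint32_t n_swa_pattern = 1;     // every n_swa_pattern-th layer is global [assumed default]
        bool     attn_soft_cap = false;

        // true if layer il should use sliding-window attention
        bool is_sliding(uint32_t il) const;
    };
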
@@ -7374,12 +7380,8 @@ struct llm_build_gemma3 : public llm_graph_context {
         // TODO: is causal == true correct? might need some changes
         auto * inp_attn = build_attn_inp_kv_unified(true, true);

-        // "5-to-1 interleaved attention"
-        // 5 layers of local attention followed by 1 layer of global attention
-        static const int sliding_window_pattern = 6;
-
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            const bool is_sliding = hparams.is_sliding(il);

             const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
             const float freq_scale_l = is_sliding ? 1.0f     : freq_scale;
@@ -7970,13 +7972,8 @@ struct llm_build_cohere2 : public llm_graph_context {

         auto * inp_attn = build_attn_inp_kv_unified(true, true);

-        // sliding window switch pattern
-        const int32_t sliding_window_pattern = 4;
-
         for (int il = 0; il < n_layer; ++il) {
-            // three layers sliding window attention (window size 4096) and ROPE
-            // fourth layer uses global attention without positional embeddings
-            const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            const bool is_sliding = hparams.is_sliding(il);

             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
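
For reference, hparams.is_sliding(il) presumably mirrors the inline check it replaces in both builders; a minimal sketch under that assumption (not necessarily the exact upstream implementation), where the assumed default n_swa_pattern of 1 makes every layer use full attention:

    // sketch only: same predicate as the removed inline check
    bool llama_hparams::is_sliding(uint32_t il) const {
        // within each group of n_swa_pattern layers, the first n_swa_pattern - 1
        // layers use sliding-window attention and the last one uses global attention
        return il % n_swa_pattern < (n_swa_pattern - 1);
    }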