@@ -726,21 +726,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // sanity checks for models that have attention layers
     if (qs.n_attention_wv != 0 && !is_clip_model)
     {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        int32_t n_layer_all = model.hparams.n_layer;
         if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
+            // now n_layer_all is the number of attention layers in the encoder
             // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
+            n_layer_all += 2 * model.hparams.dec_n_layer;
         }

         // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
         const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);

-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);

-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
     }

     size_t total_size_org = 0;