@@ -600,17 +600,26 @@ static struct ggml_tensor * llm_build_kqv(
         cur = ggml_flash_attn_ext(ctx, q, k, padded_v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                 hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
603+ LLAMA_LOG_INFO (" kq_scale: %f\n " , kq_scale);
604+
605+ // 检查 Softmax 参数
606+ if (hparams.attn_soft_cap ) {
607+ LLAMA_LOG_INFO (" Soft capping applied: %f\n " , hparams.f_attn_logit_softcapping );
608+ }
609+ LLAMA_LOG_INFO (" q shape: [%ld, %ld, %ld]\n " , q->ne [0 ], q->ne [1 ], q->ne [2 ]);
610+ LLAMA_LOG_INFO (" k shape: [%ld, %ld, %ld]\n " , k->ne [0 ], k->ne [1 ], k->ne [2 ]);
611+ LLAMA_LOG_INFO (" padded_v shape: [%ld, %ld, %ld]\n " , padded_v->ne [0 ], padded_v->ne [1 ], padded_v->ne [2 ]);
612+
         if (v->type == GGML_TYPE_F32) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
         if (n_embd_head_v < n_embd_head_k) {
-            cur = ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens);
-            cur = ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens,
+            cur = ggml_cont(ctx, ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens));
+            cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens,
                     ggml_element_size(cur) * n_embd_head_v_out,
                     ggml_element_size(cur) * n_embd_head_v_out * n_head,
-                    0);
-            cur = ggml_cont(ctx, cur);
+                    0));
         }
 
         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
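
For context on the reshape/view/cont change: judging by the names padded_v and n_embd_head_v_out, when the value heads are narrower than the key heads (n_embd_head_v < n_embd_head_k), V is padded before ggml_flash_attn_ext and the output rows come back at the padded width, so each head has to be trimmed back to n_embd_head_v. ggml_view_3d keeps only the first n_embd_head_v elements per head while its strides still step over the padded width; that view is non-contiguous, and ggml_cont materializes it because ggml_reshape_* requires contiguous input. A minimal standalone sketch of the same trim follows; the sizes are made up for illustration and nothing in it is taken from this commit beyond the reshape/view/cont pattern itself:

// standalone sketch of the pad-then-trim pattern above (assumes only ggml);
// the sizes and the stand-in "attention output" tensor are illustrative
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 64*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd_head_v_out = 256; // padded head width coming out of attention
    const int n_embd_head_v     = 128; // real value-head width to keep
    const int n_head            = 8;
    const int n_tokens          = 4;

    // stand-in for the flash-attention output: one flat padded row per token
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
            n_embd_head_v_out*n_head, n_tokens);

    // split the flat rows into padded heads
    cur = ggml_cont(ctx, ggml_reshape_3d(ctx, cur, n_embd_head_v_out, n_head, n_tokens));

    // keep only the first n_embd_head_v elements of each head; the strides
    // (nb1, nb2) still span the padded width, so the view is non-contiguous
    // and ggml_cont must copy it out before any reshape is legal
    cur = ggml_cont(ctx, ggml_view_3d(ctx, cur, n_embd_head_v, n_head, n_tokens,
            ggml_element_size(cur) * n_embd_head_v_out,
            ggml_element_size(cur) * n_embd_head_v_out * n_head,
            0));

    // flatten back to one trimmed row per token, as the function above does
    cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);

    // shapes are fixed at node construction, so no graph compute is needed
    // just to inspect them
    printf("trimmed shape: [%ld, %ld]\n", (long) cur->ne[0], (long) cur->ne[1]);

    ggml_free(ctx);
    return 0;
}

Wrapping the view in ggml_cont (instead of the removed trailing ggml_cont call) also keeps the intermediate reshape contiguous, which is what the following ggml_reshape_2d in the unchanged code depends on.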