diff --git a/src/llama-arch.h b/src/llama-arch.h
index a421fa6c6..b9f06f9db 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -262,6 +262,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
+    LLM_TENSOR_ATTN_KQ_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_K_B,
     LLM_TENSOR_ATTN_V_B,
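The new LLM_TENSOR_ATTN_KQ_A_MQA entry backs a fused weight that stacks the Q down-projection (attn_q_a, or attn_q for DeepSeek2-Lite) on top of attn_kv_a_mqa along the output dimension: one GEMM then feeds both the query path and the shared KV/RoPE path, and views carve the result apart. A minimal standalone sketch of that pattern (not the model code; dimensions and fill values are made up, and it assumes an older ggml tree where ggml_set_f32 and ggml_graph_compute_with_ctx are still declared in ggml.h):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // small scratch context; no_alloc = false so tensor data lives in this buffer
        ggml_init_params params = { /*mem_size=*/ 16*1024*1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
        ggml_context * ctx = ggml_init(params);

        const int64_t n_embd = 8, q_rows = 4, kv_rows = 3, n_tokens = 2;

        // fused weight: rows of the Q projection stacked on rows of the KV projection
        ggml_tensor * wkq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, q_rows + kv_rows);
        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
        ggml_set_f32(wkq, 0.5f);
        ggml_set_f32(cur, 1.0f);

        // one GEMM produces both projections ...
        ggml_tensor * mqa = ggml_mul_mat(ctx, wkq, cur);  // {q_rows + kv_rows, n_tokens}
        // ... and two views split the rows back apart without copying;
        // the view offset is in bytes, the row stride is that of the fused result
        ggml_tensor * q  = ggml_view_2d(ctx, mqa, q_rows,  n_tokens, mqa->nb[1], 0);
        ggml_tensor * kv = ggml_view_2d(ctx, mqa, kv_rows, n_tokens, mqa->nb[1],
                                        q_rows*ggml_element_size(mqa));

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, q);
        ggml_build_forward_expand(gf, kv);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

        printf("q = %lld x %lld, kv = %lld x %lld\n",
               (long long) q->ne[0], (long long) q->ne[1],
               (long long) kv->ne[0], (long long) kv->ne[1]);
        ggml_free(ctx);
        return 0;
    }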
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 96d39b246..77792f95e 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -5927,6 +5927,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
     const uint32_t n_embd_head_qk_rope = hparams.n_rot;
     const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
     const uint32_t kv_lora_rank = hparams.n_lora_kv;
+    const uint32_t q_lora_rank = hparams.n_lora_q;

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;
@@ -5953,68 +5954,96 @@ ggml_cgraph * llm_build_context::build_deepseek2() {

        // self_attention
        {
-            struct ggml_tensor * q = NULL;
-            if (!is_lite) {
-                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
-                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
-                cb(q, "q", il);
-
-                q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
-                cb(q, "q", il);
-
-                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
-                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
-                cb(q, "q", il);
-            } else {
-                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(q, "q", il);
+            ggml_tensor * q = nullptr;
+            ggml_tensor * kv_rope_compressed = nullptr;
+            ggml_tensor * q_rope;
+            ggml_tensor * q_nope;
+            ggml_tensor * k_rope;
+            ggml_tensor * kv_compressed;
+            if (model.layers[il].wkq_a_mqa) {
+                auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur);
+                cb(mqa, "mqa", il);
+                size_t qnb1;
+                if (!is_lite) {
+                    q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);
+                    q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    qnb1 = q->nb[1];
+                    cb(q, "q", il);
+                    kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
+                            q_lora_rank*ggml_element_size(mqa));
+                } else {
+                    q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0);
+                    kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
+                            n_embd_k_gqa*ggml_element_size(mqa));
+                    qnb1 = mqa->nb[1];
+                }
+                q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, 0);
+                q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, ggml_row_size(q->type, n_embd_head_qk_nope));
+                k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                        mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
+                kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0);
            }
+            else {
+                if (!is_lite) {
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                    cb(q, "q", il);

-            // split into {n_head * n_embd_head_qk_nope, n_tokens}
-            struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                    0);
-            cb(q_nope, "q_nope", il);
+                    kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                    cb(kv_rope_compressed, "kv_rope_compressed", il);

-            // and {n_head * n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                    ggml_row_size(q->type, n_embd_head_qk_nope));
-            cb(q_rope, "q_rope", il);
+                    ggml_build_forward_expand(gf, q);
+                    ggml_build_forward_expand(gf, kv_rope_compressed);

-            q_rope = ggml_rope_ext(
-                    ctx0, q_rope, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
-            );
-            cb(q_rope, "q_rope", il);
+                    q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
+                    cb(q, "q", il);
+
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    cb(q, "q", il);
+                } else {
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                    cb(q, "q", il);

-            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
-            cb(kv_rope_compresseed, "kv_rope_compresseed", il);
+                    kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                    cb(kv_rope_compressed, "kv_rope_compressed", il);

-            // and {n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens,
-                    kv_rope_compresseed->nb[1],
-                    kv_rope_compresseed->nb[1],
-                    ggml_row_size(kv_rope_compresseed->type, kv_lora_rank));
+                    ggml_build_forward_expand(gf, q);
+                    ggml_build_forward_expand(gf, kv_rope_compressed);
+                }
+
+                q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);
+
+                q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+
+                k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_rope_compressed->nb[1],
+                        kv_rope_compressed->nb[1],
+                        ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
+
+                kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
+                        kv_rope_compressed->nb[1], 0);
+            }
+            cb(q_nope, "q_nope", il);
+            cb(q_rope, "q_rope", il);
            cb(k_rope, "k_rope", il);
+            cb(kv_compressed, "kv_compressed", il);

-            // shared RoPE key
-            k_rope = ggml_rope_ext(
-                    ctx0, k_rope, inp_pos, nullptr,
+            q_rope = ggml_rope_ext(ctx0, q_rope, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
-            );
-            cb(k_rope, "k_rope", il);
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow);
+            cb(q_rope, "q_rope", il);

-            // split into {kv_lora_rank, n_tokens}
-            struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens,
-                    kv_rope_compresseed->nb[1],
-                    0);
-            cb(kv_compressed, "kv_compressed", il);
+            k_rope = ggml_rope_ext(ctx0, k_rope, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow);
+            cb(k_rope, "k_rope", il);

            kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, cb, il);
            cb(kv_compressed, "kv_compressed", il);
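One subtlety in the graph code above: in the merged lite path q is itself a view into mqa, so the stride from one token's heads to the next is mqa->nb[1] rather than n_head * n_embd_head_k elements; in the merged non-lite path q comes out of the wq_b GEMM, so the stride is q->nb[1]. That is all qnb1 tracks. An annotated restatement of the two per-head views, assuming the variables from the hunk above:

    // q packs n_head heads per token, each head n_embd_head_k wide
    // (n_embd_head_qk_nope "no position" lanes followed by n_embd_head_qk_rope RoPE lanes);
    // two strided 3D views split every head in place, no copy:
    q_nope = ggml_view_3d(ctx0, q,
            n_embd_head_qk_nope, n_head, n_tokens,          // ne0, ne1, ne2
            ggml_row_size(q->type, hparams.n_embd_head_k),  // nb1: head -> next head
            qnb1,                                           // nb2: token -> next token
            0);                                             // nope lanes open each head
    q_rope = ggml_view_3d(ctx0, q,
            n_embd_head_qk_rope, n_head, n_tokens,
            ggml_row_size(q->type, hparams.n_embd_head_k),
            qnb1,
            ggml_row_size(q->type, n_embd_head_qk_nope));   // rope lanes follow the nope lanes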
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index e921dc9b3..600ae2564 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -1645,14 +1645,43 @@ bool create_tensors_helper::create_deepseek2_tensors(const LLM_TN & tn) {

        layer.attn_kv_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});

+        bool merged = false;
+        if (ml.merge_qkv) {
+            auto q_name = is_lite ? tn(LLM_TENSOR_ATTN_Q, "weight", i) : tn(LLM_TENSOR_ATTN_Q_A, "weight", i);
+            auto k_name = tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i);
+            auto wq = ml.require_tensor_meta(q_name.c_str());
+            auto wk = ml.require_tensor_meta(k_name.c_str());
+            GGML_ASSERT(wq && wk);
+            if (wq->type == wk->type) {
+                GGML_ASSERT(wq->ne[0] == wk->ne[0]);
+                layer.wkq_a_mqa = ggml_new_tensor_2d(ctx_split, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]);
+                snprintf(layer.wkq_a_mqa->name, GGML_MAX_NAME, "blk.%d.attn_kq_a_mqa.weight", i);
+                if (is_lite) {
+                    layer.wq = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
+                } else {
+                    layer.wq_a = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
+                }
+                layer.wkv_a_mqa = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, k_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
+                merged = true;
+                use_mmap_buffer = false;
+                printf("============== Merged %s (%ld x %ld) and %s (%ld x %ld)\n", q_name.c_str(),
+                        wq->ne[0], wq->ne[1], k_name.c_str(), wk->ne[0], wk->ne[1]);
+            }
+        }
+
        if (!is_lite) {
-            layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+            if (!merged) {
+                layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+            }
            layer.wq_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
-        } else {
+        } else if (!merged) {
            layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
        }

-        layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+        if (!merged) {
+            layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+        }
+
        layer.wkv_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, llama_model_loader::TENSOR_NOT_REQUIRED);

        if (!layer.wkv_b) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d8a6f301..dd627a4f6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -805,6 +805,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_ATTN_Q_A,      "blk.%d.attn_q_a" },
     { LLM_TENSOR_ATTN_Q_B,      "blk.%d.attn_q_b" },
     { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+    { LLM_TENSOR_ATTN_KQ_A_MQA, "blk.%d.attn_kq_a_mqa" },
     { LLM_TENSOR_ATTN_KV_B,     "blk.%d.attn_kv_b" },
     { LLM_TENSOR_ATTN_K_B,      "blk.%d.attn_k_b" },
     { LLM_TENSOR_ATTN_V_B,      "blk.%d.attn_v_b" },
diff --git a/src/llama-model.h b/src/llama-model.h
index 81420788b..769f05133 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -160,6 +160,7 @@ struct llama_layer {
     struct ggml_tensor * wq_a = nullptr;
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
+    struct ggml_tensor * wkq_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
     struct ggml_tensor * wk_b = nullptr;
     struct ggml_tensor * wv_b = nullptr;
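For reference, the loader-side merge above works by allocating one backing tensor for the pair and registering the two GGUF tensors as views into it, so each file tensor is read straight into its slice. A condensed sketch of the layout arithmetic, reusing the names from the llama-load-tensors.cpp hunk (illustrative, not a drop-in replacement):

    // both weights share ne[0] == n_embd and the same type, so they stack along ne[1]:
    //   rows [0, wq->ne[1])                     <- attn_q(_a).weight
    //   rows [wq->ne[1], wq->ne[1] + wk->ne[1]) <- attn_kv_a_mqa.weight
    ggml_tensor * fused = ggml_new_tensor_2d(ctx_split, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]);
    const size_t off_kv = wq->ne[1]*wq->nb[1];  // byte offset of the KV block: all Q rows
    // the loader then reads each file tensor directly into its view of the fused buffer:
    //   ml.create_tensor_as_view(ctx_split, fused, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
    //   ml.create_tensor_as_view(ctx_split, fused, k_name.c_str(), { wk->ne[0], wk->ne[1] }, off_kv);

Because the fused buffer must be contiguous while the two source tensors sit at unrelated offsets in the GGUF file, the merged tensor cannot alias the memory-mapped file data, which is why this path sets use_mmap_buffer = false.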