1 change: 1 addition & 0 deletions src/llama-arch.h
@@ -262,6 +262,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_A,
LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KQ_A_MQA,
LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
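The new LLM_TENSOR_ATTN_KQ_A_MQA entry names a fused tensor that is built at load time rather than read from the GGUF file: it concatenates the Q down-projection (attn_q_a, or attn_q for the lite variant) with the joint KV/RoPE down-projection attn_kv_a_mqa, so the DeepSeek-2 attention prologue can compute both with a single matrix multiplication. The matching name mapping is added in llama-model.cpp below.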
133 changes: 81 additions & 52 deletions src/llama-build-context.cpp
@@ -5927,6 +5927,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
const uint32_t q_lora_rank = hparams.n_lora_q;

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5953,68 +5954,96 @@ ggml_cgraph * llm_build_context::build_deepseek2() {

// self_attention
{
struct ggml_tensor * q = NULL;
if (!is_lite) {
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);

q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
cb(q, "q", il);

// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
} else {
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(q, "q", il);
ggml_tensor * q = nullptr;
ggml_tensor * kv_rope_compressed = nullptr;
ggml_tensor * q_rope;
ggml_tensor * q_nope;
ggml_tensor * k_rope;
ggml_tensor * kv_compressed;
if (model.layers[il].wkq_a_mqa) {
auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur);
cb(mqa, "mqa", il);
size_t qnb1;
if (!is_lite) {
q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
qnb1 = q->nb[1];
cb(q, "q", il);
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
q_lora_rank*ggml_element_size(mqa));
} else {
q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0);
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
n_embd_k_gqa*ggml_element_size(mqa));
qnb1 = mqa->nb[1];
}
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, 0);
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, ggml_row_size(q->type, n_embd_head_qk_nope));
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0);
}
else {
if (!is_lite) {
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);

// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
0);
cb(q_nope, "q_nope", il);
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compressed, "kv_rope_compressed", il);

// and {n_head * n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_rope, "q_rope", il);
ggml_build_forward_expand(gf, q);
ggml_build_forward_expand(gf, kv_rope_compressed);

q_rope = ggml_rope_ext(
ctx0, q_rope, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow
);
cb(q_rope, "q_rope", il);
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
cb(q, "q", il);

q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
} else {
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(q, "q", il);

// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compresseed, "kv_rope_compresseed", il);
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compressed, "kv_rope_compressed", il);

// and {n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens,
kv_rope_compresseed->nb[1],
kv_rope_compresseed->nb[1],
ggml_row_size(kv_rope_compresseed->type, kv_lora_rank));
ggml_build_forward_expand(gf, q);
ggml_build_forward_expand(gf, kv_rope_compressed);
}

q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);

q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));

k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_rope_compressed->nb[1],
kv_rope_compressed->nb[1],
ggml_row_size(kv_rope_compressed->type, kv_lora_rank));

kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
kv_rope_compressed->nb[1], 0);
}
cb(q_nope, "q_nope", il);
cb(q_rope, "q_rope", il);
cb(k_rope, "k_rope", il);
cb(kv_compressed, "kv_compressed", il);

// shared RoPE key
k_rope = ggml_rope_ext(
ctx0, k_rope, inp_pos, nullptr,
q_rope = ggml_rope_ext(ctx0, q_rope, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow
);
cb(k_rope, "k_rope", il);
ext_factor, attn_factor_scaled, beta_fast, beta_slow);
cb(q_rope, "q_rope", il);

// split into {kv_lora_rank, n_tokens}
struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens,
kv_rope_compresseed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
k_rope = ggml_rope_ext(ctx0, k_rope, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow);
cb(k_rope, "k_rope", il);

kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, cb, il);
cb(kv_compressed, "kv_compressed", il);
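For reference, the shape bookkeeping in the new branch above follows one pattern: a single matmul against the fused wkq_a_mqa weight, then strided views that recover the pieces the old code produced with two separate matmuls. The sketch below restates that pattern outside the graph builder; the helper name split_fused_qkv_downproj and its argument list are illustrative only, not part of the PR (for the lite variant the first slice is n_embd_k_gqa wide instead of q_lora_rank, as in the diff).

```cpp
#include "ggml.h"

// Hypothetical helper (not in the PR) illustrating the fused path: one
// matmul against the concatenated down-projection, then two strided views
// that recover the low-rank Q activations and the compressed-KV + RoPE part.
static void split_fused_qkv_downproj(
        struct ggml_context * ctx0,
        struct ggml_tensor  * wkq_a_mqa,   // {n_embd, q_lora_rank + kv_lora_rank + n_embd_head_qk_rope}
        struct ggml_tensor  * cur,         // {n_embd, n_tokens}
        int64_t q_lora_rank, int64_t kv_lora_rank, int64_t n_embd_head_qk_rope,
        int64_t n_tokens,
        struct ggml_tensor ** q_out,
        struct ggml_tensor ** kv_rope_out) {
    // {q_lora_rank + kv_lora_rank + n_embd_head_qk_rope, n_tokens}
    struct ggml_tensor * mqa = ggml_mul_mat(ctx0, wkq_a_mqa, cur);

    // rows [0, q_lora_rank) -> fed into attn_q_a_norm and wq_b
    *q_out = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);

    // rows [q_lora_rank, q_lora_rank + kv_lora_rank + n_embd_head_qk_rope)
    // -> compressed KV plus the single shared RoPE key head
    *kv_rope_out = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens,
                                mqa->nb[1], q_lora_rank*ggml_element_size(mqa));
}
```

Both branches then converge on the same q_nope / q_rope / k_rope / kv_compressed views and the shared ggml_rope_ext calls, so the attention code further down is unchanged.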
35 changes: 32 additions & 3 deletions src/llama-load-tensors.cpp
@@ -1645,14 +1645,43 @@ bool create_tensors_helper::create_deepseek2_tensors(const LLM_TN & tn) {

layer.attn_kv_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});

bool merged = false;
if (ml.merge_qkv) {
auto q_name = is_lite ? tn(LLM_TENSOR_ATTN_Q, "weight", i) : tn(LLM_TENSOR_ATTN_Q_A, "weight", i);
auto k_name = tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i);
auto wq = ml.require_tensor_meta(q_name.c_str());
auto wk = ml.require_tensor_meta(k_name.c_str());
GGML_ASSERT(wq && wk);
if (wq->type == wk->type) {
GGML_ASSERT(wq->ne[0] == wk->ne[0]);
layer.wkq_a_mqa = ggml_new_tensor_2d(ctx_split, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]);
snprintf(layer.wkq_a_mqa->name, GGML_MAX_NAME, "blk.%d.attn_qk_a_mqa.weight", i);
if (is_lite) {
layer.wq = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
} else {
layer.wq_a = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
}
layer.wkv_a_mqa = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, k_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
merged = true;
use_mmap_buffer = false;
printf("============== Merged %s (%ld x %ld) and %s (%ld x %ld)\n", q_name.c_str(),
wq->ne[0], wq->ne[1], k_name.c_str(), wk->ne[0], wk->ne[1]);
}
}

if (!is_lite) {
layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
if (!merged) {
layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
}
layer.wq_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
} else {
} else if (!merged) {
layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
}

layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
if (!merged) {
layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
}

layer.wkv_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
{kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, llama_model_loader::TENSOR_NOT_REQUIRED);
if (!layer.wkv_b) {
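The loader-side merge above only fires when ml.merge_qkv is set and the two source tensors are compatible; the sketch below spells out the compatibility check and the row layout of the fused buffer (can_merge_qk is an illustrative name, not part of the loader's API).

```cpp
#include "ggml.h"

// Illustrative only: the condition under which the Q and KV/RoPE
// down-projections can share one buffer, and how their rows are laid out.
static bool can_merge_qk(const struct ggml_tensor * wq, const struct ggml_tensor * wk) {
    // both matrices consume the same activation width (n_embd) and must use
    // the same ggml type so their rows can live in one contiguous allocation
    return wq->type == wk->type && wq->ne[0] == wk->ne[0];
}

// Fused tensor: ggml_new_tensor_2d(ctx, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1])
//   rows [0, wq->ne[1])                     -> view registered under the attn_q(_a).weight name
//   rows [wq->ne[1], wq->ne[1] + wk->ne[1]) -> view registered under the attn_kv_a_mqa.weight name,
//                                              starting at byte offset wq->ne[1]*wq->nb[1]
```

Because the GGUF data for the two original tensor names is then loaded through these views into one allocation, the merged case sets use_mmap_buffer = false.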
1 change: 1 addition & 0 deletions src/llama-model.cpp
@@ -805,6 +805,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KQ_A_MQA, "blk.%d.attn_kq_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
{ LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
{ LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
1 change: 1 addition & 0 deletions src/llama-model.h
@@ -160,6 +160,7 @@ struct llama_layer {
struct ggml_tensor * wq_a = nullptr;
struct ggml_tensor * wq_b = nullptr;
struct ggml_tensor * wkv_a_mqa = nullptr;
struct ggml_tensor * wkq_a_mqa = nullptr;
struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wk_b = nullptr;
struct ggml_tensor * wv_b = nullptr;
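Since the new llama_layer::wkq_a_mqa member defaults to nullptr, build_deepseek2() can simply test it to choose between the fused single-matmul path and the original per-tensor projections; models loaded without ml.merge_qkv, or whose Q and KV down-projection tensors have different types, keep the previous behavior.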