
Commit 937a48d

Better/shorter variable names and more tidying up of code
1 parent 68302ee commit 937a48d

File tree: 3 files changed, +66 / -70 lines changed


src/llama-graph.cpp

Lines changed: 38 additions & 38 deletions
@@ -1478,7 +1478,7 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * llm_graph_context::build_attn_mla(
         llm_graph_input_attn_kv_unified * inp,
         ggml_cgraph * gf,
-        ggml_tensor * wv_decompress,
+        ggml_tensor * wv_b,
         ggml_tensor * wo,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
@@ -1497,8 +1497,8 @@ ggml_tensor * llm_graph_context::build_attn_mla(
     const auto kv_lora_rank = hparams.n_lora_kv;

     // note: deepseek with MLA option converts into MQA with larger n_ebed (ie: GQA with 1 group)
-    const int64_t n_embd_k_compressed = kv_lora_rank + hparams.n_rot;
-    const int64_t n_embd_v_compressed = kv_lora_rank;
+    const int64_t n_embd_k_cmpr = kv_lora_rank + hparams.n_rot;
+    const int64_t n_embd_v_cmpr = kv_lora_rank;

     // note: this is the smaller n_ebed what we get after decompression
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
@@ -1514,17 +1514,17 @@ ggml_tensor * llm_graph_context::build_attn_mla(
     GGML_ASSERT(kv_self->size == n_ctx);

     ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il],
-            n_tokens*n_embd_k_compressed,
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_compressed)*kv_head);
+            n_tokens*n_embd_k_cmpr,
+            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_cmpr)*kv_head);
     //cb(k_cache_view, "k_cache_view", il);

     // note: storing RoPE-ed version of K in the KV cache
     ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));

-    v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_compressed, n_tokens);
+    v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_cmpr, n_tokens);

     ggml_tensor * v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il],
-            n_tokens, n_embd_v_compressed,
+            n_tokens, n_embd_v_cmpr,
             (  n_ctx)*ggml_element_size(kv_self->v_l[il]),
             (kv_head)*ggml_element_size(kv_self->v_l[il]));

@@ -1543,34 +1543,34 @@ ggml_tensor * llm_graph_context::build_attn_mla(

     const auto n_kv = kv_self->n;

-    ggml_tensor * k_compressed = ggml_view_2d(ctx0, kv_self->k_l[il],
-            n_embd_k_compressed, n_kv,
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_compressed),
+    ggml_tensor * k_cmpr = ggml_view_2d(ctx0, kv_self->k_l[il],
+            n_embd_k_cmpr, n_kv,
+            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_cmpr),
             0);
-    cb(k_compressed, "k_compressed", il);
+    cb(k_cmpr, "k_cmpr", il);

-    struct ggml_tensor * v_compressed_trans = ggml_view_2d(ctx0, kv_self->v_l[il],
-            n_kv, n_embd_v_compressed,
+    struct ggml_tensor * v_cmpr_trans = ggml_view_2d(ctx0, kv_self->v_l[il],
+            n_kv, n_embd_v_cmpr,
             ggml_element_size(kv_self->v_l[il])*n_ctx,
             0);
-    cb(v_compressed_trans, "v_compressed_trans", il);
+    cb(v_cmpr_trans, "v_cmpr_trans", il);

-    ggml_tensor * q_compressed = ggml_view_2d(ctx0, q_cur,
-            n_embd_k_compressed, n_tokens*n_head,
-            ggml_row_size(q_cur->type, n_embd_k_compressed),
+    ggml_tensor * q_cmpr = ggml_view_2d(ctx0, q_cur,
+            n_embd_k_cmpr, n_tokens*n_head,
+            ggml_row_size(q_cur->type, n_embd_k_cmpr),
             0);
-    cb(q_compressed, "q_compressed", il);
+    cb(q_cmpr, "q_cmpr", il);

-    ggml_tensor * kq = ggml_mul_mat(ctx0, k_compressed, q_compressed);
-    cb(kq, "kq", il);
+    ggml_tensor * kq_cmpr = ggml_mul_mat(ctx0, k_cmpr, q_cmpr);
+    cb(kq_cmpr, "kq_cmpr", il);

-    kq = ggml_view_3d(ctx0, kq, n_kv, n_tokens, n_head,
-            ggml_row_size(kq->type, n_kv),
-            ggml_row_size(kq->type, n_kv)*n_tokens,
+    kq_cmpr = ggml_view_3d(ctx0, kq_cmpr, n_kv, n_tokens, n_head,
+            ggml_row_size(kq_cmpr->type, n_kv),
+            ggml_row_size(kq_cmpr->type, n_kv)*n_tokens,
             0);
-    cb(kq, "kq_view", il);
+    cb(kq_cmpr, "kq_view", il);

-    ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+    ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq_cmpr, kq_mask, kq_scale, hparams.f_max_alibi_bias);
     cb(kq_soft_max, "kq_soft_max", il);

     kq_soft_max = ggml_view_2d(ctx0, kq_soft_max,
@@ -1579,24 +1579,24 @@ ggml_tensor * llm_graph_context::build_attn_mla(
             0);
     cb(kq_soft_max, "kq_soft_max_view", il);

-    ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, v_compressed_trans, kq_soft_max);
-    cb(kqv_compressed, "kqv_compressed,", il);
+    ggml_tensor * kqv_cmpr = ggml_mul_mat(ctx0, v_cmpr_trans, kq_soft_max);
+    cb(kqv_cmpr, "kqv_cmpr,", il);

-    kqv_compressed = ggml_view_3d(ctx0, kqv_compressed,
-            n_embd_v_compressed, n_tokens, n_head,
-            ggml_row_size(kqv_compressed->type, n_embd_v_compressed),
-            ggml_row_size(kqv_compressed->type, n_embd_v_compressed)*n_tokens,
+    kqv_cmpr = ggml_view_3d(ctx0, kqv_cmpr,
+            n_embd_v_cmpr, n_tokens, n_head,
+            ggml_row_size(kqv_cmpr->type, n_embd_v_cmpr),
+            ggml_row_size(kqv_cmpr->type, n_embd_v_cmpr)*n_tokens,
             0);
-    cb(kqv_compressed, "kqv_compressed_view", il);
+    cb(kqv_cmpr, "kqv_cmpr_view", il);

-    ggml_tensor * wv_decompress_view = ggml_view_3d(ctx0, wv_decompress,
-            n_embd_v_compressed, n_embd_head_v, n_head,
-            ggml_row_size(wv_decompress->type, n_embd_v_compressed),
-            ggml_row_size(wv_decompress->type, n_embd_v_compressed)*n_embd_head_v,
+    ggml_tensor * wv_b_view = ggml_view_3d(ctx0, wv_b,
+            n_embd_v_cmpr, n_embd_head_v, n_head,
+            ggml_row_size(wv_b->type, n_embd_v_cmpr),
+            ggml_row_size(wv_b->type, n_embd_v_cmpr)*n_embd_head_v,
             0);
-    cb(wv_decompress_view, "wv_decompress_view", il);
+    cb(wv_b_view, "wv_b_view", il);

-    ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_decompress_view, kqv_compressed);
+    ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_view, kqv_cmpr);
     cb(kqv, "kqv", il);

     kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
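
For orientation, the hunks above evaluate attention directly in the compressed (latent) space: the KV cache holds only the latent rows plus the RoPE'd key part, and decompression happens once, after the softmax-weighted sum. A minimal per-head sketch in illustrative notation (c_t standing for the cached latent of width kv_lora_rank, k^{rope}_t for the cached RoPE part of width n_rot, W^{UV}_h for the per-head slice of the tensor passed in as wv_b), not a line-by-line restatement of the code:

    s_{h,t} = \tilde{q}_h^{\top} \begin{bmatrix} c_t \\ k^{rope}_t \end{bmatrix}
    a_{h,t} = \mathrm{softmax}_t\left( \texttt{kq\_scale} \cdot s_{h,t} + \mathrm{mask}_t \right)
    o_h     = W^{UV}_h \sum_t a_{h,t} \, c_t

This is why the score rows have width n_embd_k_cmpr = kv_lora_rank + n_rot, why kqv_cmpr is only n_embd_v_cmpr = kv_lora_rank wide, and why wv_b_view is applied as the very last matrix multiply.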

src/llama-graph.h

Lines changed: 1 addition & 1 deletion
@@ -540,7 +540,7 @@ struct llm_graph_context {
     ggml_tensor * build_attn_mla(
             llm_graph_input_attn_kv_unified * inp,
             ggml_cgraph * gf,
-            ggml_tensor * wv_decompress,
+            ggml_tensor * wv_b,
             ggml_tensor * wo,
             ggml_tensor * q_cur, // [n_embd_k, n_tokens, n_head]
             ggml_tensor * k_cur, // [n_embd_k, n_tokens]

src/llama-model.cpp

Lines changed: 27 additions & 31 deletions
@@ -9571,21 +9571,21 @@ struct llm_build_deepseek2 : public llm_graph_context {
                     ggml_row_size(q->type, n_embd_head_qk_nope));
             cb(q_pe, "q_pe", il);

-            ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
-            cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+            ggml_tensor * kv_pe_cmprresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+            cb(kv_pe_cmprresseed, "kv_pe_cmprresseed", il);

             // split into {kv_lora_rank, n_tokens}
-            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
-                    kv_pe_compresseed->nb[1],
+            ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_pe_cmprresseed, kv_lora_rank, n_tokens,
+                    kv_pe_cmprresseed->nb[1],
                     0);
-            cb(kv_compressed, "kv_compressed", il);
+            cb(kv_cmpr, "kv_cmpr", il);

             // and {n_embd_head_qk_rope, n_tokens}
-            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed,
+            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_cmprresseed,
                     n_embd_head_qk_rope, 1, n_tokens,
-                    kv_pe_compresseed->nb[1],
-                    kv_pe_compresseed->nb[1],
-                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                    kv_pe_cmprresseed->nb[1],
+                    kv_pe_cmprresseed->nb[1],
+                    ggml_row_size(kv_pe_cmprresseed->type, kv_lora_rank));
             cb(k_pe, "k_pe", il);

             // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
@@ -9605,11 +9605,11 @@ struct llm_build_deepseek2 : public llm_graph_context {
             cb(k_pe, "k_pe", il);

             // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
-            kv_compressed = ggml_cont(ctx0, kv_compressed);
-            kv_compressed = build_norm(kv_compressed,
+            kv_cmpr = ggml_cont(ctx0, kv_cmpr);
+            kv_cmpr = build_norm(kv_cmpr,
                     model.layers[il].attn_kv_a_norm, nullptr,
                     LLM_NORM_RMS, il);
-            cb(kv_compressed, "kv_compressed", il);
+            cb(kv_cmpr, "kv_cmpr", il);

             if (cparams.mla_attn) {
                 // note: deepseek with MLA option converts into MQA (ie: GQA with 1 group)
@@ -9633,20 +9633,18 @@ struct llm_build_deepseek2 : public llm_graph_context {
                         0);
                 cb(wk_b, "wk_b", il);

-                // note: this operation *MUST* use F32 (or have `wk_b` stored as F32 or BF16 in the GGUF)
+                // note: this operation *MUST* use F32 or it will cause gibberish output
                 ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b, q_nope);
-                //if (wk_b->type != GGML_TYPE_F32 && wk_b->type != GGML_TYPE_BF16) {
-                    ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
-                //}
+                ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
                 cb(q_nope_absorbed, "q_nope_absorbed", il);

                 ggml_tensor * q_states = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
                 cb(q_states, "q_states", il);

-                ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed, k_pe, 0);
+                ggml_tensor * k_states = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
                 cb(k_states, "k_states", il);

-                ggml_tensor * v_states = kv_compressed;
+                ggml_tensor * v_states = kv_cmpr;
                 cb(v_states, "v_states", il);

                 cur = build_attn_mla(inp_attn, gf,
@@ -9655,27 +9653,25 @@ struct llm_build_deepseek2 : public llm_graph_context {
             } else {
                 // note: deepseek without MLA option converts into MHA

-                // note: this operation *MUST* use F32 (or have `wkv_b` stored as F32 or BF16 in the GGUF)
-                ggml_tensor * kv_decompressed = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
-                //if (model.layers[il].wkv_b->type != GGML_TYPE_F32 && model.layers[il].wkv_b->type != GGML_TYPE_BF16) {
-                    ggml_mul_mat_set_prec(kv_decompressed, GGML_PREC_F32);
-                //}
-                cb(kv_decompressed, "kv_decompressed", il);
+                // note: this operation *MUST* use F32 or it will cause gibberish output
+                ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+                ggml_mul_mat_set_prec(kv, GGML_PREC_F32);
+                cb(kv, "kv", il);

                 // split into {n_head * n_embd_head_qk_nope, n_tokens}
-                ggml_tensor * k_nope = ggml_view_3d(ctx0, kv_decompressed,
+                ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
                         n_embd_head_qk_nope, n_head, n_tokens,
-                        ggml_row_size(kv_decompressed->type, n_embd_head_qk_nope + n_embd_head_v),
-                        ggml_row_size(kv_decompressed->type, n_head * (n_embd_head_qk_nope + n_embd_head_v)),
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + n_embd_head_v)),
                         0);
                 cb(k_nope, "k_nope", il);

                 // and {n_head * n_embd_head_v, n_tokens}
-                ggml_tensor * v_states = ggml_view_3d(ctx0, kv_decompressed,
+                ggml_tensor * v_states = ggml_view_3d(ctx0, kv,
                         n_embd_head_v, n_head, n_tokens,
-                        ggml_row_size(kv_decompressed->type, (n_embd_head_qk_nope + n_embd_head_v)),
-                        ggml_row_size(kv_decompressed->type, (n_embd_head_qk_nope + n_embd_head_v)*n_head),
-                        ggml_row_size(kv_decompressed->type, (n_embd_head_qk_nope)));
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
                 cb(v_states, "v_states", il);

                 v_states = ggml_cont(ctx0, v_states);
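
For reference, the q_nope_absorbed step in the MLA branch above relies on the usual MLA weight-absorption identity. A short sketch in illustrative notation, assuming wk_b holds the per-head key-decompression weights (the key half of wkv_b) and writing c_t for kv_cmpr, so that the non-RoPE key is k^{nope}_{h,t} = W^{UK}_h c_t:

    q^{nope\top}_h k^{nope}_{h,t} = q^{nope\top}_h \left( W^{UK}_h c_t \right) = \left( W^{UK\top}_h q^{nope}_h \right)^{\top} c_t

Concatenating the absorbed query with q_pe (q_states) and the latent with k_pe (k_states) therefore lets build_attn_mla reproduce the full attention scores without materializing per-head keys (up to ggml_mul_mat's transposition conventions), and ggml_mul_mat_set_prec(..., GGML_PREC_F32) requests F32 precision for that product, which is what the "*MUST* use F32" notes refer to.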
