Commit 024b68d

allow merged qkv tensor in place of split ones
1 parent 928fea0 commit 024b68d

2 files changed: 31 additions, 37 deletions

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
@@ -450,6 +450,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_TOKEN_TYPES,   "token_types" },
     { LLM_TENSOR_POS_EMBD,      "position_embd" },
     { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+    { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
     { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
     { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
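
For reference, the new "blk.%d.attn_qkv" entry is a per-layer name template: the loader combines it with a "weight" or "bias" suffix (via the tn(...) calls in llama-model.cpp below) to form the tensor name looked up in the model file. A minimal sketch of that expansion, using plain snprintf instead of the actual tn() helper:

// Illustrative sketch only (not the llama.cpp tn() implementation): expand a
// per-layer template such as "blk.%d.attn_qkv" into a concrete tensor name.
#include <cstdio>
#include <string>

static std::string tensor_name(const char * templ, int layer, const char * suffix) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), templ, layer);   // "blk.%d.attn_qkv" -> "blk.0.attn_qkv"
    return std::string(buf) + "." + suffix;          // -> "blk.0.attn_qkv.weight"
}

// tensor_name("blk.%d.attn_qkv", 0, "weight") yields "blk.0.attn_qkv.weight"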

src/llama-model.cpp

Lines changed: 30 additions & 37 deletions
@@ -2132,7 +2132,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (int i = 0; i < n_layer; ++i) {
             auto & layer = layers[i];

-            if (arch == LLM_ARCH_BERT) {
+            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+            if (!layer.wqkv) {
                 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

@@ -2141,12 +2144,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-            } else {
-                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-            }
-
-            if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
             }

             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
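
The hunk above changes the load path to decide by tensor presence rather than by architecture: both merged tensors are requested with TENSOR_NOT_REQUIRED, and the split Q/K/V tensors are only mandatory when no merged weight was found. A self-contained sketch of that fallback rule, with a toy set of tensor names standing in for the model file (the types and helpers here are illustrative, not llama.cpp API):

// Sketch of the "merged tensor takes precedence, split tensors are the
// fallback" rule from the diff above. ModelFile is a hypothetical stand-in
// for the real loader: just the set of tensor names present in a file.
#include <cstdio>
#include <initializer_list>
#include <set>
#include <stdexcept>
#include <string>

using ModelFile = std::set<std::string>;

static void check_attn_tensors(const ModelFile & f, int i) {
    const std::string blk = "blk." + std::to_string(i);
    if (f.count(blk + ".attn_qkv.weight")) {
        return; // merged QKV present -> split tensors not required
    }
    for (const char * suffix : { ".attn_q.weight", ".attn_k.weight", ".attn_v.weight" }) {
        if (!f.count(blk + suffix)) {
            throw std::runtime_error("missing required tensor: " + blk + suffix);
        }
    }
}

int main() {
    ModelFile merged = { "blk.0.attn_qkv.weight" };
    ModelFile split  = { "blk.0.attn_q.weight", "blk.0.attn_k.weight", "blk.0.attn_v.weight" };
    check_attn_tensors(merged, 0);   // accepted: merged layout
    check_attn_tensors(split,  0);   // accepted: split layout
    std::puts("both tensor layouts pass the check");
}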
@@ -5910,48 +5907,44 @@ struct llm_build_bert : public llm_graph_context {
         ggml_tensor * Vcur;

         // self-attention
-        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-            Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
-
-            if (model.layers[il].attn_q_norm) {
-                Qcur = build_norm(Qcur,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, il);
-            }
-
-            Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
-
-            if (model.layers[il].attn_k_norm) {
-                Kcur = build_norm(Kcur,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, il);
-            }
-
-            Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-        } else {
-            // compute Q and K and RoPE them
+        if (model.layers[il].wqkv) {
             cur = build_lora_mm(model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

-            if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            if (model.layers[il].bqkv) {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
             }

             Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
             Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
             Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+        } else {
+            Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+            Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+            Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+        }

-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+        if (model.layers[il].attn_q_norm) {
+            Qcur = build_norm(Qcur,
+                    model.layers[il].attn_q_norm,
+                    model.layers[il].attn_q_norm_b,
+                    LLM_NORM, il);
+        }
+
+        if (model.layers[il].attn_k_norm) {
+            Kcur = build_norm(Kcur,
+                    model.layers[il].attn_k_norm,
+                    model.layers[il].attn_k_norm_b,
+                    LLM_NORM, il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

+        // RoPE
+        if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
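
On the graph side, the fused projection output is carved back into Q, K and V with byte-offset views, as in the ggml_view_2d calls above. A self-contained sketch of just that slicing step with the ggml C API (assumptions: f32 activations, hypothetical sizes, and n_embd_gqa set equal to n_embd, i.e. no grouped-query attention); it only builds the tensors, no graph is computed:

#include "ggml.h"

int main(void) {
    // small scratch context; sizes here are illustrative only
    struct ggml_init_params params = { /*.mem_size =*/ 64*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t n_embd     = 768;       // hypothetical model width
    const int64_t n_embd_gqa = n_embd;    // assume no GQA, so the K/V slices are n_embd wide too
    const int64_t n_tokens   = 8;

    // merged projection: each output row holds [Q | K | V] = n_embd + 2*n_embd_gqa values
    struct ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd + 2*n_embd_gqa);
    struct ggml_tensor * inp  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * cur  = ggml_mul_mat(ctx, wqkv, inp);  // -> [n_embd + 2*n_embd_gqa, n_tokens]

    // slice Q, K and V out of every fused row by byte offset, mirroring the diff
    struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd,     n_tokens, cur->nb[1], 0));
    struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd)));
    struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));

    (void) Qcur; (void) Kcur; (void) Vcur;
    ggml_free(ctx);
    return 0;
}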
