@@ -2132,7 +2132,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];
 
-        if (arch == LLM_ARCH_BERT) {
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+        if (!layer.wqkv) {
             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
             layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
 
@@ -2141,12 +2144,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
             layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-        } else {
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-        }
-
-        if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
-            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
         }
 
         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
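
This hunk replaces per-architecture branching with a capability check: both fused QKV tensors are created with TENSOR_NOT_REQUIRED, so create_tensor yields nullptr when the tensor is absent from the GGUF file, and the loader falls back to separate Q/K/V projections. Below is a minimal standalone sketch of that pattern, using hypothetical stand-ins for the tensor type and create_tensor (not llama.cpp's actual signatures):

    #include <cstdio>

    struct tensor { const char * name; };

    // Hypothetical stand-in: returns nullptr when the named tensor is missing,
    // mirroring what TENSOR_NOT_REQUIRED permits in llama.cpp's create_tensor.
    static tensor * create_optional(const char * name, bool present_in_gguf) {
        static tensor fused = { "attn_qkv.weight" };
        (void) name;
        return present_in_gguf ? &fused : nullptr;
    }

    int main() {
        tensor * wqkv = create_optional("blk.0.attn_qkv.weight", /*present_in_gguf=*/false);
        if (!wqkv) {
            // Fallback path: the split projections remain mandatory (flag 0),
            // so a checkpoint with neither layout still fails loudly at load.
            std::printf("loading separate wq/wk/wv\n");
        } else {
            std::printf("loading fused wqkv\n");
        }
        return 0;
    }
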
@@ -5910,48 +5907,44 @@ struct llm_build_bert : public llm_graph_context {
         ggml_tensor * Vcur;
 
         // self-attention
-        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-            Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
-
-            if (model.layers[il].attn_q_norm) {
-                Qcur = build_norm(Qcur,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, il);
-            }
-
-            Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
-
-            if (model.layers[il].attn_k_norm) {
-                Kcur = build_norm(Kcur,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, il);
-            }
-
-            Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-        } else {
-            // compute Q and K and RoPE them
+        if (model.layers[il].wqkv) {
             cur = build_lora_mm(model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
 
-            if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            if (model.layers[il].bqkv) {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
             }
 
             Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
             Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
             Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+        } else {
+            Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+            Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+            Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+        }
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+        if (model.layers[il].attn_q_norm) {
+            Qcur = build_norm(Qcur,
+                    model.layers[il].attn_q_norm,
+                    model.layers[il].attn_q_norm_b,
+                    LLM_NORM, il);
+        }
+
+        if (model.layers[il].attn_k_norm) {
+            Kcur = build_norm(Kcur,
+                    model.layers[il].attn_k_norm,
+                    model.layers[il].attn_k_norm_b,
+                    LLM_NORM, il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
+        // RoPE
+        if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
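
The offsets in the ggml_view_2d calls above assume F32 activations (hence sizeof(float)) and a fused output row laid out as [Q | K | V]. As a worked check of that arithmetic, here is a small sketch with example dimensions only (a BERT-base-like shape where n_embd = 768 and n_head_kv == n_head, so n_embd_gqa == n_embd; these numbers are not taken from the PR):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Assumed example dims, not from any specific model file.
        const size_t n_embd     = 768;
        const size_t n_embd_gqa = 768; // equals n_embd when n_head_kv == n_head

        // One token's fused row holds n_embd + 2*n_embd_gqa floats;
        // cur->nb[1] in the diff is this row stride in bytes.
        const size_t nb1 = sizeof(float) * (n_embd + 2*n_embd_gqa);

        // Byte offsets handed to ggml_view_2d for the three slices:
        const size_t q_off = 0*sizeof(float)*(n_embd);              // Q starts the row
        const size_t k_off = 1*sizeof(float)*(n_embd);              // K follows Q
        const size_t v_off = 1*sizeof(float)*(n_embd + n_embd_gqa); // V follows K

        std::printf("row stride = %zu bytes; Q@%zu K@%zu V@%zu\n",
                nb1, q_off, k_off, v_off);
        return 0;
    }

Because the Q/K/V slices share the fused tensor's memory, each view is wrapped in ggml_cont in the diff to materialize a contiguous copy before the 3D reshape.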