@@ -3267,10 +3267,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 {
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed, duplicated to allow offloading
                     if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
                 }

@@ -3313,10 +3313,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 {
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed, duplicated to allow offloading
                     if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
                 }

@@ -3352,56 +3352,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                         // out_proj
                         layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-
-                        layer.wq = nullptr;
-                        layer.wk = nullptr;
-                        layer.wv = nullptr;
-                        layer.wo = nullptr;
-
                     } else {
                         // Attention layers

                         layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                         layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                         layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ssm_in = nullptr;
-                        layer.ssm_conv1d = nullptr;
-                        layer.ssm_conv1d_b = nullptr;
-                        layer.ssm_x = nullptr;
-                        layer.ssm_dt_norm = nullptr;
-                        layer.ssm_dt = nullptr;
-                        layer.ssm_dt_b = nullptr;
-                        layer.ssm_b_norm = nullptr;
-                        layer.ssm_c_norm = nullptr;
-                        layer.ssm_a = nullptr;
-                        layer.ssm_d = nullptr;
-                        layer.ssm_out = nullptr;
                     }

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

-                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);

                     if (layer.ffn_gate_inp) {
                         // MoE
                         layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-
-                        layer.ffn_gate = nullptr;
-                        layer.ffn_down = nullptr;
-                        layer.ffn_up = nullptr;
                     } else {
                         // FFN (no MoE)
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-                        layer.ffn_gate_exps = nullptr;
-                        layer.ffn_down_exps = nullptr;
-                        layer.ffn_up_exps = nullptr;
                     }
                 }
             } break;
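
Note on the load_tensors hunks above: the call sites now use the unqualified TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED flag names, and the explicit `layer.* = nullptr;` assignments for the branch not taken are simply dropped. Both read as relying on definitions elsewhere in the tree, presumably file-local aliases for the llama_model_loader flags and null default member initializers on llama_layer's tensor fields. The sketch below only illustrates that second assumption with stand-in types; it is not llama.cpp code.

    #include <cassert>

    // Illustration with stand-in types (assumption: llama_layer's tensor
    // members default to nullptr, so only the branch that is taken needs
    // to create its tensors and the removed assignments were redundant).
    struct dummy_tensor {};

    struct dummy_layer {
        dummy_tensor * wq     = nullptr;  // attention weight, absent unless created
        dummy_tensor * ssm_in = nullptr;  // SSM weight, absent unless created
    };

    int main() {
        dummy_layer layer;               // freshly constructed layer
        assert(layer.wq     == nullptr); // no explicit nullptr assignment needed
        assert(layer.ssm_in == nullptr);
        return 0;
    }
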
@@ -10228,7 +10201,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
         // TODO: skip computing output earlier for unused tokens

         y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
-        y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+        y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
         cur = build_lora_mm(layer.ssm_out, y);
@@ -10352,7 +10325,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
         // TODO: skip computing output earlier for unused tokens

         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
-        y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+        y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

         // grouped RMS norm
         y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
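
Note on the two graph hunks above: both replace an explicit y = y * SiLU(z) with the fused ggml_swiglu_split call, passing ggml_cont(ctx0, z) as the gate and y as the value. Assuming the fused op computes SiLU(gate) * value element-wise, which is what makes it a drop-in replacement here, the scalar sketch below checks that equivalence; silu_ref and swiglu_split_ref are illustrative names, not ggml functions.

    #include <cassert>
    #include <cmath>

    // Scalar reference for the assumed semantics: silu(g) = g / (1 + exp(-g)),
    // swiglu_split(gate, value) = silu(gate) * value.
    static float silu_ref(float g) {
        return g / (1.0f + std::exp(-g));
    }

    static float swiglu_split_ref(float gate, float value) {
        return silu_ref(gate) * value;
    }

    int main() {
        const float z = 0.5f, y = 2.0f;
        // old path: y * silu(z); new path: swiglu_split(z, y) -- same result
        assert(std::fabs(y * silu_ref(z) - swiglu_split_ref(z, y)) < 1e-6f);
        return 0;
    }
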