Commit 3f3e384

Handle TENSOR_SKIP
Ported the changes from sammcj/llama.cpp@f129567 and sammcj/llama.cpp@dcbbd2c, except op info, since ik_llama.cpp doesn't support this operation.
1 parent a90aec1 commit 3f3e384
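
For context: the change below turns llama_model_loader's tensor flags into single-bit values so that TENSOR_SKIP can be OR-ed with TENSOR_NOT_REQUIRED on the same tensor, and a per-layer flags value is passed to every create_tensor call. A minimal standalone sketch of that bit-flag pattern follows; only the three constants mirror the diff, while layer_flags, the layer counts, and the printout are illustrative and not ik_llama.cpp code.

// Sketch of the bit-flag pattern introduced by this commit.
// Only the three constants come from the diff; the rest is a toy example.
#include <cstdint>
#include <cstdio>

static const int TENSOR_NOT_REQUIRED = 1 << 0; // optional tensor, may be absent from the model file
static const int TENSOR_DUPLICATED   = 1 << 1; // kept as a distinct bit (meaning unchanged)
static const int TENSOR_SKIP         = 1 << 2; // new: marks tensors to skip (NextN/MTP layers)

// Hypothetical helper mirroring the per-layer logic in the diff:
// layers in the NextN/MTP range get TENSOR_SKIP OR-ed into their flags.
static int layer_flags(uint32_t i, uint32_t n_layer, uint32_t nextn_predict_layers) {
    int flags = 0;
    if (nextn_predict_layers > 0 && i >= n_layer - nextn_predict_layers) {
        flags |= TENSOR_SKIP; // skip all tensors in the NextN layers
    }
    return flags;
}

int main() {
    const uint32_t n_layer = 47, nextn = 1; // made-up layer counts, for illustration only
    for (uint32_t i = 45; i < n_layer; ++i) {
        // Optional tensors combine flags, as attn_q_norm does in the diff:
        const int flags = layer_flags(i, n_layer, nextn) | TENSOR_NOT_REQUIRED;
        std::printf("layer %2u: not_required=%d skip=%d\n", i,
                    (flags & TENSOR_NOT_REQUIRED) != 0, (flags & TENSOR_SKIP) != 0);
    }
    return 0;
}

Making each flag a distinct power of two is what allows combinations such as TENSOR_NOT_REQUIRED | flags in the K/Q norm branch of the diff.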

File tree: 1 file changed, +41 -31 lines

src/llama.cpp

Lines changed: 41 additions & 31 deletions
@@ -4885,8 +4885,9 @@ struct llama_model_loader {
         return cur;
     }
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED = 1 << 1;
+    static const int TENSOR_SKIP = 1 << 2;
 
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
@@ -7581,6 +7582,10 @@ static bool llm_load_tensors(
 
     LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
 
+    const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+
     // create tensors for the weights
     {
         // note: cast to int64_t since we will use these for the tensor dimensions
@@ -9201,63 +9206,69 @@ static bool llm_load_tensors(
                     ggml_context * ctx_layer = ctx_for_layer(i);
                     ggml_context * ctx_split = ctx_for_layer_split(i);
 
+                    int flags = 0;
+                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                        // skip all tensors in the NextN layers
+                        flags |= TENSOR_SKIP;
+                    }
+
                     auto & layer = model.layers[i];
 
-                    layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
 
                     // GLM-style attention with bias terms
-                    layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
-                    layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
-                    layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-                    layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0);
-                    layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0);
-                    layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0);
+                    layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+                    layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+                    layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+                    layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                    layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                    layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
 
-                    layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+                    layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 
                     // K/Q norm tensors (optional for GLM-4.5 355B variant)
                     layer.attn_q_norm = create_tensor(ctx_layer,
-                        tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
                     layer.attn_k_norm = create_tensor(ctx_layer,
-                        tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
 
-                    layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+                    layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
 
                     // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
                     // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
                     const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
 
                     if (use_moe) {
                         // MoE layers
-                        layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                         // gate bias
-                        layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, 0);
+                        layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
 
                         // MoE branch
                         const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
 
                         layer.ffn_gate_exps = create_tensor(ctx_split,
-                            tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                            tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                         layer.ffn_down_exps = create_tensor(ctx_split,
-                            tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                            tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                         layer.ffn_up_exps = create_tensor(ctx_split,
-                            tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                            tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
 
                         // Shared expert
                         if (n_expert_shared > 0) {
                             const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                             layer.ffn_gate_shexp = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                                tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                             layer.ffn_down_shexp = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+                                tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                             layer.ffn_up_shexp = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                                tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                         }
                     } else {
                         // Dense layers (first k layers) - GLM uses separate gate/up projections
-                        layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-                        layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                        layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+                        layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                        layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                        layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
                     }
                     // --- NextN / MTP tensors (preserved but unused), on the final layer ---
                     if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
@@ -9266,33 +9277,32 @@ static bool llm_load_tensors(
                         layer.nextn.eh_proj = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", final_layer),
                             { 2*n_embd, n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // EMBED_TOKENS: [embd, vocab]
                         layer.nextn.embed_tokens = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", final_layer),
                             { n_embd, n_vocab },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // ENORM, HNORM: [embd]
                         layer.nextn.enorm = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_ENORM, "weight", final_layer),
                             { n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         layer.nextn.hnorm = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_HNORM, "weight", final_layer),
                             { n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // SHARED_HEAD_HEAD: [embd, vocab]
                         layer.nextn.shared_head_head = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", final_layer),
                             { n_embd, n_vocab },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // SHARED_HEAD_NORM: [embd]
                         layer.nextn.shared_head_norm = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", final_layer),
                             { n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                     }
-
                 }
             }
             break;
