@@ -4885,8 +4885,9 @@ struct llama_model_loader {
         return cur;
     }
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED = 1 << 1;
+    static const int TENSOR_SKIP = 1 << 2;
 
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
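
The constants now occupy distinct bit positions, so callers can OR several of them into the single `flags` argument of `create_tensor` (the GLM-4.5 hunks below pass `llama_model_loader::TENSOR_NOT_REQUIRED | flags`, for example). Below is a minimal standalone sketch of the bit-flag pattern; it is illustrative only and assumes nothing about `create_tensor` beyond what the hunk shows.

    // Illustration only (not part of the patch): combining and testing the loader flags as bit masks.
    #include <cstdio>

    static const int TENSOR_NOT_REQUIRED = 1 << 0;
    static const int TENSOR_DUPLICATED   = 1 << 1;
    static const int TENSOR_SKIP         = 1 << 2;

    int main() {
        const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP;  // optional and skipped
        // Each test isolates one bit, so any combination stays distinguishable.
        std::printf("required=%d duplicated=%d skip=%d\n",
                    !(flags & TENSOR_NOT_REQUIRED),
                    !!(flags & TENSOR_DUPLICATED),
                    !!(flags & TENSOR_SKIP));
        return 0;
    }
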
@@ -7581,6 +7582,10 @@ static bool llm_load_tensors(
 
     LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
 
+    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
+    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
+
     // create tensors for the weights
     {
         // note: cast to int64_t since we will use these for the tensor dimensions
@@ -9201,63 +9206,69 @@ static bool llm_load_tensors(
                     ggml_context * ctx_layer = ctx_for_layer(i);
                     ggml_context * ctx_split = ctx_for_layer_split(i);
 
+                    int flags = 0;
+                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                        // skip all tensors in the NextN layers
+                        flags |= TENSOR_SKIP;
+                    }
+
                     auto & layer = model.layers[i];
 
-                    layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
 
                     // GLM-style attention with bias terms
-                    layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
-                    layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
-                    layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-                    layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0);
-                    layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0);
-                    layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0);
+                    layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+                    layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+                    layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+                    layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                    layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                    layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
 
-                    layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+                    layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 
                     // K/Q norm tensors (optional for GLM-4.5 355B variant)
                     layer.attn_q_norm = create_tensor(ctx_layer,
-                        tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
                     layer.attn_k_norm = create_tensor(ctx_layer,
-                        tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
 
-                    layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+                    layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
 
                     // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
                     // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
                     const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
 
                     if (use_moe) {
                         // MoE layers
-                        layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                         // gate bias
-                        layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, 0);
+                        layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
 
                         // MoE branch
                         const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
 
                         layer.ffn_gate_exps = create_tensor(ctx_split,
-                            tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                            tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                         layer.ffn_down_exps = create_tensor(ctx_split,
-                            tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                            tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                         layer.ffn_up_exps = create_tensor(ctx_split,
-                            tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                            tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
 
                         // Shared expert
                         if (n_expert_shared > 0) {
                             const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                             layer.ffn_gate_shexp = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                                tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                             layer.ffn_down_shexp = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+                                tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                             layer.ffn_up_shexp = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                                tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                         }
                     } else {
                         // Dense layers (first k layers) - GLM uses separate gate/up projections
-                        layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-                        layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                        layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+                        layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                        layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                        layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
                     }
                     // --- NextN / MTP tensors (preserved but unused), on the final layer ---
                     if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
@@ -9266,33 +9277,32 @@ static bool llm_load_tensors(
                         layer.nextn.eh_proj = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", final_layer),
                             { 2*n_embd, n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // EMBED_TOKENS: [embd, vocab]
                         layer.nextn.embed_tokens = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", final_layer),
                             { n_embd, n_vocab },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // ENORM, HNORM: [embd]
                         layer.nextn.enorm = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_ENORM, "weight", final_layer),
                             { n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         layer.nextn.hnorm = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_HNORM, "weight", final_layer),
                             { n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // SHARED_HEAD_HEAD: [embd, vocab]
                         layer.nextn.shared_head_head = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", final_layer),
                             { n_embd, n_vocab },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                         // SHARED_HEAD_NORM: [embd]
                         layer.nextn.shared_head_norm = create_tensor(ctx_for_layer(final_layer),
                             tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", final_layer),
                             { n_embd },
-                            llama_model_loader::TENSOR_NOT_REQUIRED);
+                            flags);
                     }
-
                 }
             }
             break;
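
Taken together, these hunks mark every tensor of the trailing NextN/MTP layers with `TENSOR_SKIP` instead of `TENSOR_NOT_REQUIRED`. The sketch below is illustrative and not part of the patch (the layer counts are hypothetical); it simply replays the index arithmetic used above to decide which layers receive the skip flag.

    // Illustration only: which layer indices end up with TENSOR_SKIP.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t  n_layer              = 93; // hypothetical total layer count
        const uint32_t nextn_predict_layers = 1;  // hypothetical NextN layer count

        for (int64_t i = 0; i < n_layer; ++i) {
            // Same condition as in the diff: only the last nextn_predict_layers layers match.
            const bool skip = nextn_predict_layers > 0 &&
                              static_cast<uint32_t>(i) >= n_layer - nextn_predict_layers;
            if (skip) {
                std::printf("layer %lld: NextN/MTP, tensors created with TENSOR_SKIP\n",
                            (long long) i);
            }
        }
        return 0;
    }
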