@@ -4885,8 +4885,9 @@ struct llama_model_loader {
         return cur;
     }

-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED   = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED   = 1 << 1;
+    static const int TENSOR_SKIP         = 1 << 2;

     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
@@ -4895,6 +4896,17 @@ struct llama_model_loader {
             return NULL;
         }

+        // skip unused tensors
+        if (flags & TENSOR_SKIP) {
+            const size_t nbytes = ggml_nbytes(cur);
+            LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", name.c_str(), nbytes);
+
+            size_data -= nbytes;
+            n_created++;
+
+            return nullptr;
+        }
+
         return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }

@@ -7581,6 +7593,10 @@ static bool llm_load_tensors(

     LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);

+    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
+    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
+
     // create tensors for the weights
     {
         // note: cast to int64_t since we will use these for the tensor dimensions
@@ -9201,63 +9217,69 @@ static bool llm_load_tensors(
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);

+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
                         auto & layer = model.layers[i];

-                        layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);

                         // GLM-style attention with bias terms
-                        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
-                        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
-                        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-                        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0);
-                        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0);
-                        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0);
+                        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+                        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+                        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+                        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);

-                        layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+                        layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

                         // K/Q norm tensors (optional for GLM-4.5 355B variant)
                         layer.attn_q_norm = create_tensor(ctx_layer,
-                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
                         layer.attn_k_norm = create_tensor(ctx_layer,
-                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);

-                        layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);

                         // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
                         // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
                         const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);

                         if (use_moe) {
                             // MoE layers
-                            layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                            layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                             // gate bias
-                            layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, 0);
+                            layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);

                             // MoE branch
                             const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

                             layer.ffn_gate_exps = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                             layer.ffn_down_exps = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                             layer.ffn_up_exps = create_tensor(ctx_split,
-                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);

                             // Shared expert
                             if (n_expert_shared > 0) {
                                 const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                                 layer.ffn_gate_shexp = create_tensor(ctx_split,
-                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                                 layer.ffn_down_shexp = create_tensor(ctx_split,
-                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                                 layer.ffn_up_shexp = create_tensor(ctx_split,
-                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                             }
                         } else {
                             // Dense layers (first k layers) - GLM uses separate gate/up projections
-                            layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-                            layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                            layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+                            layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                            layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                            layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
                         }
                         // --- NextN / MTP tensors (preserved but unused), on the final layer ---
                         if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
@@ -9266,33 +9288,32 @@ static bool llm_load_tensors(
                             layer.nextn.eh_proj = create_tensor(ctx_for_layer(final_layer),
                                 tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", final_layer),
                                 { 2*n_embd, n_embd },
-                                llama_model_loader::TENSOR_NOT_REQUIRED);
+                                flags);
                             // EMBED_TOKENS: [embd, vocab]
                             layer.nextn.embed_tokens = create_tensor(ctx_for_layer(final_layer),
                                 tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", final_layer),
                                 { n_embd, n_vocab },
-                                llama_model_loader::TENSOR_NOT_REQUIRED);
+                                flags);
                             // ENORM, HNORM: [embd]
                             layer.nextn.enorm = create_tensor(ctx_for_layer(final_layer),
                                 tn(LLM_TENSOR_NEXTN_ENORM, "weight", final_layer),
                                 { n_embd },
-                                llama_model_loader::TENSOR_NOT_REQUIRED);
+                                flags);
                             layer.nextn.hnorm = create_tensor(ctx_for_layer(final_layer),
                                 tn(LLM_TENSOR_NEXTN_HNORM, "weight", final_layer),
                                 { n_embd },
-                                llama_model_loader::TENSOR_NOT_REQUIRED);
+                                flags);
                             // SHARED_HEAD_HEAD: [embd, vocab]
                             layer.nextn.shared_head_head = create_tensor(ctx_for_layer(final_layer),
                                 tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", final_layer),
                                 { n_embd, n_vocab },
-                                llama_model_loader::TENSOR_NOT_REQUIRED);
+                                flags);
                             // SHARED_HEAD_NORM: [embd]
                             layer.nextn.shared_head_norm = create_tensor(ctx_for_layer(final_layer),
                                 tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", final_layer),
                                 { n_embd },
-                                llama_model_loader::TENSOR_NOT_REQUIRED);
+                                flags);
                         }
-
                     }
                 }
                 break;
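Note: the following is a minimal, self-contained C++ sketch (a hypothetical toy_loader, not code from this patch or from llama.cpp) illustrating the intent of the flag scheme above: TENSOR_NOT_REQUIRED, TENSOR_DUPLICATED and the new TENSOR_SKIP occupy separate bits so they can be OR-ed together per layer, and a skipped tensor still counts as "created" while its bytes are subtracted from the amount of data to be loaded, which is how the NextN/MTP layers are kept in the GGUF but never read.

#include <cstddef>
#include <cstdio>
#include <string>

struct toy_loader {
    // separate bits, mirroring the constants added to llama_model_loader above
    static const int TENSOR_NOT_REQUIRED = 1 << 0;
    static const int TENSOR_DUPLICATED   = 1 << 1;
    static const int TENSOR_SKIP         = 1 << 2;

    std::size_t size_data = 0; // total bytes that will be read from the model file
    int         n_created = 0; // tensors accounted for so far

    // hypothetical stand-in for create_tensor(); returns nullptr for skipped tensors
    const char * create_tensor(const std::string & name, std::size_t nbytes, int flags = 0) {
        size_data += nbytes; // pretend these bytes were accounted for when the metadata was read

        if (flags & TENSOR_SKIP) {
            std::printf("unused tensor %s (size = %zu bytes) -- ignoring\n", name.c_str(), nbytes);
            size_data -= nbytes; // its data will never be read
            n_created++;         // but it still counts toward the created-tensor sanity check
            return nullptr;      // caller gets no tensor, as in the patch
        }

        n_created++;
        return "tensor";
    }
};

int main() {
    toy_loader ml;

    int flags = 0;
    flags |= toy_loader::TENSOR_SKIP; // e.g. a NextN/MTP layer that is never evaluated

    // separate bits compose, e.g. an optional tensor inside a skipped layer
    ml.create_tensor("blk.46.attn_q_norm.weight",   1024, toy_loader::TENSOR_NOT_REQUIRED | flags);
    ml.create_tensor("blk.46.nextn.eh_proj.weight", 4096, flags);

    std::printf("created = %d, bytes left to load = %zu\n", ml.n_created, ml.size_data);
    return 0;
}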