
Commit fca1387

Merge pull request #17 from Thireus/glm-4.5

Glm 4.5

2 parents 25e40ca + fb08cdf, commit fca1387

File tree: 1 file changed, +52 -31 lines

src/llama.cpp

Lines changed: 52 additions & 31 deletions
@@ -4885,8 +4885,9 @@ struct llama_model_loader {
         return cur;
     }
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED = 1 << 1;
+    static const int TENSOR_SKIP = 1 << 2;
 
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
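
Note on the flag change: the loader flags are now written as explicit single-bit values so that several of them can be OR-combined into one flags argument and tested independently with a bitwise AND, which the later hunks rely on (e.g. llama_model_loader::TENSOR_NOT_REQUIRED | flags). Below is a minimal standalone sketch of that bit-flag pattern; the constant names mirror the ones above, everything else is illustrative rather than the real loader code.

    #include <cstdio>

    // Same bit layout as the constants introduced in this hunk.
    static const int TENSOR_NOT_REQUIRED = 1 << 0; // bit 0
    static const int TENSOR_DUPLICATED   = 1 << 1; // bit 1
    static const int TENSOR_SKIP         = 1 << 2; // bit 2 (new)

    int main() {
        // Several flags can be packed into a single int ...
        const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP;

        // ... and each one tested on its own with a bitwise AND.
        std::printf("not required: %d\n", (flags & TENSOR_NOT_REQUIRED) != 0); // 1
        std::printf("duplicated:   %d\n", (flags & TENSOR_DUPLICATED)   != 0); // 0
        std::printf("skip:         %d\n", (flags & TENSOR_SKIP)         != 0); // 1
        return 0;
    }
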
@@ -4895,6 +4896,17 @@ struct llama_model_loader {
             return NULL;
         }
 
+        // skip unused tensors
+        if (flags & TENSOR_SKIP) {
+            const size_t nbytes = ggml_nbytes(cur);
+            LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", name.c_str(), nbytes);
+
+            size_data -= nbytes;
+            n_created++;
+
+            return nullptr;
+        }
+
         return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
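
Note on the skip branch: when TENSOR_SKIP is set, create_tensor() acknowledges the tensor without materializing it. It logs a warning, removes the tensor's bytes from size_data so the loader's size/progress accounting stays consistent, increments n_created so the tensor-count bookkeeping stays in step, and returns nullptr, which callers must tolerate. A simplified sketch of that bookkeeping follows; size_data, n_created, the dummy tensor type, and the tensor names are illustrative stand-ins, not the actual loader members.

    #include <cstddef>
    #include <cstdio>

    static const int TENSOR_SKIP = 1 << 2;

    // Illustrative stand-ins for the loader's bookkeeping state.
    static size_t size_data = 0; // bytes still expected to be loaded
    static int    n_created = 0; // tensors accounted for so far

    struct dummy_tensor { size_t nbytes; };

    // Sketch of a create_tensor()-like helper with the skip branch from this hunk.
    static dummy_tensor * create_tensor_sketch(const char * name, size_t nbytes, int flags) {
        if (flags & TENSOR_SKIP) {
            std::printf("model has unused tensor %s (size = %zu bytes) -- ignoring\n", name, nbytes);
            size_data -= nbytes; // keep the byte accounting consistent
            n_created++;         // keep the tensor count consistent
            return nullptr;      // the tensor is never materialized
        }
        n_created++;
        return new dummy_tensor{nbytes}; // stand-in for the real allocation path
    }

    int main() {
        size_data = 4096; // pretend two 2 KiB tensors are pending
        create_tensor_sketch("some.unused.weight", 2048, TENSOR_SKIP);
        dummy_tensor * kept = create_tensor_sketch("some.used.weight", 2048, 0);
        std::printf("remaining bytes = %zu, tensors accounted for = %d\n", size_data, n_created);
        delete kept;
        return 0;
    }
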

@@ -7581,6 +7593,10 @@ static bool llm_load_tensors(
 
     LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
 
+    const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+    const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+
     // create tensors for the weights
     {
         // note: cast to int64_t since we will use these for the tensor dimensions
@@ -9201,63 +9217,69 @@ static bool llm_load_tensors(
             ggml_context * ctx_layer = ctx_for_layer(i);
             ggml_context * ctx_split = ctx_for_layer_split(i);
 
+            int flags = 0;
+            if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                // skip all tensors in the NextN layers
+                flags |= TENSOR_SKIP;
+            }
+
             auto & layer = model.layers[i];
 
-            layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+            layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
 
             // GLM-style attention with bias terms
-            layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
-            layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
-            layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0);
-            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0);
-            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0);
+            layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
+            layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
+            layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
+            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
 
-            layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+            layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 
             // K/Q norm tensors (optional for GLM-4.5 355B variant)
             layer.attn_q_norm = create_tensor(ctx_layer,
-                tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
             layer.attn_k_norm = create_tensor(ctx_layer,
-                tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
 
-            layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+            layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
 
             // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
             // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
             const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
 
             if (use_moe) {
                 // MoE layers
-                layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                 // gate bias
-                layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, 0);
+                layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
 
                 // MoE branch
                 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
 
                 layer.ffn_gate_exps = create_tensor(ctx_split,
-                    tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                    tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                 layer.ffn_down_exps = create_tensor(ctx_split,
-                    tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                    tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                 layer.ffn_up_exps = create_tensor(ctx_split,
-                    tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                    tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
 
                 // Shared expert
                 if (n_expert_shared > 0) {
                     const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                     layer.ffn_gate_shexp = create_tensor(ctx_split,
-                        tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                        tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                     layer.ffn_down_shexp = create_tensor(ctx_split,
-                        tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+                        tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                     layer.ffn_up_shexp = create_tensor(ctx_split,
-                        tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                        tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                 }
             } else {
                 // Dense layers (first k layers) - GLM uses separate gate/up projections
-                layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-                layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+                layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+                layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+                layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
             }
             // --- NextN / MTP tensors (preserved but unused), on the final layer ---
            if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
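
Note on the NextN condition: the flags value computed at the top of the layer loop marks every tensor in the trailing NextN / MTP (multi-token prediction) layers with TENSOR_SKIP. GLM-4.5 ships those layers in the GGUF, but this code path never executes them, so their tensors are skipped at load time instead of being loaded as optional. A small sketch of the layer-index test; the layer counts used here are illustrative, not taken from the GLM-4.5 metadata.

    #include <cstdint>
    #include <cstdio>

    // Mirrors the condition above: the last nextn_predict_layers layers
    // of the model are the NextN / MTP layers and receive TENSOR_SKIP.
    static bool is_nextn_layer(uint32_t i, uint32_t n_layer, uint32_t nextn_predict_layers) {
        return nextn_predict_layers > 0 && i >= n_layer - nextn_predict_layers;
    }

    int main() {
        // Illustrative values only.
        const uint32_t n_layer              = 47;
        const uint32_t nextn_predict_layers = 1;

        const uint32_t tests[] = { 0, 45, 46 };
        for (uint32_t i : tests) {
            std::printf("layer %2u -> %s\n", (unsigned) i,
                        is_nextn_layer(i, n_layer, nextn_predict_layers) ? "TENSOR_SKIP (NextN)" : "load normally");
        }
        return 0;
    }
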
@@ -9266,33 +9288,32 @@ static bool llm_load_tensors(
                layer.nextn.eh_proj = create_tensor(ctx_for_layer(final_layer),
                    tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", final_layer),
                    { 2*n_embd, n_embd },
-                   llama_model_loader::TENSOR_NOT_REQUIRED);
+                   flags);
                // EMBED_TOKENS: [embd, vocab]
                layer.nextn.embed_tokens = create_tensor(ctx_for_layer(final_layer),
                    tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", final_layer),
                    { n_embd, n_vocab },
-                   llama_model_loader::TENSOR_NOT_REQUIRED);
+                   flags);
                // ENORM, HNORM: [embd]
                layer.nextn.enorm = create_tensor(ctx_for_layer(final_layer),
                    tn(LLM_TENSOR_NEXTN_ENORM, "weight", final_layer),
                    { n_embd },
-                   llama_model_loader::TENSOR_NOT_REQUIRED);
+                   flags);
                layer.nextn.hnorm = create_tensor(ctx_for_layer(final_layer),
                    tn(LLM_TENSOR_NEXTN_HNORM, "weight", final_layer),
                    { n_embd },
-                   llama_model_loader::TENSOR_NOT_REQUIRED);
+                   flags);
                // SHARED_HEAD_HEAD: [embd, vocab]
                layer.nextn.shared_head_head = create_tensor(ctx_for_layer(final_layer),
                    tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", final_layer),
                    { n_embd, n_vocab },
-                   llama_model_loader::TENSOR_NOT_REQUIRED);
+                   flags);
                // SHARED_HEAD_NORM: [embd]
                layer.nextn.shared_head_norm = create_tensor(ctx_for_layer(final_layer),
                    tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", final_layer),
                    { n_embd },
-                   llama_model_loader::TENSOR_NOT_REQUIRED);
+                   flags);
            }
-
        }
    }
    break;
