Commit b928f8c

use old arch to run z1 (reuse eos & half rope and so on)
1 parent 5106764 commit b928f8c

File tree: 4 files changed, +34 -18 lines


convert_hf_to_gguf.py

Lines changed: 6 additions & 17 deletions
@@ -4929,23 +4929,7 @@ def prepare_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
-@Model.register("Glm4ForCausalLM")
-class Glm4Model(Model):
-    model_arch = gguf.MODEL_ARCH.GLM4
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
-
-
-@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
+@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration", "Glm4ForCausalLM")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM
 
@@ -5085,6 +5069,11 @@ def set_gguf_parameters(self):
         rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         self.gguf_writer.add_add_bos_token(False)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
         rope_freq = 10000
         if "rope_ratio" in self.hparams:
             rope_freq = rope_freq * self.hparams["rope_ratio"]
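
Note: the removed Glm4Model class is folded into ChatGLMModel, so Glm4ForCausalLM checkpoints now take the CHATGLM conversion path and the YaRN rope-scaling handling moves with them. A minimal sketch of that branch, assuming a hypothetical GLM-4/Z1-style config.json fragment (the values below are illustrative, not taken from this commit):

    # Sketch of the YaRN branch that now lives in ChatGLMModel.set_gguf_parameters().
    # The hparams dict is a hypothetical config fragment; only the conditional logic
    # mirrors the diff above.
    hparams = {
        "rope_scaling": {
            "type": "yarn",
            "factor": 4.0,
            "original_max_position_embeddings": 32768,
        },
    }

    if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
        if hparams["rope_scaling"].get("type") == "yarn":
            # In the converter these values feed gguf_writer.add_rope_scaling_type(YARN),
            # add_rope_scaling_factor(...) and add_rope_scaling_orig_ctx_len(...).
            print("rope scaling type  :", "yarn")
            print("rope scaling factor:", hparams["rope_scaling"]["factor"])
            print("original ctx length:", hparams["rope_scaling"]["original_max_position_embeddings"])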

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
@@ -1570,6 +1570,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
     ],
     MODEL_ARCH.GLM4 : [
         MODEL_TENSOR.TOKEN_EMBD,

src/llama-arch.cpp

Lines changed: 2 additions & 0 deletions
@@ -1155,6 +1155,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
             { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
         },
     },
     {
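
The two new name mappings pair with the MODEL_TENSOR additions in constants.py above: a CHATGLM-arch GGUF can now carry per-layer post-attention and post-FFN norm weights. A small sketch of the names these templates expand to, plus an optional sanity check on a converted file; the file path and layer count are illustrative, and the check assumes the gguf-py GGUFReader API (reader.tensors entries with a .name field):

    # Expand the printf-style templates into per-layer tensor names.
    # n_layer = 61 is illustrative; it matches the 61-layer case added in llama-model.cpp below.
    templates = ["blk.%d.post_attention_norm", "blk.%d.post_ffw_norm"]
    n_layer = 61
    expected = [t % i for t in templates for i in range(n_layer)]
    print(expected[0], "...", expected[-1])  # blk.0.post_attention_norm ... blk.60.post_ffw_norm

    # Optional check of a converted file (path is hypothetical): list any per-layer
    # post-norm tensors that are not present in the GGUF.
    from gguf import GGUFReader
    names = {t.name for t in GGUFReader("glm-z1-32b.gguf").tensors}
    print("missing:", [n for n in expected if n not in names])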

src/llama-model.cpp

Lines changed: 24 additions & 1 deletion
@@ -1204,6 +1204,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                                 type = LLM_TYPE_9B;
                             }
                         } break;
+                    case 61: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -3475,7 +3476,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
+                    // if output is NULL, init from the input tok embed
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
@@ -3494,11 +3495,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
 
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_GLM4:
@@ -10911,12 +10916,22 @@ struct llm_build_chatglm : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // Post-attention norm (new!)
+            if (model.layers[il].attn_post_norm){
+                cur = build_norm(cur,
+                        model.layers[il].attn_post_norm,
+                        NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "post_attn_norm", il);
+            }
+
             // Add the input
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // FF
             {
+                // Pre-MLP norm
                 cur = build_norm(ffn_inp,
                         model.layers[il].ffn_norm,
                         NULL,
@@ -10931,6 +10946,14 @@ struct llm_build_chatglm : public llm_graph_context {
                         LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
                 cb(cur, "ffn_out", il);
 
+                // Post-MLP norm
+                if(model.layers[il].ffn_post_norm){
+                    cur = build_norm(cur,
+                            model.layers[il].ffn_post_norm,
+                            NULL,
+                            LLM_NORM_RMS, il);
+                    cb(cur, "post_mlp_norm", il);
+                }
             }
 
             inpL = ggml_add(ctx0, cur, ffn_inp);

0 commit comments
