Commit 5a02bd4

pwilkin and CISC authored
Apply suggestions from code review
Co-authored-by: Sigbjørn Skjæret <[email protected]>
1 parent 58e6e0f commit 5a02bd4

5 files changed: +26 -75 lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
@@ -8772,7 +8772,7 @@ class ApertusModel(LlamaModel):
 
     def modify_tensors(self, data_torch, name, bid):
         # Handle xIELU activation parameters
-        n_layers = self.hparams.get("num_hidden_layers")
+        n_layers = self.hparams["num_hidden_layers"]
         if name.endswith(".act_fn.alpha_n"):
             self._alpha_n[bid] = data_torch.to("cpu").float().item()
             if (len(self._alpha_n) == n_layers):
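
Note on the change above: with dict.get(), a missing "num_hidden_layers" entry silently yields None, so the later len(self._alpha_n) == n_layers check would simply never pass; direct indexing surfaces the problem immediately. A minimal illustrative sketch (the hparams contents here are hypothetical):

hparams = {}  # hypothetical config missing "num_hidden_layers"

n_layers = hparams.get("num_hidden_layers")  # returns None; the failure is deferred and easy to miss
print(len({0: 0.5}) == n_layers)             # prints False forever, no error raised

n_layers = hparams["num_hidden_layers"]      # raises KeyError right away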

gguf-py/gguf/constants.py

Lines changed: 4 additions & 8 deletions
@@ -295,10 +295,10 @@ class Diffusion:
         SHIFT_LOGITS = "diffusion.shift_logits"
 
     class xIELU:
-        XIELU_ALPHA_P = "xielu.alpha_p"
-        XIELU_ALPHA_N = "xielu.alpha_n"
-        XIELU_BETA = "xielu.beta"
-        XIELU_EPS = "xielu.eps"
+        ALPHA_P = "xielu.alpha_p"
+        ALPHA_N = "xielu.alpha_n"
+        BETA = "xielu.beta"
+        EPS = "xielu.eps"
 
 
 #
@@ -458,10 +458,6 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_SHEXP = auto()
     FFN_DOWN_SHEXP = auto()
     FFN_UP_SHEXP = auto()
-    FFN_ACT_ALPHA_N = auto()
-    FFN_ACT_ALPHA_P = auto()
-    FFN_ACT_BETA = auto()
-    FFN_ACT_EPS = auto()
     FFN_EXP_PROBS_B = auto()
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
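
Note that the serialized GGUF key strings are unchanged by this rename; only the Python attribute names drop the redundant XIELU_ prefix. A quick illustrative check, assuming the gguf package layout with Keys.xIELU nested as shown above:

from gguf.constants import Keys

# the metadata key written into the GGUF file is the same string as before
assert Keys.xIELU.ALPHA_P == "xielu.alpha_p"
# the old attribute name no longer exists after this commit:
# Keys.xIELU.XIELU_ALPHA_P  -> AttributeError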

gguf-py/gguf/gguf_writer.py

Lines changed: 8 additions & 8 deletions
@@ -1075,17 +1075,17 @@ def add_audio_num_mel_bins(self, value: int) -> None:
     def add_audio_stack_factor(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
 
-    def add_xielu_alpha_p(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_ALPHA_P, value)
+    def add_xielu_alpha_p(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.ALPHA_P, values)
 
-    def add_xielu_alpha_n(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_ALPHA_N, value)
+    def add_xielu_alpha_n(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.ALPHA_N, values)
 
-    def add_xielu_beta(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_BETA, value)
+    def add_xielu_beta(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.BETA, values)
 
-    def add_xielu_eps(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_EPS, value)
+    def add_xielu_eps(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.EPS, values)
 
     # diffusion models
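
For reference, a minimal sketch of how a conversion script might call these setters once the per-layer xIELU values have been gathered. The file name, architecture string, and values below are illustrative, not taken from this commit:

from gguf import GGUFWriter

# hypothetical per-layer values, one entry per transformer layer
alpha_n = [0.5, 0.5, 0.5]
alpha_p = [0.5, 0.5, 0.5]

writer = GGUFWriter("model.gguf", arch="apertus")
writer.add_xielu_alpha_n(alpha_n)  # stored as an array under "xielu.alpha_n" (Keys.xIELU.ALPHA_N)
writer.add_xielu_alpha_p(alpha_p)  # stored as an array under "xielu.alpha_p" (Keys.xIELU.ALPHA_P)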

gguf-py/gguf/tensor_mapping.py

Lines changed: 0 additions & 44 deletions
@@ -451,22 +451,6 @@ class TensorNameMap:
             "layers.{bid}.mlp.gate_proj", # qwen3-embedding
         ),
 
-        MODEL_TENSOR.FFN_ACT_ALPHA_N: (
-            "model.layers.{bid}.mlp.act_fn.alpha_n", # apertus xIELU
-        ),
-
-        MODEL_TENSOR.FFN_ACT_ALPHA_P: (
-            "model.layers.{bid}.mlp.act_fn.alpha_p", # apertus xIELU
-        ),
-
-        MODEL_TENSOR.FFN_ACT_BETA: (
-            "model.layers.{bid}.mlp.act_fn.beta", # apertus xIELU
-        ),
-
-        MODEL_TENSOR.FFN_ACT_EPS: (
-            "model.layers.{bid}.mlp.act_fn.eps", # apertus xIELU
-        ),
-
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
@@ -1491,34 +1475,6 @@ class TensorNameMap:
                 "model.layers.{bid}.post_attention_layernorm",
             ),
         },
-        MODEL_ARCH.APERTUS: {
-            MODEL_TENSOR.ATTN_NORM: (
-                "model.layers.{bid}.attention_layernorm",
-            ),
-            MODEL_TENSOR.ATTN_Q_NORM: (
-                "model.layers.{bid}.attention.query_layernorm",
-                "model.layers.{bid}.self_attn.q_norm",
-            ),
-            MODEL_TENSOR.ATTN_K_NORM: (
-                "model.layers.{bid}.attention.key_layernorm",
-                "model.layers.{bid}.self_attn.k_norm",
-            ),
-            MODEL_TENSOR.FFN_NORM: (
-                "model.layers.{bid}.feedforward_layernorm",
-            ),
-            MODEL_TENSOR.FFN_ACT_ALPHA_N: (
-                "model.layers.{bid}.mlp.act_fn.alpha_n",
-            ),
-            MODEL_TENSOR.FFN_ACT_ALPHA_P: (
-                "model.layers.{bid}.mlp.act_fn.alpha_p",
-            ),
-            MODEL_TENSOR.FFN_ACT_BETA: (
-                "model.layers.{bid}.mlp.act_fn.beta",
-            ),
-            MODEL_TENSOR.FFN_ACT_EPS: (
-                "model.layers.{bid}.mlp.act_fn.eps",
-            ),
-        },
     }
 
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
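
These Apertus-specific FFN_ACT_* mappings are removed because, per the other hunks in this commit, the xIELU parameters travel as GGUF metadata arrays (gguf_writer.py, llama-model.cpp) rather than as per-layer tensors. Below is a sketch of the accumulate-then-write pattern consistent with the convert_hf_to_gguf.py hunk above, assuming self._alpha_n is a dict initialized elsewhere and self.gguf_writer is the usual writer attribute; simplified to alpha_n only:

# illustrative sketch, not the full implementation
def modify_tensors(self, data_torch, name, bid):
    n_layers = self.hparams["num_hidden_layers"]
    if name.endswith(".act_fn.alpha_n"):
        self._alpha_n[bid] = data_torch.to("cpu").float().item()
        if len(self._alpha_n) == n_layers:
            # every layer seen: emit a single metadata array instead of per-layer tensors
            self.gguf_writer.add_xielu_alpha_n([self._alpha_n[i] for i in range(n_layers)])
        return []  # consumed as metadata, nothing written as a tensor
    return super().modify_tensors(data_torch, name, bid)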

src/llama-model.cpp

Lines changed: 13 additions & 14 deletions
@@ -513,10 +513,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
     std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
 
-    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0);
-    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0);
-    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0);
-    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0);
+    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
 
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
@@ -2014,10 +2014,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_APERTUS:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
 
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_8B; break;
@@ -5858,19 +5858,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
 
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
 
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
 
                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                    }
-                    else {
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    } else {
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
                     layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
