
Commit 64add82: "First attempt" (1 parent: a812838)

12 files changed: +551 -31 lines

convert_hf_to_gguf.py

Lines changed: 11 additions & 0 deletions
@@ -8566,6 +8566,17 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("ApertusForCausalLM")
+class ApertusModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.APERTUS
+
+    def modify_tensors(self, data_torch, name, bid):
+        # Handle xIELU activation parameters
+        if name.endswith(".act_fn.alpha_n") or name.endswith(".act_fn.alpha_p") or name.endswith(".act_fn.beta") or name.endswith(".act_fn.eps"):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 class MistralModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     model_name = "Mistral"
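Reviewer note: as a quick illustration of what the suffix filter above is meant to achieve, the sketch below maps a hypothetical Hugging Face tensor name for the per-layer xIELU scalars onto the GGUF names introduced in gguf-py/gguf/constants.py. The HF-side layout ("model.layers.{i}.mlp.act_fn.*") and the guess_gguf_name helper are assumptions for illustration only; in the converter the real mapping is performed by self.map_tensor_name().

# Illustrative sketch only (not part of this commit); the HF-side names are assumed.
XIELU_SUFFIXES = (".act_fn.alpha_n", ".act_fn.alpha_p", ".act_fn.beta", ".act_fn.eps")

def is_xielu_param(name: str) -> bool:
    # str.endswith accepts a tuple, equivalent to the chained checks in modify_tensors
    return name.endswith(XIELU_SUFFIXES)

def guess_gguf_name(name: str) -> str:
    # hypothetical stand-in for self.map_tensor_name():
    # "model.layers.3.mlp.act_fn.beta" -> "blk.3.ffn_act_beta"
    bid = name.split(".")[2]
    suffix = name.rsplit(".", 1)[-1]
    return f"blk.{bid}.ffn_act_{suffix}"

assert is_xielu_param("model.layers.3.mlp.act_fn.beta")
print(guess_gguf_name("model.layers.3.mlp.act_fn.beta"))  # blk.3.ffn_act_beta

Since str.endswith accepts a tuple of suffixes, the four chained endswith calls in modify_tensors could also be collapsed into a single check.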

ggml/include/ggml.h

Lines changed: 11 additions & 0 deletions
@@ -554,6 +554,7 @@ extern "C" {
         GGML_OP_OPT_STEP_SGD,
 
         GGML_OP_GLU,
+        GGML_OP_XIELU,
 
         GGML_OP_COUNT,
     };
@@ -1148,6 +1149,16 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // xIELU activation function:
+    // f(x) = alpha_p*x^2 + beta*x if x > 0, else alpha_n*(expm1(min(x, eps)) - x) + beta*x
+    GGML_API struct ggml_tensor * ggml_xielu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float alpha_n,
+            float alpha_p,
+            float beta,
+            float eps);
+
     // gated linear unit ops
     // A: n columns, r rows,
     // result is n / 2 columns, r rows,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 5 additions & 0 deletions
@@ -1971,6 +1971,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             {
                 ggml_compute_forward_unary(params, tensor);
             } break;
+        case GGML_OP_XIELU:
+            {
+                ggml_compute_forward_xielu(params, tensor);
+            } break;
         case GGML_OP_GLU:
             {
                 ggml_compute_forward_glu(params, tensor);
@@ -2160,6 +2164,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_REPEAT:
         case GGML_OP_REPEAT_BACK:
         case GGML_OP_LEAKY_RELU:
+        case GGML_OP_XIELU:
             {
                 n_tasks = 1;
             } break;

ggml/src/ggml-cpu/unary-ops.cpp

Lines changed: 61 additions & 28 deletions
@@ -52,6 +52,15 @@ static inline float op_sqrt(float x) {
     return sqrtf(x);
 }
 
+static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
+    if (x > 0.0f) {
+        return alpha_p * x * x + beta * x;
+    } else {
+        const float min_x_eps = fminf(x, eps);
+        return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
+    }
+}
+
 static inline float op_sin(float x) {
     return sinf(x);
 }
@@ -64,8 +73,8 @@ static inline float op_log(float x) {
     return logf(x);
 }
 
-template <float (*op)(float), typename src0_t, typename dst_t>
-static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
+template <typename Op, typename src0_t, typename dst_t>
+static inline void vec_unary_op(const Op& op, int64_t n, dst_t * y, const src0_t * x) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
     constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
 
@@ -74,8 +83,8 @@ static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
     }
 }
 
-template <float (*op)(float), typename src0_t, typename dst_t>
-static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+template <typename Op, typename src0_t, typename dst_t>
+static void apply_unary_op(const Op& op, const ggml_compute_params * params, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
@@ -95,25 +104,25 @@ static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst
         dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
         const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-        vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
+        vec_unary_op<decltype(op), src0_t, dst_t>(op, ne0, dst_ptr, src0_ptr);
     }
 }
 
 // TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
-template <float (*op)(float)>
-static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+template <typename Op>
+static void unary_op(const Op& op, const ggml_compute_params * params, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
 
     /*  */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
-        apply_unary_op<op, float, float>(params, dst);
+        apply_unary_op<decltype(op), float, float>(op, params, dst);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
-        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+        apply_unary_op<decltype(op), ggml_fp16_t, ggml_fp16_t>(op, params, dst);
    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
-        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+        apply_unary_op<decltype(op), ggml_bf16_t, ggml_bf16_t>(op, params, dst);
     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
-        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+        apply_unary_op<decltype(op), ggml_bf16_t, float>(op, params, dst);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+        apply_unary_op<decltype(op), ggml_fp16_t, float>(op, params, dst);
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type));
@@ -122,65 +131,89 @@ static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
 }
 
 void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_abs>(params, dst);
+    unary_op(op_abs, params, dst);
 }
 
 void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sgn>(params, dst);
+    unary_op(op_sgn, params, dst);
 }
 
 void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_neg>(params, dst);
+    unary_op(op_neg, params, dst);
 }
 
 void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_step>(params, dst);
+    unary_op(op_step, params, dst);
 }
 
 void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_tanh>(params, dst);
+    unary_op(op_tanh, params, dst);
 }
 
 void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_elu>(params, dst);
+    unary_op(op_elu, params, dst);
 }
 
 void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_relu>(params, dst);
+    unary_op(op_relu, params, dst);
 }
 
 void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sigmoid>(params, dst);
+    unary_op(op_sigmoid, params, dst);
 }
 
 void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_hardsigmoid>(params, dst);
+    unary_op(op_hardsigmoid, params, dst);
 }
 
 void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_exp>(params, dst);
+    unary_op(op_exp, params, dst);
 }
 
 void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_hardswish>(params, dst);
+    unary_op(op_hardswish, params, dst);
 }
 
 void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sqr>(params, dst);
+    unary_op(op_sqr, params, dst);
 }
 
 void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sqrt>(params, dst);
+    unary_op(op_sqrt, params, dst);
 }
 
 void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_sin>(params, dst);
+    unary_op(op_sin, params, dst);
 }
 
 void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_cos>(params, dst);
+    unary_op(op_cos, params, dst);
 }
 
 void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op<op_log>(params, dst);
+    unary_op(op_log, params, dst);
+}
+
+static float softplus(float input, float beta=1.0f, float threshold=20.0f) {
+    if (input * beta > threshold) return input;
+    return (1/beta) * logf(1 + expf(beta * input));
 }
+
+void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
+    // Get the xIELU parameters from the operation
+    const float * op_params = (const float*)dst->op_params;
+    float alpha_n = op_params[0];
+    float alpha_p = op_params[1];
+    const float beta = op_params[2];
+    const float eps = op_params[3];
+
+    // alpha_p = softplus(alpha_p);
+    // alpha_n = beta + softplus(alpha_n);
+
+    const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
+        return op_xielu(f, alpha_n, alpha_p, beta, eps);
    };
+
+    unary_op(xielu_op_params, params, dst);
+}
+
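Reviewer note: a small float32 reference for op_xielu makes it easy to spot-check the CPU kernel against values from the original model. This is a sketch only, and it assumes the raw alpha_n/alpha_p scalars are applied as-is, matching the current kernel where the softplus reparameterization is still commented out.

# Sketch: NumPy reference for op_xielu, independent of ggml.
import numpy as np

def xielu_ref(x, alpha_n, alpha_p, beta, eps):
    x = np.asarray(x, dtype=np.float32)
    pos = alpha_p * x * x + beta * x                                # x > 0 branch
    neg = (np.expm1(np.minimum(x, eps)) - x) * alpha_n + beta * x   # x <= 0 branch
    return np.where(x > 0.0, pos, neg).astype(np.float32)

# example values only; the real per-layer scalars come from the model tensors
xs = np.linspace(-4.0, 4.0, 9, dtype=np.float32)
print(xielu_ref(xs, alpha_n=0.8, alpha_p=0.8, beta=0.5, eps=-1e-6))

If the commented-out reparameterization (alpha_p = softplus(alpha_p), alpha_n = beta + softplus(alpha_n)) is enabled later, the reference would need the same transform to stay comparable.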

ggml/src/ggml-cpu/unary-ops.h

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }

ggml/src/ggml.c

Lines changed: 24 additions & 3 deletions
@@ -1017,9 +1017,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_SGD",
 
     "GLU",
+    "XIELU",
 };
 
-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1121,9 +1122,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "sgd(x)",
 
     "glu(x)",
+    "xielu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1147,7 +1149,6 @@
 
 static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
 
-
 static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
     "REGLU",
     "GEGLU",
@@ -2646,6 +2647,26 @@ struct ggml_tensor * ggml_silu(
     return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
 }
 
+// ggml_xielu
+struct ggml_tensor * ggml_xielu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float alpha_n,
+        float alpha_p,
+        float beta,
+        float eps) {
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    // Store the parameters as operation parameters
+    float params[] = { alpha_n, alpha_p, beta, eps };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_XIELU;
+    result->src[0] = a;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_silu_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
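Reviewer note: ggml_set_op_params copies the four scalars into the tensor's op_params blob, and ggml_compute_forward_xielu reads them back by index, so the ordering (alpha_n, alpha_p, beta, eps) is the contract between the two sides. A byte-level sketch of that layout follows; the values are made up and this is plain struct packing, not ggml API usage.

import struct

alpha_n, alpha_p, beta, eps = 0.8, 0.8, 0.5, -1e-6       # example values only
blob = struct.pack("4f", alpha_n, alpha_p, beta, eps)     # 16 bytes of float32 in native order
print(struct.unpack("4f", blob))                          # indices 0..3 as read by the CPU kernel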

gguf-py/gguf/constants.py

Lines changed: 36 additions & 0 deletions
@@ -391,6 +391,7 @@ class MODEL_ARCH(IntEnum):
     SMALLTHINKER     = auto()
     LLADA            = auto()
     SEED_OSS         = auto()
+    APERTUS          = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -440,6 +441,10 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_SHEXP       = auto()
     FFN_DOWN_SHEXP       = auto()
     FFN_UP_SHEXP         = auto()
+    FFN_ACT_ALPHA_N      = auto()
+    FFN_ACT_ALPHA_P      = auto()
+    FFN_ACT_BETA         = auto()
+    FFN_ACT_EPS          = auto()
     FFN_EXP_PROBS_B      = auto()
     ATTN_Q_NORM          = auto()
     ATTN_K_NORM          = auto()
@@ -727,6 +732,7 @@
     MODEL_ARCH.SMALLTHINKER:   "smallthinker",
     MODEL_ARCH.LLADA:          "llada",
     MODEL_ARCH.SEED_OSS:       "seed_oss",
+    MODEL_ARCH.APERTUS:        "apertus",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -773,12 +779,20 @@
     MODEL_TENSOR.FFN_GATE_SHEXP:          "blk.{bid}.ffn_gate_shexp",
     MODEL_TENSOR.FFN_DOWN_SHEXP:          "blk.{bid}.ffn_down_shexp",
     MODEL_TENSOR.FFN_UP_SHEXP:            "blk.{bid}.ffn_up_shexp",
+    MODEL_TENSOR.FFN_ACT_ALPHA_N:         "blk.{bid}.ffn_act_alpha_n",
+    MODEL_TENSOR.FFN_ACT_ALPHA_P:         "blk.{bid}.ffn_act_alpha_p",
+    MODEL_TENSOR.FFN_ACT_BETA:            "blk.{bid}.ffn_act_beta",
+    MODEL_TENSOR.FFN_ACT_EPS:             "blk.{bid}.ffn_act_eps",
     MODEL_TENSOR.FFN_ACT:                 "blk.{bid}.ffn",
     MODEL_TENSOR.FFN_NORM_EXP:            "blk.{bid}.ffn_norm_exps",
     MODEL_TENSOR.FFN_GATE_EXP:            "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP:            "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP:              "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.FFN_EXP_PROBS_B:         "blk.{bid}.exp_probs_b",
+    MODEL_TENSOR.FFN_ACT_ALPHA_N:         "blk.{bid}.ffn_act_alpha_n",
+    MODEL_TENSOR.FFN_ACT_ALPHA_P:         "blk.{bid}.ffn_act_alpha_p",
+    MODEL_TENSOR.FFN_ACT_BETA:            "blk.{bid}.ffn_act_beta",
+    MODEL_TENSOR.FFN_ACT_EPS:             "blk.{bid}.ffn_act_eps",
     MODEL_TENSOR.LAYER_OUT_NORM:          "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.PER_LAYER_TOKEN_EMBD:    "per_layer_token_embd",    # gemma3n
     MODEL_TENSOR.PER_LAYER_MODEL_PROJ:    "per_layer_model_proj",    # gemma3n
@@ -2683,6 +2697,28 @@
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.APERTUS: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_ACT_ALPHA_N,
+        MODEL_TENSOR.FFN_ACT_ALPHA_P,
+        MODEL_TENSOR.FFN_ACT_BETA,
+        MODEL_TENSOR.FFN_ACT_EPS,
+    ],
     # TODO
 }
 
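Reviewer note: with the entries above, each Apertus block gets four extra scalar tensors in the GGUF. A small sketch of the resulting names, assuming the usual gguf-py layout where TENSOR_NAMES maps MODEL_TENSOR members to "blk.{bid}.*" format strings:

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

for t in (MODEL_TENSOR.FFN_ACT_ALPHA_N, MODEL_TENSOR.FFN_ACT_ALPHA_P,
          MODEL_TENSOR.FFN_ACT_BETA, MODEL_TENSOR.FFN_ACT_EPS):
    print(TENSOR_NAMES[t].format(bid=0))
# expected: blk.0.ffn_act_alpha_n, blk.0.ffn_act_alpha_p, blk.0.ffn_act_beta, blk.0.ffn_act_eps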
