
Commit 1aed3d7

Merge remote-tracking branch 'origin/master' into qwen3_next

2 parents: fbe0e22 + dd62dcf

File tree

6 files changed: 317 additions, 5 deletions

ggml/include/ggml.h

Lines changed: 59 additions & 0 deletions
@@ -1258,6 +1258,58 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+
+    // xIELU activation function
+    // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
+    // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
+    // that constrain the positive and negative source alpha values respectively
+    GGML_API struct ggml_tensor * ggml_xielu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 alpha_n,
+            float                 alpha_p,
+            float                 beta,
+            float                 eps);
+
     // gated linear unit ops
     // A: n columns, r rows,
     // result is n / 2 columns, r rows,

@@ -1732,6 +1784,13 @@ extern "C" {
            float                 scale,
            float                 max_bias);

+    GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale,
+            float                 max_bias);
+
     GGML_API void ggml_soft_max_add_sinks(
             struct ggml_tensor * a,
             struct ggml_tensor * sinks);
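
The first hunk adds element-wise rounding ops to the public C API. A minimal usage sketch follows, assuming the kernels behave like floorf/ceilf/roundf/truncf as the ggml_trunc comment suggests; the expected values in the comments are illustrative, and graph execution on a backend is omitted:

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
    ((float *) x->data)[0] =  3.7f;
    ((float *) x->data)[1] = -2.9f;

    // ops introduced by this commit; the *_inplace variants reuse x's buffer
    struct ggml_tensor * f = ggml_floor(ctx, x);   // expected: 3, -3
    struct ggml_tensor * c = ggml_ceil (ctx, x);   // expected: 4, -2
    struct ggml_tensor * r = ggml_round(ctx, x);   // expected: 4, -3
    struct ggml_tensor * t = ggml_trunc(ctx, x);   // expected: 3, -2 (towards zero)

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, f);
    ggml_build_forward_expand(gf, c);
    ggml_build_forward_expand(gf, r);
    ggml_build_forward_expand(gf, t);
    // computing gf on the CPU/CUDA backend is left out of this sketch

    ggml_free(ctx);
    return 0;
}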

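The second hunk declares an in-place variant of the existing ggml_soft_max_ext. A hedged usage sketch; kq, kq_mask and n_embd_head are hypothetical attention-score, mask and head-size names, not part of this diff:

#include <math.h>
#include "ggml.h"

// Applies masked, scaled softmax to attention scores, writing the result back
// into kq's buffer instead of allocating a new tensor.
static struct ggml_tensor * attn_softmax_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,
        struct ggml_tensor  * kq_mask,
        int                   n_embd_head) {
    return ggml_soft_max_ext_inplace(ctx, kq, kq_mask,
            1.0f / sqrtf((float) n_embd_head),  // scale, as commonly used for attention
            0.0f);                              // max_bias = 0 -> no ALiBi slopes
}
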
ggml/src/ggml-cpu/ops.cpp

Lines changed: 20 additions & 0 deletions
@@ -9104,6 +9104,26 @@ void ggml_compute_forward_unary(
            {
                ggml_compute_forward_trunc(params, dst);
            } break;
+        case GGML_UNARY_OP_XIELU:
+            {
+                ggml_compute_forward_xielu(params, dst);
+            } break;
+        case GGML_UNARY_OP_FLOOR:
+            {
+                ggml_compute_forward_floor(params, dst);
+            } break;
+        case GGML_UNARY_OP_CEIL:
+            {
+                ggml_compute_forward_ceil(params, dst);
+            } break;
+        case GGML_UNARY_OP_ROUND:
+            {
+                ggml_compute_forward_round(params, dst);
+            } break;
+        case GGML_UNARY_OP_TRUNC:
+            {
+                ggml_compute_forward_trunc(params, dst);
+            } break;
        case GGML_UNARY_OP_EXPM1:
            {
                ggml_compute_forward_expm1(params, dst);
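
The new cases above route each unary op to its element-wise forward kernel. As a rough orientation only (a hedged sketch, not the ops.cpp implementation): for a contiguous F32 tensor each of these presumably reduces to a per-element loop, with ceilf/roundf/truncf swapped in for the other ops.

#include <math.h>
#include <stdint.h>

static void forward_floor_f32_ref(const float * src, float * dst, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = floorf(src[i]);
    }
}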

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 3 additions & 0 deletions
@@ -2343,6 +2343,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_UNARY_OP_ELU:
            ggml_cuda_op_elu(ctx, dst);
            break;
+        case GGML_UNARY_OP_XIELU:
+            ggml_cuda_op_xielu(ctx, dst);
+            break;
        case GGML_UNARY_OP_EXPM1:
            ggml_cuda_op_expm1(ctx, dst);
            break;

ggml/src/ggml.c

Lines changed: 2 additions & 2 deletions
@@ -2716,8 +2716,8 @@ struct ggml_tensor * ggml_xielu(
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

    ggml_set_op_params_i32(result, 0, (int32_t) GGML_UNARY_OP_XIELU);
-    ggml_set_op_params_f32(result, 1, beta + ggml_compute_softplus_f32(alpha_n));
-    ggml_set_op_params_f32(result, 2, ggml_compute_softplus_f32(alpha_p));
+    ggml_set_op_params_f32(result, 1, beta + ggml_softplus(alpha_n));
+    ggml_set_op_params_f32(result, 2, ggml_softplus(alpha_p));
    ggml_set_op_params_f32(result, 3, beta);
    ggml_set_op_params_f32(result, 4, eps);
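
The only change here is the helper rename from ggml_compute_softplus_f32 to ggml_softplus when packing the xIELU operator parameters. For reference, softplus(x) = log(1 + exp(x)); the sketch below is an illustrative version (the overflow guard is an assumption, not the ggml implementation), followed by the parameter layout exactly as written by the hunk above:

#include <math.h>

static float softplus_ref(float x) {
    // guard large x so expf does not overflow; softplus(x) ~= x for large x
    return x > 20.0f ? x : log1pf(expf(x));
}

// op params packed by ggml_xielu in this diff:
//   i32 [0] = GGML_UNARY_OP_XIELU
//   f32 [1] = beta + softplus(alpha_n)
//   f32 [2] = softplus(alpha_p)
//   f32 [3] = beta
//   f32 [4] = eps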

src/llama-model.cpp

Lines changed: 233 additions & 0 deletions
@@ -6230,6 +6230,95 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
                    }
                } break;
+            case LLM_ARCH_GROVEMOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        // MoE branch
+                        const int64_t n_ff_exp       = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                        const int64_t n_ff_chexp     = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {    n_embd, n_ff_chexp, n_chunk_expert}, 0);
+                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,     n_embd, n_chunk_expert}, 0);
+                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {    n_embd, n_ff_chexp, n_chunk_expert}, 0);
+                    }
+                } break;
+            case LLM_ARCH_APERTUS:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        } else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd },     TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, 0);
+
+                        // Q and K layernorms for Apertus
+                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -17404,6 +17493,150 @@ struct llm_build_bailingmoe2 : public llm_graph_context {
    }
};

+struct llm_build_bailingmoe2 : public llm_graph_context {
+    llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                cur = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
+            cb(sa_out, "sa_out", il);
+
+            // MoE branch
+            cur = build_norm(sa_out,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                ggml_tensor * moe_out =
+                    build_moe_ffn(cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            true, hparams.expert_weights_scale,
+                            (llama_expert_gating_func_type) hparams.expert_gating_func,
+                            il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
struct llm_build_dots1 : public llm_graph_context {
    llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
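
In the bailingmoe2 builder above, Q, K and V are not separate projections: they are byte-offset views into the single fused wqkv matmul output. A hedged sketch of the Q view only (the names mirror the diff; the helper function itself is illustrative and not part of the commit):

#include "ggml.h"

// Returns the Q slice of a fused QKV activation laid out per token as
// [Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa)], stored as F32.
static struct ggml_tensor * fused_qkv_q_view(
        struct ggml_context * ctx, struct ggml_tensor * qkv,
        int64_t n_embd_head, int64_t n_head, int64_t n_tokens) {
    return ggml_view_3d(ctx, qkv,
            n_embd_head, n_head, n_tokens,
            n_embd_head*sizeof(float),   // nb1: stride from one head to the next
            qkv->nb[1],                  // nb2: stride from one token to the next
            0);                          // Q starts at byte offset 0; per the diff, K and V
                                         // start at sizeof(float)*n_embd and
                                         // sizeof(float)*(n_embd + n_embd_gqa)
}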

tools/main/main.cpp

Lines changed: 0 additions & 3 deletions
@@ -717,9 +717,6 @@ int main(int argc, char ** argv) {

            embd.push_back(id);

-            // Print cache statistics after each token generation
-            token_count++;
-
            if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) {
                assistant_ss << common_token_to_piece(ctx, id, false);
            }
