@@ -5312,6 +5312,11 @@ struct llm_build_bert : public llm_graph_context {
53125312 cur = build_lora_mm(model.layers[il].wqkv, cur);
53135313 cb(cur, "wqkv", il);
53145314
5315+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5316+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5317+ cb(cur, "bqkv", il);
5318+ }
5319+
53155320 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
53165321 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
53175322 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5364,29 +5369,46 @@ struct llm_build_bert : public llm_graph_context {
53645369 cb(ffn_inp, "ffn_inp", il);
53655370
53665371 // feed-forward network
5367- if (model.arch == LLM_ARCH_BERT) {
5372+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
5373+ // MoE branch
5374+ cur = build_moe_ffn(cur,
5375+ model.layers[il].ffn_gate_inp,
5376+ model.layers[il].ffn_up_exps,
5377+ nullptr,
5378+ model.layers[il].ffn_down_exps,
5379+ nullptr,
5380+ hparams.n_expert,
5381+ hparams.n_expert_used,
5382+ LLM_FFN_GELU,
5383+ true, false,
5384+ 0.0f,
5385+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
5386+ cb(cur, "ffn_moe_out", il);
5387+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
53685388 cur = build_ffn(cur,
53695389 model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
53705390 NULL, NULL, NULL,
53715391 model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
53725392 NULL,
53735393 LLM_FFN_GELU, LLM_FFN_SEQ, il);
5394+ cb(cur, "ffn_out", il);
53745395 } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
53755396 cur = build_ffn(cur,
53765397 model.layers[il].ffn_up, NULL, NULL,
53775398 model.layers[il].ffn_gate, NULL, NULL,
53785399 model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
53795400 NULL,
53805401 LLM_FFN_GELU, LLM_FFN_PAR, il);
5402+ cb(cur, "ffn_out", il);
53815403 } else {
53825404 cur = build_ffn(cur,
53835405 model.layers[il].ffn_up, NULL, NULL,
53845406 model.layers[il].ffn_gate, NULL, NULL,
53855407 model.layers[il].ffn_down, NULL, NULL,
53865408 NULL,
53875409 LLM_FFN_SILU, LLM_FFN_PAR, il);
5410+ cb(cur, "ffn_out", il);
53885411 }
5389- cb(cur, "ffn_out", il);
53905412
53915413 // attentions bypass the intermediate layer
53925414 cur = ggml_add(ctx0, cur, ffn_inp);