@@ -127,7 +127,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
     llama_model_loader & ml;
     llama_model & model;
 
-    ggml_tensor * create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0);
+    ggml_tensor * create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0,
+                                ggml_context ** actual_ctx = nullptr);
 
     void create_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias);
     void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true);
@@ -198,7 +199,8 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
     }
 }
 
-ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags) {
+ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
+        int flags, ggml_context ** actual_context) {
     if (ml.tensor_buft_overrides) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
@@ -209,6 +211,7 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
             }
         }
     }
+    if (actual_context) *actual_context = ctx;
     return ml.create_tensor(ctx, name, ne, flags);
 }
 
@@ -2311,21 +2314,36 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
 
         layer.attn_sinks = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
 
+        ggml_context *ctx_ffn_gate, *ctx_ffn_up, *ctx_ffn_down;
         layer.ffn_gate_inp  = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
-        layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-        layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-        layer.ffn_up_exps   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_gate);
+        layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0, &ctx_ffn_down);
+        layer.ffn_up_exps   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_up);
 
         // bias
         layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_head    * n_rot}, 0);
         layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_head_kv * n_rot}, 0);
         layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_head_kv * n_rot}, 0);
         layer.bo = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
 
+        ggml_context *ctx_ffn_gate_b, *ctx_ffn_up_b, *ctx_ffn_down_b;
         layer.ffn_gate_inp_b  = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
-        layer.ffn_gate_exps_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
-        layer.ffn_down_exps_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
-        layer.ffn_up_exps_b   = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
+        layer.ffn_gate_exps_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_gate_b);
+        layer.ffn_down_exps_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0, &ctx_ffn_down_b);
+        layer.ffn_up_exps_b   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_up_b);
+
+        if (ctx_ffn_gate_b != ctx_ffn_gate) {
+            layer.ffn_gate_exps_b_dup = create_tensor(ctx_ffn_gate, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert},
+                    llama_model_loader::TENSOR_DUPLICATED);
+        }
+        if (ctx_ffn_up_b != ctx_ffn_up) {
+            layer.ffn_up_exps_b_dup = create_tensor(ctx_ffn_up, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert},
+                    llama_model_loader::TENSOR_DUPLICATED);
+        }
+        if (ctx_ffn_down_b != ctx_ffn_down) {
+            layer.ffn_down_exps_b_dup = create_tensor(ctx_ffn_down, tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert},
+                    llama_model_loader::TENSOR_DUPLICATED);
+        }
     }
     return use_mmap_buffer;
 }
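For context, a minimal usage sketch of the new `actual_ctx` out-parameter (a hypothetical stand-alone example mirroring the pattern in create_openai_moe_tensors above, not part of the patch): create_tensor() reports which ggml_context the tensor actually landed in after tensor_buft_overrides are applied, and the caller loads a duplicated copy of the expert bias into the weights' context when the two differ.

    // Hedged sketch: detect when buffer-type overrides place the expert bias
    // in a different context than the expert weights, and keep an extra copy
    // of the bias next to the weights.
    ggml_context * ctx_up   = nullptr;
    ggml_context * ctx_up_b = nullptr;

    layer.ffn_up_exps   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),
                                        {n_embd, n_ff_exp, n_expert}, 0, &ctx_up);
    layer.ffn_up_exps_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i),
                                        {n_ff_exp, n_expert}, 0, &ctx_up_b);

    if (ctx_up_b != ctx_up) {
        // load the bias a second time into the weights' context; TENSOR_DUPLICATED
        // marks it as an intentional extra copy of the same data
        layer.ffn_up_exps_b_dup = create_tensor(ctx_up, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i),
                                                {n_ff_exp, n_expert},
                                                llama_model_loader::TENSOR_DUPLICATED);
    }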