@@ -3423,8 +3423,8 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
 }

 template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t * buft_list, const F & fn) {
-    for (const auto & cur : *buft_list) {
+static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
         if (buft_supported(cur_buft, cur_dev, fn)) {
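
Note on the new signature: select_buft now takes the buffer-type list by reference, and fn is any callable that receives a ggml_context * and returns the tensor (or op) whose support should be probed on each candidate device. A minimal hypothetical caller, sketched on the assumption that the model's cpu_buft_list is populated (the shape and names below are illustrative, not part of this change):

    ggml_backend_buffer_type_t buft = select_buft(model.cpu_buft_list,
        [&](ggml_context * ctx) {
            // any representative tensor works; support for the op built on it decides the match
            return ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
        });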
@@ -3499,7 +3499,7 @@ static bool llama_kv_cache_init(
         } else {
             buft_list = &model.cpu_buft_list;
         }
-        ggml_backend_buffer_type_t buft = select_buft(buft_list,
+        ggml_backend_buffer_type_t buft = select_buft(*buft_list,
             [&](ggml_context * ctx) {
                 ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
                 if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
@@ -6955,7 +6955,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 }

-//////// TODO: move elsewhere, maybe
 enum llm_tensor_layer {
     LLM_TENSOR_LAYER_INPUT,
     LLM_TENSOR_LAYER_REPEATING,
@@ -7093,7 +7092,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
 };

 // checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);

     if (op == GGML_OP_NONE) {
@@ -7125,7 +7124,7 @@ static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buff
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                int n_expert_used = 2; // TODO: from model
+                int n_expert_used = hparams.n_expert_used;
                 ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                 ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                 op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
@@ -7147,8 +7146,8 @@ static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buff
             } break;
         case GGML_OP_ROPE:
             {
-                int n_embd_head = 64; // TODO: from model
-                int n_head = 16;
+                int n_embd_head = hparams.n_embd_head_v;
+                int n_head = hparams.n_head();
                 ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                 ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                 op_tensor = ggml_rope_ext(
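
For context on what these switch cases feed into: weight_buft_supported builds a throwaway op that consumes the weight inside a metadata-only ggml context and then asks the device whether it can run that op. A rough sketch of that probe, using the GGML_OP_MUL_MAT case and hypothetical shapes (the surrounding code is outside these hunks, so treat this as an approximation):

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, no tensor data is allocated
    };
    ggml_context * ctx = ggml_init(params);
    ggml_tensor * b         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); // dummy activation
    ggml_tensor * op_tensor = ggml_mul_mat(ctx, w, b);                               // op that uses the weight
    bool supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_free(ctx);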
@@ -7190,12 +7189,12 @@ static bool weight_buft_supported(ggml_tensor * w, ggml_op op, ggml_backend_buff
 }

 // find the first buffer type in the list that can use the tensor
-static ggml_backend_buffer_type_t select_weight_buft(ggml_tensor * tensor, ggml_op op, llama_model::buft_list_t * buft_list) {
-    GGML_ASSERT(!buft_list->empty());
-    for (auto & cur : *buft_list) {
+static ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) {
+    GGML_ASSERT(!buft_list.empty());
+    for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(tensor, op, cur_buft, cur_dev)) {
+        if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
             return cur_buft;
         }
     }
@@ -7420,8 +7419,6 @@ static bool llm_load_tensors(
         ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
         ggml_backend_buffer_type_t first_moved_to_buft = nullptr;

-
-        constexpr auto * func = __func__;
         auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
             ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());

@@ -7482,7 +7479,7 @@ static bool llm_load_tensors(
                     GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
             }

-            ggml_backend_buffer_type_t buft = select_weight_buft(t_meta, op, buft_list);
+            ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
             if (!buft) {
                 throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
             }
@@ -7512,8 +7509,7 @@ static bool llm_load_tensors(
                     return t;
                 }
             }
-            ggml_tensor * t = ml.create_tensor(ctx, tn, ne, flags);
-            return t;
+            return ml.create_tensor(ctx, tn, ne, flags);
         };

         model.layers.resize(n_layer);
@@ -9064,11 +9060,10 @@ static bool llm_load_tensors(
         }

         if (n_moved_tensors > 0) {
-            LLAMA_LOG_WARN("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
-                func, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
-                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
         }
-
     }

     ml.done_getting_tensors();
@@ -9146,7 +9141,7 @@ static bool llm_load_tensors(

         for (auto & buf : bufs) {
             // indicate that this buffer contains weights
-            // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
+            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }

@@ -19517,7 +19512,7 @@ struct llama_context * llama_new_context_with_model(
     GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

     if (!hparams.vocab_only) {
-        // initialize backends
+        // GPU backends
         for (auto * dev : model->devices) {
             ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
             if (backend == nullptr) {
@@ -19528,7 +19523,7 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }

-        // add other backends (such as BLAS)
+        // add ACCEL backends (such as BLAS)
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
@@ -19542,6 +19537,7 @@ struct llama_context * llama_new_context_with_model(
             }
         }

+        // add CPU backend
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -19638,11 +19634,6 @@ struct llama_context * llama_new_context_with_model(
                     continue;
                 }
                 auto * dev = ggml_backend_get_device(backend);
-                if (!dev) {
-                    // backend is using old interface, not supported
-                    pipeline_parallel = false;
-                    break;
-                }
                 ggml_backend_dev_props props;
                 ggml_backend_dev_get_props(dev, &props);
                 if (!props.caps.async || !props.caps.events) {
@@ -19667,17 +19658,19 @@ struct llama_context * llama_new_context_with_model(
         llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
         ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);

+        // reserve pp graph first so that buffers are only allocated once
         ggml_backend_sched_reserve(ctx->sched, gf_pp);
         int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched);
         int n_nodes_pp = ggml_graph_n_nodes(gf_pp);

+        // reserve with tg graph to get the number of splits and nodes
         llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
         ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
         ggml_backend_sched_reserve(ctx->sched, gf_tg);
         int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched);
         int n_nodes_tg = ggml_graph_n_nodes(gf_tg);

-        // restore
+        // reserve again with pp graph to avoid ggml-alloc reallocations during inference
         gf_pp = llama_build_graph(*ctx, ubatch_pp, false);
         if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) {
             LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
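
Rationale for the reserve order above: the prompt-processing (pp) graph is the larger of the two, so reserving it first sizes the scheduler's compute buffers once; the token-generation (tg) reserve in between is only used to read back split and node counts, and the final pp reserve leaves the scheduler in the state inference will actually run with, so ggml-alloc does not need to grow its buffers later. Schematically (same variables as in the hunk above, measurements and return values omitted):

    ggml_backend_sched_reserve(ctx->sched, gf_pp); // largest graph: buffers get their final size here
    ggml_backend_sched_reserve(ctx->sched, gf_tg); // smaller graph: fits in the same buffers, only measured
    ggml_backend_sched_reserve(ctx->sched, gf_pp); // back to pp: matches what inference will submit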
@@ -19989,7 +19982,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = select_buft(model.dev_layer.at(il).buft_list,
+        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
             [&](ggml_context * ctx) {
                 ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                 ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);