101101#endif
102102
103103// bump if necessary
104- #define LLAMA_MAX_NODES 8192
105104#define LLAMA_MAX_LAYERS 512
106105#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
107106
@@ -3567,6 +3566,15 @@ namespace GGUFMeta {
35673566
35683567using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
35693568
3569+ // Returns the maximum number of ggml graph nodes to allocate for this model; replaces the removed LLAMA_MAX_NODES macro so the limit can become model-dependent (e.g. very deep models may need more). Currently a fixed 8192 for all models. TODO: update when needed or think of some clever automatic way to do this
3570+ static size_t llama_model_max_nodes(const llama_model & /*model*/) {
3571+ //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
3572+ // return 32768;
3573+ //}
3574+ 
3575+ return 8192;
3576+ }
3577+
35703578struct llama_model_loader {
35713579 int n_kv = 0;
35723580 int n_tensors = 0;
@@ -8396,7 +8404,7 @@ struct llm_build_context {
83968404 }
83978405
83988406 struct ggml_cgraph * build_k_shift() {
8399- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
8407+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
84008408
84018409 GGML_ASSERT(kv_self.size == n_ctx);
84028410
@@ -8427,7 +8435,7 @@ struct llm_build_context {
84278435 }
84288436
84298437 struct ggml_cgraph * build_s_copy() {
8430- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
8438+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
84318439
84328440 GGML_ASSERT(kv_self.recurrent);
84338441
@@ -8450,7 +8458,7 @@ struct llm_build_context {
84508458 }
84518459
84528460 struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
8453- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
8461+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
84548462
84558463 for (uint32_t i = 0; i < ids.size(); ++i) {
84568464 const uint32_t id = ids[i];
@@ -8691,7 +8699,7 @@ struct llm_build_context {
86918699 }
86928700
86938701 struct ggml_cgraph * build_llama() {
8694- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
8702+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
86958703
86968704 // mutable variable, needed during the last layer of the computation to skip unused tokens
86978705 int32_t n_tokens = this->n_tokens;
@@ -8834,7 +8842,7 @@ struct llm_build_context {
88348842 }
88358843
88368844 struct ggml_cgraph * build_baichuan() {
8837- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
8845+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
88388846
88398847 const int64_t n_embd_head = hparams.n_embd_head_v;
88408848 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8949,7 +8957,7 @@ struct llm_build_context {
89498957 }
89508958
89518959 struct ggml_cgraph * build_xverse() {
8952- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
8960+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
89538961
89548962 const int64_t n_embd_head = hparams.n_embd_head_v;
89558963 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9052,7 +9060,7 @@ struct llm_build_context {
90529060 }
90539061
90549062 struct ggml_cgraph * build_falcon() {
9055- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9063+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
90569064
90579065 const int64_t n_embd_head = hparams.n_embd_head_v;
90589066 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9172,7 +9180,7 @@ struct llm_build_context {
91729180 }
91739181
91749182 struct ggml_cgraph * build_grok() {
9175- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9183+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
91769184
91779185 // mutable variable, needed during the last layer of the computation to skip unused tokens
91789186 int32_t n_tokens = this->n_tokens;
@@ -9329,7 +9337,7 @@ struct llm_build_context {
93299337 }
93309338
93319339 struct ggml_cgraph * build_dbrx() {
9332- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9340+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
93339341
93349342 // mutable variable, needed during the last layer of the computation to skip unused tokens
93359343 int32_t n_tokens = this->n_tokens;
@@ -9455,7 +9463,7 @@ struct llm_build_context {
94559463 }
94569464
94579465 struct ggml_cgraph * build_starcoder() {
9458- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9466+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
94599467
94609468 const int64_t n_embd_head = hparams.n_embd_head_v;
94619469 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9559,7 +9567,7 @@ struct llm_build_context {
95599567 }
95609568
95619569 struct ggml_cgraph * build_refact() {
9562- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9570+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
95639571
95649572 const int64_t n_embd_head = hparams.n_embd_head_v;
95659573 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9653,7 +9661,7 @@ struct llm_build_context {
96539661 }
96549662
96559663 struct ggml_cgraph * build_bert() {
9656- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9664+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
96579665
96589666 const int64_t n_embd_head = hparams.n_embd_head_v;
96599667 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9847,7 +9855,7 @@ struct llm_build_context {
98479855 }
98489856
98499857 struct ggml_cgraph * build_bloom() {
9850- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9858+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
98519859
98529860 const int64_t n_embd_head = hparams.n_embd_head_v;
98539861 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9948,7 +9956,7 @@ struct llm_build_context {
99489956 }
99499957
99509958 struct ggml_cgraph * build_mpt() {
9951- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
9959+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
99529960
99539961 const int64_t n_embd_head = hparams.n_embd_head_v;
99549962 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10238,7 +10246,7 @@ struct llm_build_context {
1023810246 }
1023910247
1024010248 struct ggml_cgraph * build_qwen() {
10241- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
10249+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1024210250
1024310251 const int64_t n_embd_head = hparams.n_embd_head_v;
1024410252 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10350,7 +10358,7 @@ struct llm_build_context {
1035010358 }
1035110359
1035210360 struct ggml_cgraph * build_qwen2() {
10353- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
10361+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1035410362
1035510363 const int64_t n_embd_head = hparams.n_embd_head_v;
1035610364 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10462,7 +10470,7 @@ struct llm_build_context {
1046210470 }
1046310471
1046410472 struct ggml_cgraph * build_qwen2moe() {
10465- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
10473+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1046610474
1046710475 // mutable variable, needed during the last layer of the computation to skip unused tokens
1046810476 int32_t n_tokens = this->n_tokens;
@@ -10608,7 +10616,7 @@ struct llm_build_context {
1060810616 }
1060910617
1061010618 struct ggml_cgraph * build_phi2() {
10611- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
10619+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1061210620
1061310621 const int64_t n_embd_head = hparams.n_embd_head_v;
1061410622 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10729,7 +10737,7 @@ struct llm_build_context {
1072910737 }
1073010738
1073110739 struct ggml_cgraph * build_phi3() {
10732- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
10740+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1073310741
1073410742 const int64_t n_embd_head = hparams.n_embd_head_v;
1073510743 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10961,7 +10969,7 @@ struct llm_build_context {
1096110969 }
1096210970
1096310971 struct ggml_cgraph * build_gpt2() {
10964- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
10972+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1096510973
1096610974 const int64_t n_embd_head = hparams.n_embd_head_v;
1096710975 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -11066,7 +11074,7 @@ struct llm_build_context {
1106611074 }
1106711075
1106811076 struct ggml_cgraph * build_codeshell() {
11069- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11077+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1107011078
1107111079 const int64_t n_embd_head = hparams.n_embd_head_v;
1107211080 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -11177,7 +11185,7 @@ struct llm_build_context {
1117711185 }
1117811186
1117911187 struct ggml_cgraph * build_orion() {
11180- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11188+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1118111189
1118211190 const int64_t n_embd_head = hparams.n_embd_head_v;
1118311191 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11295,7 +11303,7 @@ struct llm_build_context {
1129511303 }
1129611304
1129711305 struct ggml_cgraph * build_internlm2() {
11298- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11306+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1129911307
1130011308 const int64_t n_embd_head = hparams.n_embd_head_v;
1130111309 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11416,7 +11424,7 @@ struct llm_build_context {
1141611424 // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
1141711425 // based on the original build_llama() function
1141811426 struct ggml_cgraph * build_minicpm() {
11419- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11427+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1142011428
1142111429 const int64_t n_embd_head = hparams.n_embd_head_v;
1142211430 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11560,7 +11568,7 @@ struct llm_build_context {
1156011568 }
1156111569
1156211570 struct ggml_cgraph * build_gemma() {
11563- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11571+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1156411572
1156511573 const int64_t n_embd_head_k = hparams.n_embd_head_k;
1156611574
@@ -11668,7 +11676,7 @@ struct llm_build_context {
1166811676 }
1166911677
1167011678 struct ggml_cgraph * build_gemma2() {
11671- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11679+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1167211680
1167311681 const int64_t n_embd_head_k = hparams.n_embd_head_k;
1167411682
@@ -11803,7 +11811,7 @@ struct llm_build_context {
1180311811
1180411812
1180511813 struct ggml_cgraph * build_starcoder2() {
11806- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11814+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1180711815
1180811816 const int64_t n_embd_head = hparams.n_embd_head_v;
1180911817 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11922,7 +11930,7 @@ struct llm_build_context {
1192211930 }
1192311931
1192411932 struct ggml_cgraph * build_mamba() {
11925- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
11933+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1192611934
1192711935 const int64_t d_model = n_embd;
1192811936 const int64_t d_conv = hparams.ssm_d_conv;
@@ -12071,7 +12079,7 @@ struct llm_build_context {
1207112079
1207212080 struct ggml_cgraph * build_command_r() {
1207312081
12074- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12082+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1207512083
1207612084 const int64_t n_embd_head = hparams.n_embd_head_v;
1207712085 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12225,7 +12233,7 @@ struct llm_build_context {
1222512233 // * removed bias
1222612234 // * removed MoE
1222712235 struct ggml_cgraph * build_olmo() {
12228- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12236+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1222912237
1223012238 // mutable variable, needed during the last layer of the computation to skip unused tokens
1223112239 int32_t n_tokens = this->n_tokens;
@@ -12349,7 +12357,7 @@ struct llm_build_context {
1234912357 }
1235012358
1235112359 struct ggml_cgraph * build_openelm() {
12352- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12360+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1235312361
1235412362 const int64_t n_embd_head = hparams.n_embd_head_v;
1235512363 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12474,7 +12482,7 @@ struct llm_build_context {
1247412482 }
1247512483
1247612484 struct ggml_cgraph * build_gptneox() {
12477- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12485+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1247812486
1247912487 const int64_t n_embd_head = hparams.n_embd_head_v;
1248012488 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -12616,7 +12624,7 @@ struct llm_build_context {
1261612624 }
1261712625
1261812626 struct ggml_cgraph * build_arctic() {
12619- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12627+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1262012628
1262112629 // mutable variable, needed during the last layer of the computation to skip unused tokens
1262212630 int32_t n_tokens = this->n_tokens;
@@ -12748,7 +12756,7 @@ struct llm_build_context {
1274812756 }
1274912757
1275012758 struct ggml_cgraph * build_deepseek2() {
12751- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12759+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1275212760
1275312761 // mutable variable, needed during the last layer of the computation to skip unused tokens
1275412762 int32_t n_tokens = this->n_tokens;
@@ -12976,7 +12984,7 @@ struct llm_build_context {
1297612984 }
1297712985
1297812986 struct ggml_cgraph * build_bitnet() {
12979- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
12987+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1298012988
1298112989 const int64_t n_embd_head = hparams.n_embd_head_v;
1298212990 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13116,7 +13124,7 @@ struct llm_build_context {
1311613124 }
1311713125
1311813126 struct ggml_cgraph * build_t5() {
13119- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
13127+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1312013128
1312113129 // mutable variable, needed during the last layer of the computation to skip unused tokens
1312213130 int32_t n_tokens = this->n_tokens;
@@ -13433,7 +13441,7 @@ struct llm_build_context {
1343313441 }
1343413442
1343513443 struct ggml_cgraph * build_jais() {
13436- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
13444+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1343713445
1343813446 const int64_t n_embd_head = hparams.n_embd_head_v;
1343913447 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -13525,7 +13533,7 @@ struct llm_build_context {
1352513533 }
1352613534
1352713535 struct ggml_cgraph * build_chatglm() {
13528- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES , false);
13536+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model) , false);
1352913537
1353013538 const int64_t n_embd_head = hparams.n_embd_head_v;
1353113539 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -14870,9 +14878,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
1487014878 // each move requires 6*n_layer tensors (see build_defrag)
1487114879 // - source view, destination view, copy operation
1487214880 // - x2 for keys and values
14873- //const uint32_t max_moves = LLAMA_MAX_NODES /(6*n_layer);
14881+ //const uint32_t max_moves = llama_model_max_nodes(model) /(6*n_layer);
1487414882 // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
14875- const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
14883+ const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
1487614884
1487714885 // determine which KV cells to move where
1487814886 //
@@ -16762,8 +16770,10 @@ struct llama_context * llama_new_context_with_model(
1676216770 }
1676316771 }
1676416772
16773+ const size_t max_nodes = llama_model_max_nodes(*model);
16774+
1676516775 // buffer used to store the computation graph and the tensor meta data
16766- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES , false));
16776+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes , false));
1676716777
1676816778 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
1676916779 bool pipeline_parallel =
@@ -16776,7 +16786,7 @@ struct llama_context * llama_new_context_with_model(
1677616786 // currently this is only implemented in the CUDA backend
1677716787 pipeline_parallel = false;
1677816788#endif
16779- ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES , pipeline_parallel);
16789+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes , pipeline_parallel);
1678016790
1678116791 if (pipeline_parallel) {
1678216792 LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
0 commit comments