@@ -316,6 +316,10 @@ static void llama_params_fit_impl(
         if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
             throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
         }
+        if (hp_ngl < 2*nd) {
+            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
+        }
     }
     if (!tensor_buft_overrides) {
         throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
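
For concreteness, here is the new guard in isolation as a self-contained sketch; the helper name and main() harness are illustrative, not part of the patch. With nd = 4 devices, a model with fewer than 8 repeating layers now aborts up front instead of entering the fitting loop:

    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Illustrative standalone copy of the guard added above: hp_ngl is the
    // number of repeating layers, nd the number of devices to fit.
    static void check_layers_per_device(uint32_t hp_ngl, size_t nd) {
        if (hp_ngl < 2*nd) {
            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
        }
    }

    int main() {
        check_layers_per_device(32, 4); // 32 >= 2*4: passes silently
        try {
            check_layers_per_device(7, 4); // 7 < 2*4: throws
        } catch (const std::exception & e) {
            printf("%s\n", e.what());
        }
        return 0;
    }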
@@ -382,17 +386,22 @@ static void llama_params_fit_impl(
     auto set_ngl_tensor_split_tbo = [&](
             const std::vector<ngl_t> & ngl_per_device,
             const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
+            llama_model_params & mparams,
+            const bool add_nonrepeating) {
         mparams.n_gpu_layers = 0;
         for (size_t id = 0; id < nd; id++) {
             mparams.n_gpu_layers += ngl_per_device[id].n_layer;
             if (nd > 1) {
                 tensor_split[id] = ngl_per_device[id].n_layer;
             }
         }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
+        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
 
+        if (add_nonrepeating) {
+            mparams.n_gpu_layers += 1;
+            tensor_split[nd - 1] += 1;
+        }
         mparams.tensor_split = tensor_split;
 
         size_t itbo = 0;
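
Stripped of the lambda plumbing, the adjusted bookkeeping above amounts to the sketch below. Assumptions: hp_ngl counts only the repeating layers, and (per the usual llama.cpp convention) n_gpu_layers == hp_ngl + 1 also offloads the non-repeating output tensors; names other than those in the hunk are illustrative:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Sketch of the adjusted accounting (illustrative names): ngl[id] is the
    // number of repeating layers assigned to device id.
    static int32_t plan_layers(const std::vector<uint32_t> & ngl, uint32_t hp_ngl,
                               bool add_nonrepeating, std::vector<float> & tensor_split,
                               uint32_t & il0) {
        int32_t n_gpu_layers = 0;
        for (size_t id = 0; id < ngl.size(); id++) {
            n_gpu_layers += ngl[id];
            tensor_split[id] = float(ngl[id]);
        }
        assert(uint32_t(n_gpu_layers) <= hp_ngl);
        il0 = hp_ngl - n_gpu_layers; // override start index, repeating layers only

        if (add_nonrepeating) {      // output tensors ride on the last device
            n_gpu_layers += 1;
            tensor_split[ngl.size() - 1] += 1.0f;
        }
        return n_gpu_layers;
    }

This is what the hp_ngl + 1 → hp_ngl change buys: il0 now depends only on repeating layers, while add_nonrepeating separately pins the output tensors to the last device.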
@@ -423,9 +432,10 @@ static void llama_params_fit_impl(
     auto get_memory_for_layers = [&](
             const char * func_name,
             const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+            const bool add_nonrepeating) -> std::vector<int64_t> {
         llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
 
         const dmds_t dmd_nl = llama_get_device_memory_data(
             path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -483,6 +493,9 @@ static void llama_params_fit_impl(
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
+    // whether for the optimal memory use we expect to load at least some MoE tensors:
+    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
+
     std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
     overflow_bufts.reserve(nd);
     for (size_t id = 0; id < nd - 1; ++id) {
@@ -491,7 +504,7 @@ static void llama_params_fit_impl(
     overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
     std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
     if (hp_nex > 0) {
         for (size_t id = 0; id < nd; id++) {
             ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
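
For orientation in the hunks that follow: the per-device records manipulated here have roughly this shape, reconstructed from the fields the diff touches (the real definition lives earlier in the file and may differ in detail):

    // Reconstructed from usage in this diff; illustrative only.
    struct ngl_t {
        uint32_t n_layer       = 0; // repeating layers assigned to the device
        uint32_t n_part        = 0; // of those, how many may be partial layers
        int      overflow_type = 0; // LAYER_FRACTION_MOE/UP/GATE/ATTN (an enum in
                                    // the real code): which tensors of a partial
                                    // layer spill to the overflow buft
    };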
@@ -504,14 +517,13 @@ static void llama_params_fit_impl(
     // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
     // - check memory use of our guess, replace either the low or high bound
     // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    // - the last device has the output layer, which cannot be a partial layer
     if (hp_nex == 0) {
         LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
     } else {
         LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
     }
     for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
+        uint32_t n_unassigned = hp_ngl;
         for (size_t jd = id + 1; jd < nd; ++jd) {
             assert(n_unassigned >= ngl_per_device[jd].n_layer);
             n_unassigned -= ngl_per_device[jd].n_layer;
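
The next hunks drive the search this comment describes. Reduced to a single device and stripped of the n_part bookkeeping, the scheme is the interpolation search sketched below (assumptions: mem(n) is a monotonically increasing probe of the projected memory use for n layers, and mem(lo) <= target < mem(hi) on entry; all names are illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <functional>

    // Interpolation search between a fitting lower bound and an overflowing
    // upper bound; returns the largest layer count that still fits the target.
    static uint32_t fit_layers(uint32_t lo, uint32_t hi, int64_t target,
                               const std::function<int64_t(uint32_t)> & mem) {
        int64_t mem_lo = mem(lo);
        int64_t mem_hi = mem(hi);
        while (hi - lo > 1) {
            // guess where memory use meets the target by interpolating
            // linearly between the two bounds:
            int64_t step = (target - mem_lo)*int64_t(hi - lo) / (mem_hi - mem_lo);
            step = std::max<int64_t>(step, 1);
            step = std::min<int64_t>(step, int64_t(hi - lo) - 1);
            const uint32_t n_test = lo + uint32_t(step);

            const int64_t mem_test = mem(n_test);
            if (mem_test <= target) {
                lo     = n_test; // still fits: raise the lower bound
                mem_lo = mem_test;
            } else {
                hi     = n_test; // overflows: lower the upper bound
                mem_hi = mem_test;
            }
        }
        return lo; // the lower bound just barely still fits
    }

In the real code the probe is get_memory_for_layers() and, when hp_nex > 0, n_part grows in step with n_layer, as the hunks below show.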
@@ -520,10 +532,10 @@ static void llama_params_fit_impl(
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         ngl_per_device_high[id].n_layer = n_unassigned;
         if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
+            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
         }
         if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
             if (mem_high[id] > targets[id]) {
                 assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                 uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
@@ -538,7 +550,7 @@ static void llama_params_fit_impl(
                     if (hp_nex) {
                         ngl_per_device_test[id].n_part += step_size;
                     }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
 
                     if (mem_test[id] <= targets[id]) {
                         ngl_per_device = ngl_per_device_test;
@@ -565,7 +577,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
     }
     if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
         return;
     }
 
@@ -588,13 +600,13 @@ static void llama_params_fit_impl(
     for (size_t id = 0; id <= id_dense_start; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
+            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
             ngl_per_device_high[id].n_layer += n_layer_move;
             ngl_per_device_high[jd].n_layer -= n_layer_move;
             ngl_per_device_high[jd].n_part = 0;
         }
         size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
 
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
@@ -622,7 +634,7 @@ static void llama_params_fit_impl(
                         break;
                     }
                 }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
 
                 if (mem_test[id] <= targets[id]) {
                     ngl_per_device = ngl_per_device_test;
@@ -649,7 +661,7 @@ static void llama_params_fit_impl(
         }
 
         // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
+        if (ngl_per_device[id_dense_start].n_layer > 0) {
             std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
             size_t id_dense_start_test = id_dense_start;
             ngl_per_device_test[id_dense_start_test].n_layer--;
@@ -661,7 +673,7 @@ static void llama_params_fit_impl(
             }
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -671,7 +683,7 @@ static void llama_params_fit_impl(
 
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -682,7 +694,7 @@ static void llama_params_fit_impl(
         } else {
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+            mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
             if (mem_test[id] < targets[id]) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
@@ -699,7 +711,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
 
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
 }
 
 bool llama_params_fit(