
Commit e2f4449

revert broken change from ggml-org#18148 pending fix
1 parent: 2e57e5e

File tree: 2 files changed, +37 -27 lines

  src/llama-model.cpp
  src/llama.cpp

src/llama-model.cpp

Lines changed: 5 additions & 7 deletions
@@ -2480,7 +2480,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+    int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
 
 #if defined(GGML_USE_CLBLAST)
     printf("\nOpenCL GPU Offload Fallback...\n");
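Note (not part of the diff): the line removed above includes the output layer in n_gpu_layers (hence the + 1), while the restored line counts only the repeating layers. A minimal standalone sketch with toy values, just to show how the two formulas shift the first GPU-resident layer index:

#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32; // repeating layers of a hypothetical model
    const int n_gpu_layers = 10; // requested offload count (toy value)

    // formula removed by this commit: the + 1 makes n_gpu_layers also cover the output layer
    const int start_removed  = std::max(n_layer + 1 - n_gpu_layers, 0);
    // formula restored by this commit: n_gpu_layers counts repeating layers only
    const int start_restored = std::max(n_layer - n_gpu_layers, 0);

    printf("removed variant:  i_gpu_start = %d -> %d repeating layers start on the GPU\n",
           start_removed,  n_layer - start_removed);  // 23 -> 9
    printf("restored variant: i_gpu_start = %d -> %d repeating layers start on the GPU\n",
           start_restored, n_layer - start_restored); // 22 -> 10
    return 0;
}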
@@ -2491,9 +2491,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (cpu_dev == nullptr) {
         throw std::runtime_error(format("%s: no CPU backend found", __func__));
     }
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
+        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
             // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
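For context, the check above maps each layer index il either to the CPU buffer list or to a GPU device. A reduced sketch of that decision, with plain ints standing in for hparams and the device list (names and values are illustrative only, not the actual llama.cpp code):

#include <algorithm>
#include <cstdio>

// mirrors the shape of the check in get_layer_buft_list
static const char * layer_backend(int il, int i_gpu_start, int act_gpu_layers) {
    if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
        return "CPU";
    }
    return "GPU";
}

int main() {
    const int n_layer        = 8;
    const int n_gpu_layers   = 3;
    const int i_gpu_start    = std::max(n_layer - n_gpu_layers, 0); // 5, per the restored formula
    const int act_gpu_layers = std::min(n_gpu_layers, n_layer + 1); // 3

    for (int il = 0; il < n_layer; il++) {
        printf("layer %d -> %s\n", il, layer_backend(il, i_gpu_start, act_gpu_layers));
    }
    return 0;
}

With these toy values layers 0..4 stay on the CPU and layers 5..7 go to the GPU.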
@@ -6852,12 +6852,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        int n_repeating = n_gpu;
-        if (n_repeating > 0) {
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
             LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
-            n_repeating--;
         }
-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
 
         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers = hparams.n_layer + 1;
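The restored logging reports the repeating-layer count directly and mentions the output layer only when n_gpu_layers exceeds the repeating-layer count. A toy reproduction of just that branch (printf stands in for LLAMA_LOG_INFO, plain ints for hparams):

#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32; // hypothetical repeating-layer count
    const int n_gpu_layers = 33; // one more than n_layer, i.e. output layer included

    const int n_gpu = std::min(n_gpu_layers, n_layer);
    printf("offloading %d repeating layers to GPU\n", n_gpu);
    if (n_gpu_layers > n_layer) {
        printf("offloading output layer to GPU\n");
    }
    return 0;
}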

src/llama.cpp

Lines changed: 32 additions & 20 deletions
@@ -316,6 +316,10 @@ static void llama_params_fit_impl(
         if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
             throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
         }
+        if (hp_ngl < 2*nd) {
+            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
+        }
     }
     if (!tensor_buft_overrides) {
         throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
@@ -382,17 +386,22 @@ static void llama_params_fit_impl(
     auto set_ngl_tensor_split_tbo = [&](
             const std::vector<ngl_t> & ngl_per_device,
             const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
+            llama_model_params & mparams,
+            const bool add_nonrepeating) {
         mparams.n_gpu_layers = 0;
         for (size_t id = 0; id < nd; id++) {
             mparams.n_gpu_layers += ngl_per_device[id].n_layer;
             if (nd > 1) {
                 tensor_split[id] = ngl_per_device[id].n_layer;
             }
         }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
+        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
 
+        if (add_nonrepeating) {
+            mparams.n_gpu_layers += 1;
+            tensor_split[nd - 1] += 1;
+        }
         mparams.tensor_split = tensor_split;
 
         size_t itbo = 0;
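To see what set_ngl_tensor_split_tbo computes, here is a reduced, standalone sketch: per-device layer counts roll up into n_gpu_layers and tensor_split, il0 marks where the tensor buft overrides start, and add_nonrepeating accounts for the output (non-repeating) tensors on the last device. ngl_t is trimmed to the one field needed and all values are toy numbers:

#include <cstdint>
#include <cstdio>
#include <vector>

struct ngl_t {
    uint32_t n_layer = 0; // reduced stand-in for the ngl_t used in llama_params_fit_impl
};

int main() {
    const uint32_t hp_ngl = 32;                       // repeating layers in the model
    std::vector<ngl_t> ngl_per_device = {{10}, {22}}; // hypothetical split across 2 devices
    std::vector<float> tensor_split(ngl_per_device.size(), 0.0f);

    int n_gpu_layers = 0;
    for (size_t id = 0; id < ngl_per_device.size(); id++) {
        n_gpu_layers += ngl_per_device[id].n_layer;
        tensor_split[id] = ngl_per_device[id].n_layer;
    }
    const uint32_t il0 = hp_ngl - n_gpu_layers; // first layer index covered by buft overrides

    const bool add_nonrepeating = true; // also place the output tensors on the last device
    if (add_nonrepeating) {
        n_gpu_layers += 1;
        tensor_split.back() += 1;
    }

    printf("n_gpu_layers = %d, il0 = %u, tensor_split = {%.0f, %.0f}\n",
           n_gpu_layers, il0, tensor_split[0], tensor_split[1]);
    return 0;
}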
@@ -423,9 +432,10 @@ static void llama_params_fit_impl(
     auto get_memory_for_layers = [&](
             const char * func_name,
             const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+            const bool add_nonrepeating) -> std::vector<int64_t> {
         llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
 
         const dmds_t dmd_nl = llama_get_device_memory_data(
             path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
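get_memory_for_layers evaluates a candidate assignment by applying it to a copy of the model params and then measuring, so the caller's configuration is never mutated. A stripped-down sketch of that copy-then-measure pattern (the params struct and the per-layer cost here are invented for illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

struct model_params {
    int n_gpu_layers = 0; // stand-in for llama_model_params
};

// pretend measurement: each offloaded layer costs 900 MiB, split evenly over nd devices
static std::vector<int64_t> measure_devices(const model_params & p, size_t nd) {
    return std::vector<int64_t>(nd, 900 * int64_t(p.n_gpu_layers) / int64_t(nd));
}

static std::vector<int64_t> memory_for_candidate(const model_params & mparams, int candidate_ngl, size_t nd) {
    model_params mparams_copy = mparams; // work on a copy, leave the caller's params alone
    mparams_copy.n_gpu_layers = candidate_ngl;
    return measure_devices(mparams_copy, nd);
}

int main() {
    model_params mparams;
    const std::vector<int64_t> mem = memory_for_candidate(mparams, 24, 2);
    printf("projected: %lld MiB + %lld MiB; caller still has n_gpu_layers = %d\n",
           (long long) mem[0], (long long) mem[1], mparams.n_gpu_layers);
    return 0;
}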
@@ -483,6 +493,9 @@ static void llama_params_fit_impl(
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
+    // whether for the optimal memory use we expect to load at least some MoE tensors:
+    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
+
     std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
     overflow_bufts.reserve(nd);
     for (size_t id = 0; id < nd - 1; ++id) {
@@ -491,7 +504,7 @@ static void llama_params_fit_impl(
     overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
     std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
     if (hp_nex > 0) {
         for (size_t id = 0; id < nd; id++) {
             ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
@@ -504,14 +517,13 @@ static void llama_params_fit_impl(
     // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
     // - check memory use of our guess, replace either the low or high bound
     // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    // - the last device has the output layer, which cannot be a partial layer
     if (hp_nex == 0) {
         LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
     } else {
         LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
     }
     for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
+        uint32_t n_unassigned = hp_ngl;
         for (size_t jd = id + 1; jd < nd; ++jd) {
             assert(n_unassigned >= ngl_per_device[jd].n_layer);
             n_unassigned -= ngl_per_device[jd].n_layer;
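The comment block above is the whole per-device algorithm: keep a low bound that fits and a high bound that does not, guess the crossing point by linearly interpolating memory use per layer, and stop once the bounds differ by a single layer. A self-contained sketch of that search under an invented linear memory model (memory_for, the 900 MiB per layer, and the target are all made up; the real code measures candidates with get_memory_for_layers):

#include <cstdint>
#include <cstdio>

// toy memory model: fixed overhead plus a constant cost per layer, in MiB
static int64_t memory_for(uint32_t n_layer) {
    return 1500 + 900 * int64_t(n_layer);
}

// largest layer count whose projected memory stays within `target`
static uint32_t fit_layers(uint32_t n_layer_max, int64_t target) {
    uint32_t low  = 0;           // known to fit (0 layers always fits in this toy model)
    uint32_t high = n_layer_max;
    if (memory_for(high) <= target) {
        return high;
    }
    int64_t mem_low  = memory_for(low);
    int64_t mem_high = memory_for(high);
    while (high - low > 1) {
        // interpolate where the line through (low, mem_low) and (high, mem_high) meets the target,
        // then clamp the guess strictly between the bounds so the search always makes progress
        uint32_t guess = low + uint32_t((double)(target - mem_low) * (high - low) / (double)(mem_high - mem_low));
        if (guess <= low)  { guess = low + 1;  }
        if (guess >= high) { guess = high - 1; }

        const int64_t mem_guess = memory_for(guess);
        if (mem_guess <= target) {
            low     = guess;
            mem_low = mem_guess;
        } else {
            high     = guess;
            mem_high = mem_guess;
        }
    }
    return low;
}

int main() {
    printf("fits %u of 32 layers\n", fit_layers(32, 24000)); // 25 with this toy model
    return 0;
}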
@@ -520,10 +532,10 @@ static void llama_params_fit_impl(
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         ngl_per_device_high[id].n_layer = n_unassigned;
         if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
+            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
         }
         if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
             if (mem_high[id] > targets[id]) {
                 assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                 uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
@@ -538,7 +550,7 @@ static void llama_params_fit_impl(
                 if (hp_nex) {
                     ngl_per_device_test[id].n_part += step_size;
                 }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
 
                 if (mem_test[id] <= targets[id]) {
                     ngl_per_device = ngl_per_device_test;
@@ -565,7 +577,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
     }
     if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
         return;
     }
 
@@ -588,13 +600,13 @@ static void llama_params_fit_impl(
     for (size_t id = 0; id <= id_dense_start; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
+            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
             ngl_per_device_high[id].n_layer += n_layer_move;
             ngl_per_device_high[jd].n_layer -= n_layer_move;
             ngl_per_device_high[jd].n_part = 0;
         }
         size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
 
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
@@ -622,7 +634,7 @@ static void llama_params_fit_impl(
                         break;
                     }
                 }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
 
                 if (mem_test[id] <= targets[id]) {
                     ngl_per_device = ngl_per_device_test;
@@ -649,7 +661,7 @@ static void llama_params_fit_impl(
     }
 
     // try to fit at least part of one more layer
-    if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
+    if (ngl_per_device[id_dense_start].n_layer > 0) {
        std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
        size_t id_dense_start_test = id_dense_start;
        ngl_per_device_test[id_dense_start_test].n_layer--;
@@ -661,7 +673,7 @@ static void llama_params_fit_impl(
        }
        ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
        LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-       std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+       std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
        if (mem_test[id] < targets[id]) {
            ngl_per_device = ngl_per_device_test;
            mem = mem_test;
@@ -671,7 +683,7 @@ static void llama_params_fit_impl(
 
        ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
        LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-       mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+       mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
        if (mem_test[id] < targets[id]) {
            ngl_per_device = ngl_per_device_test;
            mem = mem_test;
@@ -682,7 +694,7 @@ static void llama_params_fit_impl(
        } else {
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-           mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+           mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
            if (mem_test[id] < targets[id]) {
                ngl_per_device = ngl_per_device_test;
                mem = mem_test;
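The three hunks above form a fallback chain: try to keep one extra partial layer on the device with successive overflow variants (LAYER_FRACTION_UP, then LAYER_FRACTION_GATE, then LAYER_FRACTION_ATTN) and accept the first whose projected memory stays below the target. A sketch of that try-in-order pattern; the enum values are taken from the diff, but the projected memory numbers and the target are invented:

#include <cstdint>
#include <cstdio>

enum layer_fraction { LAYER_FRACTION_UP, LAYER_FRACTION_GATE, LAYER_FRACTION_ATTN };

// hypothetical projected memory for each overflow variant, in MiB
static int64_t projected_memory(layer_fraction overflow_type) {
    switch (overflow_type) {
        case LAYER_FRACTION_UP:   return 23800;
        case LAYER_FRACTION_GATE: return 23100;
        case LAYER_FRACTION_ATTN: return 22400;
    }
    return 0;
}

int main() {
    const int64_t target = 23000; // MiB still available on the device (toy value)

    const layer_fraction candidates[] = {LAYER_FRACTION_UP, LAYER_FRACTION_GATE, LAYER_FRACTION_ATTN};
    const char * names[]              = {"LAYER_FRACTION_UP", "LAYER_FRACTION_GATE", "LAYER_FRACTION_ATTN"};

    for (size_t i = 0; i < 3; i++) {
        const int64_t mem = projected_memory(candidates[i]);
        printf("trying %s: %lld MiB\n", names[i], (long long) mem);
        if (mem < target) {
            printf("accepted %s\n", names[i]);
            break; // keep the first variant that fits, as the diff does
        }
    }
    return 0;
}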
@@ -699,7 +711,7 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
 
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
 }
 
 bool llama_params_fit(
