
Commit 172c694

add debug logging

1 parent: 2f3a9c9

File tree: 1 file changed, +59 -14 lines

src/llama.cpp

Lines changed: 59 additions & 14 deletions
@@ -119,6 +119,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
     hp_n_ctx_train = model->hparams.n_ctx_train;
     hp_n_expert = model->hparams.n_expert;
 
+    llama_memory_breakdown_print(ctx); // goes to debug log
+
     llama_free(ctx);
     llama_model_free(model);
     llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
@@ -142,6 +144,7 @@ bool llama_params_fit(
 
     // step 1: get data for default parameters and check whether any changes are necessary in the first place
 
+    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
     if (nd == 0) {
@@ -316,6 +319,7 @@ bool llama_params_fit(
     tensor_buft_overides[1] = {nullptr, nullptr};
     mparams->tensor_buft_overrides = tensor_buft_overides;
 
+    LLAMA_LOG_DEBUG("%s: getting device memory data for all MoE tensors in system memory:\n", __func__);
     const dmds_t dmds_cpu_moe = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     int64_t global_surplus = 0;
     for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
@@ -328,34 +332,59 @@ bool llama_params_fit(
     // step 3: for MoE models, if at least the dense tensors can be fit, try fitting as many full layers as possible
 
     const uint32_t nl_scaling = hp_ngl / nd;
-    const std::vector<memory_scaling> spl_part = get_memory_scaling( // size per device and per partial == MoE-only layer
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> spl_part; // size per device and per partial == MoE-only layer
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all MoE tensors in system memory:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all MoE tensors in system memory:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        spl_part = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
+    for (size_t id = 0; id < nd; id++) {
+        LLAMA_LOG_DEBUG("%s: spl_part[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n",
+            __func__, id, spl_part[id].base/MiB, spl_part[id].per_layer/MiB);
+    }
 
     // for spl_part all MoE tensors were still on CPU, reset the TBOs so that all tensors are on the devices again
     tensor_buft_overides[0] = {nullptr, nullptr};
     mparams->tensor_buft_overrides = tensor_buft_overides;
 
-    const std::vector<memory_scaling> spl_full = get_memory_scaling( // size per device and per full layer
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> spl_full; // size per device and per full layer
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all tensors in device memory:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all tensors in device memory:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        spl_full = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
+    for (size_t id = 0; id < nd; id++) {
+        LLAMA_LOG_DEBUG("%s: spl_full[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n",
+            __func__, id, spl_full[id].base/MiB, spl_full[id].per_layer/MiB);
+    }
 
     // the non-repeating tensors (e.g. output matrix) are difficult to quantify,
     // get memory use with all tensors on the last device and use that as the starting point for the last device only
     for (size_t id = 0; id < nd - 1; id++) {
         tensor_split[id] = 0.0f;
     }
     tensor_split[nd - 1] = 1.0f;
+    LLAMA_LOG_DEBUG("%s: getting device memory data with entire model on last device:\n", __func__);
     const dmds_t dmds_last = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     tensor_split[nd - 1] = 0.0f;
 
     struct ngl {
         uint32_t part = 0;
         uint32_t full = 0;
+
+        explicit operator std::string() const {
+            return "[" + std::to_string(part) + ", " + std::to_string(full) + "]";
+        }
     };
 
     // utility function that distributes layers to devices and returns whether the memory margin can be met on all devices
     // - ngl_per_device: resulting distribution of dense-only/full layers across devices
     // - global_ngl_part: total number of dense-only layers
-    auto distribute_layers = [&](std::vector<ngl> & ngl_per_device, const uint32_t global_ngl_part) -> bool {
+    auto distribute_layers = [&](std::vector<ngl> & ngl_per_device, std::vector<int64_t> & usable_memory, const uint32_t global_ngl_part) -> bool {
         // reset result to initial state, initially put entire model on the last device
         for (size_t id = 0; id < nd - 1; id++) {
             ngl_per_device[id] = {0, 0};
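
Note on the spl_part/spl_full values logged above: get_memory_scaling itself is not shown in this diff, but the base/per_layer fields it fills in suggest a two-point linear model of device memory as a function of the layer count. A minimal sketch of such a fit, assuming that interpretation; memory_scaling_sketch and fit_two_points are hypothetical names used only for illustration:

#include <cstdint>

// Hypothetical stand-in for the memory_scaling struct whose base/per_layer
// fields the new LLAMA_LOG_DEBUG calls print (divided by MiB for readability).
struct memory_scaling_sketch {
    int64_t base      = 0; // memory that does not scale with the layer count
    int64_t per_layer = 0; // additional memory per layer
};

// Two-point fit: given measured memory for 1 layer and for n layers on one
// device, solve base + 1*per_layer = mem_1 and base + n*per_layer = mem_n.
// Assumes n > 1.
static memory_scaling_sketch fit_two_points(int64_t mem_1, int64_t mem_n, uint32_t n) {
    memory_scaling_sketch ms;
    ms.per_layer = (mem_n - mem_1) / int64_t(n - 1);
    ms.base      = mem_1 - ms.per_layer;
    return ms;
}
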
@@ -364,16 +393,14 @@ bool llama_params_fit(
         ngl_per_device.back().full = hp_ngl + 1;
 
         // usable_memory: free memory above margin that can be used for further allocations
-        std::vector<int64_t> usable_memory;
-        usable_memory.reserve(nd);
         for (size_t id = 0; id < nd - 1; id++) {
             int64_t um = dmds_last[id].free - margin - spl_full[id].base;
             um = std::max(um, int64_t(0));
-            usable_memory.push_back(um);
+            usable_memory[id] = um;
         }
         {
             const llama_memory_breakdown_data & mb = dmds_last.back().mb;
-            usable_memory.push_back(dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin);
+            usable_memory.back() = dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin;
         }
 
         // convert some layers on the last device from full layers to dense-only layers
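
The hunk above drops the local usable_memory vector from distribute_layers so that the caller can allocate it once and log it after each call; the hunk below builds those log strings with std::string(ngl_per_device[id]) casts, which are required because the conversion operator added to ngl above is declared explicit. A small standalone illustration (the ngl struct is copied from the diff; main() is only a demonstration):

#include <cstdint>
#include <cstdio>
#include <string>

struct ngl {
    uint32_t part = 0;
    uint32_t full = 0;

    explicit operator std::string() const {
        return "[" + std::to_string(part) + ", " + std::to_string(full) + "]";
    }
};

int main() {
    ngl x;
    x.part = 3;
    x.full = 5;
    // std::string s = x;           // does not compile: the operator is explicit
    std::string s = std::string(x); // explicit conversion, yields "[3, 5]"
    printf("%s\n", s.c_str());
    return 0;
}
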
@@ -425,9 +452,21 @@ bool llama_params_fit(
 
     // iteratively increase the number of partial layers until the memory consumption is low enough
     std::vector<ngl> ngl_per_device(nd);
-    for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) {
-        if (distribute_layers(ngl_per_device, global_ngl_part)) {
-            break;
+    {
+        std::vector<int64_t> usable_memory(nd);
+        for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) {
+            const bool success = distribute_layers(ngl_per_device, usable_memory, global_ngl_part);
+            std::string ngl_per_device_str = std::string(ngl_per_device[0]);
+            std::string usable_memory_str = std::to_string(usable_memory[0]/MiB);
+            for (size_t id = 1; id < nd; id++) {
+                ngl_per_device_str += ", " + std::string(ngl_per_device[id]);
+                usable_memory_str += ", " + std::to_string(usable_memory[id]/MiB);
+            }
+            LLAMA_LOG_DEBUG("%s: global_ngl_part=%" PRIu32 ", success=%d, ngl_per_device=[%s], usable_memory[MiB]=[%s]\n",
+                __func__, global_ngl_part, success ? 1 : 0, ngl_per_device_str.c_str(), usable_memory_str.c_str());
+            if (success) {
+                break;
+            }
         }
     }

@@ -504,8 +543,14 @@ bool llama_params_fit(
     // all layers are the same so simply determine how many layers will fit per device
 
     const uint32_t nl_scaling = hp_ngl / nd;
-    const std::vector<memory_scaling> ms = get_memory_scaling(
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> ms;
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 full layer:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " full layers:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        ms = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
 
     mparams->n_gpu_layers = 0;
     std::vector<uint32_t> ngl_per_device;
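
All of the new output goes through LLAMA_LOG_DEBUG (the inline comment on the new llama_memory_breakdown_print(ctx) call notes that it "goes to debug log"), so it is only visible if the installed log callback keeps debug-level messages. A minimal sketch of installing such a callback with llama_log_set, the same hook the first hunk restores via ud.original_logger; the surrounding main() and the placeholder comments are illustrative only:

#include <cstdio>
#include "llama.h"

// Forward every log line to stderr, including GGML_LOG_LEVEL_DEBUG messages.
static void log_all(ggml_log_level level, const char * text, void * /*user_data*/) {
    (void) level; // no level filtering
    fputs(text, stderr);
}

int main() {
    llama_log_set(log_all, nullptr);
    // ... load a model and call llama_params_fit() here; the LLAMA_LOG_DEBUG
    // lines added in this commit will then show up on stderr.
    return 0;
}
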
