Commit 4958a6c

[refactor] Create backend buffers
Refactor backend buffer creation (for model loading) into functions.
1 parent 5c9be64 commit 4958a6c

File tree

2 files changed: +43 -18 lines changed

src/llama-model.cpp

Lines changed: 33 additions & 18 deletions
@@ -4288,6 +4288,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());

+    return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
+}
+
+bool llama_model::create_backend_buffers(std::size_t size_data,
+        const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
+        const int32_t n_gpu_layers, bool do_print_backend_buffers_info) {
     // create the backend buffers
     std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
     ctx_bufs.reserve(ctx_map.size());
@@ -4296,7 +4303,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     pimpl->bufs.reserve(n_max_backend_buffer);

-    for (auto & it : ctx_map) {
+    for (const auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;

@@ -4372,23 +4379,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         ctx_bufs.emplace_back(ctx, buf_map);
     }

-    if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
-        }
-
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers = hparams.n_layer + 1;
-
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    }
-
-    // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    if(do_print_backend_buffers_info) {
+        print_backend_buffers_info(n_gpu_layers);
     }

     // populate tensors_by_name
@@ -4416,6 +4408,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     return true;
 }

+void llama_model::print_backend_buffers_info(const int32_t n_gpu_layers) {
+    if (llama_supports_gpu_offload()) {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+        }
+
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers),
+            max_backend_supported_layers);
+    }
+
+    // print memory requirements per buffer type
+    for (auto & buf : pimpl->bufs) {
+        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()),
+            ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    }
+}
+
 std::string llama_model::arch_name() const {
     return llm_arch_name(arch);
 }
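Net effect of the llama-model.cpp change: load_tensors now ends by delegating to the new create_backend_buffers member, and the GPU-offload / buffer-size logging that used to sit inline moves into print_backend_buffers_info, gated by a do_print_backend_buffers_info flag. The following is a minimal, self-contained sketch of that split using hypothetical stand-in types and names (it is not llama.cpp code; the real functions operate on ggml contexts and the model loader), intended only to illustrate the control flow introduced by this commit:

// toy_refactor_sketch.cpp - illustrative only, hypothetical names
#include <cstdio>
#include <vector>

struct model {
    std::vector<int> bufs;  // stand-in for pimpl->bufs

    bool load_tensors() {
        // ... mapping setup elided in this sketch ...
        return create_backend_buffers(/*n_gpu_layers=*/99);
    }

    bool create_backend_buffers(int n_gpu_layers, bool do_print_backend_buffers_info = true) {
        bufs.push_back(1);  // stand-in for the real per-backend allocation loop
        if (do_print_backend_buffers_info) {
            print_backend_buffers_info(n_gpu_layers);
        }
        return true;
    }

    void print_backend_buffers_info(int n_gpu_layers) {
        std::printf("offloading up to %d layers; %zu buffer(s) allocated\n",
                    n_gpu_layers, bufs.size());
    }
};

int main() {
    model m;
    return m.load_tensors() ? 0 : 1;
}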

src/llama-model.h

Lines changed: 10 additions & 0 deletions
@@ -7,10 +7,12 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"

+#include <cstdint>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <map>

 struct llama_cparams;
 struct llama_ubatch;
@@ -373,6 +375,14 @@ struct llama_model {
     explicit llama_model(const struct llama_model_params & params);
     ~llama_model();

+    /// @brief Create backend buffers for all tensors
+    bool create_backend_buffers(std::size_t size_data,
+            const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+            llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers,
+            bool do_print_backend_buffers_info = true);
+
+    void print_backend_buffers_info(int32_t n_gpu_layers);
+
     void load_stats  (llama_model_loader & ml);
     void load_arch   (llama_model_loader & ml);
     void load_hparams(llama_model_loader & ml);
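A small design note on the header change: do_print_backend_buffers_info is a trailing parameter that defaults to true, so the call in load_tensors (which omits it) keeps today's log output, and the refactor does not force any caller to change. A trivial standalone illustration of that default-argument behavior, with a hypothetical free-function stand-in rather than the real member:

// toy_default_arg.cpp - illustrative only, hypothetical stand-in
#include <cassert>

static bool create_backend_buffers(int n_gpu_layers, bool do_print_backend_buffers_info = true) {
    (void) n_gpu_layers;
    (void) do_print_backend_buffers_info;  // logging elided in this sketch
    return true;
}

int main() {
    assert(create_backend_buffers(32));         // existing-style call: flag omitted, logging stays on
    assert(create_backend_buffers(32, false));  // hypothetical caller that opts out of the log output
    return 0;
}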
