@@ -4288,6 +4288,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());
 
+    return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
+}
+
+bool llama_model::create_backend_buffers(std::size_t size_data,
+        const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
+        const int32_t n_gpu_layers, bool do_print_backend_buffers_info) {
     // create the backend buffers
     std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
     ctx_bufs.reserve(ctx_map.size());
@@ -4296,7 +4303,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     pimpl->bufs.reserve(n_max_backend_buffer);
 
-    for (auto & it : ctx_map) {
+    for (const auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
 
@@ -4372,23 +4379,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         ctx_bufs.emplace_back(ctx, buf_map);
     }
 
-    if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
-        }
-
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers = hparams.n_layer + 1;
-
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    }
-
-    // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    if (do_print_backend_buffers_info) {
+        print_backend_buffers_info(n_gpu_layers);
     }
 
     // populate tensors_by_name
@@ -4416,6 +4408,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     return true;
 }
 
+void llama_model::print_backend_buffers_info(const int32_t n_gpu_layers) {
+    if (llama_supports_gpu_offload()) {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+        }
+
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers),
+                max_backend_supported_layers);
+    }
+
+    // print memory requirements per buffer type
+    for (auto & buf : pimpl->bufs) {
+        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()),
+                ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    }
+}
+
 std::string llama_model::arch_name() const {
     return llm_arch_name(arch);
 }