
Commit 9324c76

Refactored llama-model to adopt upstream's unique_ptr approach in place of raw pointers
1 parent bcdd5ca


3 files changed (+29, -17 lines)


src/llama-model-load.h

Lines changed: 5 additions & 4 deletions
@@ -96,16 +96,17 @@ struct IncrementalSplitsTensorLoad {
                 throw std::runtime_error("failed to create ggml context for split-file");
             }
 
-            ctx_split_map[key] = ctx;
-            model_impl->ctxs.emplace_back(ctx);
+            ctx_split_map[key] = ggml_context_ptr(ctx);
+            // Contexts are cleaned up when create_split_backend_buffers is called
+            // Review: this will be an issue if this ctx_split_map is used after create_split_backend_buffers is called
 
             return ctx;
         }
-        return it->second;
+        return it->second.get();
     }
 
     // public so that it can be processed by the backend storage allocator
-    std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> ctx_split_map;
+    std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context_ptr> ctx_split_map;
 
 private:
     struct TensorInfo {
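The change above makes ctx_split_map the owner of its contexts (ggml_context_ptr is ggml's unique_ptr alias whose deleter frees the context), while lookups keep returning raw, non-owning pointers through .get(). A minimal, self-contained sketch of that ownership pattern, using hypothetical stand-in types (dummy_ctx, ctx_for) instead of the real ggml handles:

#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <utility>

// Stand-ins for the real ggml handles (ggml_context, ggml_backend_buffer_type_t).
struct dummy_ctx { int id; };
using dummy_ctx_ptr = std::unique_ptr<dummy_ctx>;   // plays the role of ggml_context_ptr

// key = (buffer type, split index); the map is the single owner of every context
static std::map<std::pair<int, uint16_t>, dummy_ctx_ptr> ctx_split_map;

// Mirrors the header's lookup: create on first use, otherwise borrow the existing context.
static dummy_ctx * ctx_for(int buft, uint16_t split_idx) {
    const auto key = std::make_pair(buft, split_idx);
    auto it = ctx_split_map.find(key);
    if (it == ctx_split_map.end()) {
        auto owned = std::make_unique<dummy_ctx>();
        owned->id = buft*100 + split_idx;
        dummy_ctx * raw = owned.get();
        ctx_split_map[key] = std::move(owned);   // map takes ownership
        return raw;                              // caller only borrows
    }
    return it->second.get();                     // existing entry: hand out a borrowed pointer
}

int main() {
    dummy_ctx * a = ctx_for(1, 0);
    dummy_ctx * b = ctx_for(1, 0);               // same key -> same context
    std::printf("same context: %s\n", a == b ? "yes" : "no");
    // everything is freed automatically when ctx_split_map is destroyed
}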

src/llama-model.cpp

Lines changed: 14 additions & 11 deletions
@@ -2284,12 +2284,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
-    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
-    struct ggml_backend_buft_comparator {
-        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
-            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
-        }
-    };
     std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
 
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
@@ -6354,14 +6348,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 }
 
 bool llama_model::create_split_backend_buffers(
-        const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+        const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context_ptr> & ctx_split_map,
         llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
     // Extract contexts for the given split index from ctx_split_map into a new map
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+    for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
+        const auto & [buft_split_idx, ctx_ptr] = *it;
         const auto & [buft, split_idx] = buft_split_idx;
         if (split_idx == idx) {
-            ctx_map[buft] = ctx;
+            // Move the context from ctx_split_map to ctx_map
+            ctx_map[buft] = std::move(it->second);
+            // Remove from ctx_split_map since ownership has been transferred
+            it = ctx_split_map.erase(it);
+        } else {
+            ++it;
         }
     }
 
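The rewritten loop above turns what used to be a pointer copy into an ownership transfer: entries matching the requested split index are moved into the per-split ctx_map and erased from ctx_split_map in the same pass, so the source map never holds a moved-from pointer. A small stand-alone sketch of the move-and-erase idiom, with placeholder types (ctx_ptr over std::string) rather than the real ggml ones:

#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <utility>

using ctx_ptr = std::unique_ptr<std::string>;   // stand-in for ggml_context_ptr

int main() {
    // key = (buffer type id, split file index) -> owning context pointer
    std::map<std::pair<int, uint16_t>, ctx_ptr> ctx_split_map;
    ctx_split_map[{1, 0}] = std::make_unique<std::string>("cpu/split0");
    ctx_split_map[{1, 1}] = std::make_unique<std::string>("cpu/split1");
    ctx_split_map[{2, 1}] = std::make_unique<std::string>("gpu/split1");

    const uint16_t idx = 1;                      // split we are about to allocate buffers for
    std::map<int, ctx_ptr> ctx_map;              // per-split map handed to buffer creation

    // Move ownership of every context belonging to split `idx` and erase the source entry;
    // map::erase returns the next valid iterator, so the loop stays well-defined.
    for (auto it = ctx_split_map.begin(); it != ctx_split_map.end(); ) {
        const auto & [buft, split_idx] = it->first;
        if (split_idx == idx) {
            ctx_map[buft] = std::move(it->second);
            it = ctx_split_map.erase(it);
        } else {
            ++it;
        }
    }

    std::printf("moved %zu contexts, %zu left for other splits\n",
                ctx_map.size(), ctx_split_map.size());
}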
@@ -6370,12 +6370,15 @@ bool llama_model::create_split_backend_buffers(
     constexpr bool do_print_backend_buffers_info = false;
     const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
                                                           n_gpu_layers, do_print_backend_buffers_info);
+
+    // Note: create_backend_buffers moves the contexts into ctxs_bufs, taking ownership
+    // The contexts in ctx_map are now empty after the move, which is expected
 
     return creation_success;
 }
 
 bool llama_model::create_backend_buffers(std::size_t size_data,
-        const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+        std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> & ctx_map,
         llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
         const int32_t n_gpu_layers, bool do_print_backend_buffers_info) {
     // create the backend buffers
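Per the note added above, ctx_map is now consumed rather than just read: create_backend_buffers takes it by non-const reference and moves each context into the model's own storage (ctxs_bufs), leaving the caller's entries empty after the call. A hedged sketch of a sink function with that shape, using hypothetical placeholder types and names (create_buffers, owned_ctxs), not the real signature:

#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using ctx_ptr = std::unique_ptr<std::string>;    // stand-in for ggml_context_ptr

// The callee takes the map by non-const reference and moves the contexts into
// storage it owns; afterwards the caller's map still has its keys, but the
// mapped pointers are null (moved-from), which is the expected state.
static bool create_buffers(std::map<int, ctx_ptr> & ctx_map,
                           std::vector<ctx_ptr> & owned_ctxs) {
    for (auto & [buft, ctx] : ctx_map) {
        (void) buft;                             // a real version would allocate a buffer per buft
        owned_ctxs.push_back(std::move(ctx));    // ownership moves to the long-lived container
    }
    return true;
}

int main() {
    std::map<int, ctx_ptr> ctx_map;
    ctx_map[0] = std::make_unique<std::string>("ctx for CPU");
    ctx_map[1] = std::make_unique<std::string>("ctx for GPU");

    std::vector<ctx_ptr> owned_ctxs;             // plays the role of ctxs_bufs
    create_buffers(ctx_map, owned_ctxs);

    std::printf("owned: %zu, caller entries now empty: %s\n",
                owned_ctxs.size(), ctx_map[0] == nullptr ? "yes" : "no");
}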

src/llama-model.h

Lines changed: 10 additions & 2 deletions
@@ -8,6 +8,7 @@
 #include "llama-vocab.h"
 
 #include <cstdint>
+#include <cstring>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -408,6 +409,13 @@ struct llama_layer {
     struct llama_layer_nextn nextn;
 };
 
+// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+struct ggml_backend_buft_comparator {
+    bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+        return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+    }
+};
+
 struct llama_model {
     llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;
@@ -475,13 +483,13 @@ struct llama_model {
 
     /// @brief Create backend buffers for all tensors
     bool create_backend_buffers(std::size_t size_data,
-            const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
+            std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> & ctx_map,
             llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers,
             bool do_print_backend_buffers_info = true);
 
     /// @brief Create backend buffers for tensors on a split file idenfified by `idx`. Removes the split from the map.
     bool create_split_backend_buffers(
-            uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+            uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context_ptr> & ctx_split_map,
             llama_model_loader & ml, bool use_mmap_buffer, bool use_mlock, int32_t n_gpu_layers);
 
     void print_backend_buffers_info(int32_t n_gpu_layers);
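The comparator relocated to this header orders map keys by the buffer type's name (ggml_backend_buft_name) instead of by pointer value, so iteration over ctx_map is deterministic across runs. A minimal illustration of a name-ordered std::map, using a hypothetical stand-in handle (dummy_buft) in place of ggml_backend_buffer_type_t:

#include <cstdio>
#include <cstring>
#include <map>

// Stand-in for ggml_backend_buffer_type_t: an opaque handle whose address is not
// stable across runs, so the map orders entries by the backend's name instead.
struct dummy_buft { const char * name; };
static const char * dummy_buft_name(const dummy_buft * b) { return b->name; }

struct buft_name_comparator {
    bool operator()(const dummy_buft * lhs, const dummy_buft * rhs) const {
        return std::strcmp(dummy_buft_name(lhs), dummy_buft_name(rhs)) < 0;
    }
};

int main() {
    static dummy_buft cuda {"CUDA0"};
    static dummy_buft cpu  {"CPU"};
    static dummy_buft metal{"Metal"};

    std::map<const dummy_buft *, int, buft_name_comparator> ctx_map;
    ctx_map[&cuda]  = 1;
    ctx_map[&cpu]   = 2;
    ctx_map[&metal] = 3;

    // Iterates as CPU, CUDA0, Metal regardless of the handles' numeric addresses.
    for (const auto & [buft, value] : ctx_map) {
        std::printf("%s -> %d\n", dummy_buft_name(buft), value);
    }
}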
