@@ -421,11 +421,8 @@ struct llama_model::impl {
421421 llama_mlocks mlock_bufs;
422422 llama_mlocks mlock_mmaps;
423423
424- // contexts where the model tensors metadata is stored
425- std::vector<ggml_context_ptr> ctxs;
426-
427- // the model memory buffers for the tensor data
428- std::vector<ggml_backend_buffer_ptr> bufs;
424+ // contexts where the model tensors metadata is stored as well as the corresponding buffers:
425+ std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
429426
430427 buft_list_t cpu_buft_list;
431428 std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
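
The hunk above replaces the parallel vectors ctxs and bufs with one vector of (context, buffer) pairs, so each tensor-metadata context and the backend buffer holding its data are kept, iterated, and freed together. Below is a minimal standalone sketch of that paired-ownership pattern; the struct and alias names are placeholders standing in for the real ggml_context_ptr / ggml_backend_buffer_ptr smart-pointer wrappers, not the actual llama.cpp types.

    #include <cstddef>
    #include <cstdio>
    #include <memory>
    #include <utility>
    #include <vector>

    // Placeholder resource types; in llama.cpp these would be ggml_context and
    // ggml_backend_buffer, held through ggml_context_ptr / ggml_backend_buffer_ptr.
    struct context { const char * name; };
    struct buffer  { size_t size; };

    using context_ptr = std::unique_ptr<context>;
    using buffer_ptr  = std::unique_ptr<buffer>;

    int main() {
        // one vector of pairs instead of two parallel vectors:
        std::vector<std::pair<context_ptr, buffer_ptr>> ctxs_bufs;
        ctxs_bufs.emplace_back(std::make_unique<context>(context{"cpu"}),
                               std::make_unique<buffer>(buffer{1024}));

        // structured bindings give convenient access to either half of a pair:
        for (const auto & [ctx, buf] : ctxs_bufs) {
            std::printf("%s: %zu bytes\n", ctx->name, buf->size);
        }
    }

Keeping the two objects in one element also removes the possibility of the two vectors drifting out of sync, which is what the removed pimpl->bufs bookkeeping below had to guard against.
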
@@ -2182,7 +2179,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
21822179 max_n_tensors += n_layer*2; // duplicated rope freq tensors
21832180 const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
21842181
2185- std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
2182+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2183+ struct ggml_backend_buft_comparator {
2184+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2185+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2186+ }
2187+ };
2188+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2189+
21862190 auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
21872191 auto it = ctx_map.find(buft);
21882192 if (it == ctx_map.end()) {
@@ -2197,12 +2201,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
21972201 throw std::runtime_error(format("failed to create ggml context"));
21982202 }
21992203
2200- ctx_map[buft] = ctx;
2201- pimpl->ctxs.emplace_back(ctx);
2204+ ctx_map.emplace(buft, ctx);
22022205
22032206 return ctx;
22042207 }
2205- return it->second;
2208+ return it->second.get();
22062209 };
22072210
22082211 const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
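
The comparator above exists because ggml_backend_buffer_type_t is an opaque handle: with the default std::map ordering the buffer types would be sorted by pointer value, which can vary from run to run, whereas ordering by name keeps the iteration (and therefore allocation and logging) order deterministic. A minimal sketch of the same idea follows; the handle type and name accessor are hypothetical stand-ins for ggml_backend_buffer_type_t and ggml_backend_buft_name.

    #include <cstdio>
    #include <cstring>
    #include <map>

    // Hypothetical opaque handle plus name accessor, standing in for
    // ggml_backend_buffer_type_t and ggml_backend_buft_name().
    struct backend_buft { const char * name; };
    using buft_t = const backend_buft *;

    static const char * buft_name(buft_t buft) { return buft->name; }

    // Order handles by their human-readable name so map iteration is stable
    // across runs, regardless of where the handles happen to live in memory.
    struct buft_comparator {
        bool operator()(const buft_t & lhs, const buft_t & rhs) const {
            return std::strcmp(buft_name(lhs), buft_name(rhs)) < 0;
        }
    };

    int main() {
        static const backend_buft cpu{"CPU"}, metal{"Metal"};
        std::map<buft_t, int, buft_comparator> ctx_map;
        ctx_map.emplace(&metal, 2);
        ctx_map.emplace(&cpu,   1);
        for (const auto & [buft, v] : ctx_map) {
            std::printf("%s -> %d\n", buft_name(buft), v); // CPU first, then Metal
        }
    }

Note that comparing the two const char * results directly with < would compare pointer addresses rather than the names themselves, which is why the comparison goes through strcmp.
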
@@ -6037,16 +6040,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
60376040 pimpl->mappings.reserve(ml.mappings.size());
60386041
60396042 // create the backend buffers
6040- std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
6041- ctx_bufs.reserve(ctx_map.size());
6043+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
6044+ ctx_buf_maps.reserve(ctx_map.size());
60426045
60436046 // Ensure we have enough capacity for the maximum backend buffer we will potentially create
60446047 const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6045- pimpl->bufs.reserve(n_max_backend_buffer);
6048+ pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
60466049
6047- for (auto & it : ctx_map) {
6048- ggml_backend_buffer_type_t buft = it.first;
6049- ggml_context * ctx = it.second;
6050+ for (auto & [buft, ctx_ptr] : ctx_map) {
6051+ ggml_context * ctx = ctx_ptr.get();
60506052
60516053 // skip contexts without tensors
60526054 if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6070,6 +6072,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
60706072 bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
60716073 bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
60726074
6075+ ggml_backend_buffer_t buf = nullptr;
60736076 if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
60746077 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
60756078 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6082,20 +6085,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
60826085 continue;
60836086 }
60846087 const size_t max_size = ggml_get_max_tensor_size(ctx);
6085- ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
6088+ buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
60866089 if (buf == nullptr) {
60876090 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
60886091 }
6089- pimpl->bufs.emplace_back(buf);
60906092 buf_map.emplace(idx, buf);
60916093 }
60926094 }
60936095 else {
6094- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
6096+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
60956097 if (buf == nullptr) {
60966098 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
60976099 }
6098- pimpl->bufs.emplace_back(buf);
60996100 if (use_mlock && ggml_backend_buffer_is_host(buf)) {
61006101 pimpl->mlock_bufs.emplace_back(new llama_mlock);
61016102 auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6106,18 +6107,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
61066107 buf_map.emplace(idx, buf);
61076108 }
61086109 }
6109-
6110- if (pimpl->bufs.empty()) {
6111- throw std::runtime_error("failed to allocate buffer");
6112- }
6110+ pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
61136111
61146112 for (auto & buf : buf_map) {
61156113 // indicate that this buffer contains weights
61166114 // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
61176115 ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
61186116 }
61196117
6120- ctx_bufs.emplace_back(ctx, buf_map);
6118+ ctx_buf_maps.emplace_back(ctx, buf_map);
61216119 }
61226120
61236121 if (llama_supports_gpu_offload()) {
@@ -6135,22 +6133,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
61356133 }
61366134
61376135 // print memory requirements per buffer type
6138- for (auto & buf : pimpl->bufs) {
6136+ for (auto & [_, buf] : pimpl->ctxs_bufs) {
61396137 LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
61406138 }
61416139
61426140 // populate tensors_by_name
6143- for (auto & ctx : pimpl->ctxs) {
6141+ for (auto & [ctx, _] : pimpl->ctxs_bufs) {
61446142 for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
61456143 tensors_by_name.emplace_back(ggml_get_name(cur), cur);
61466144 }
61476145 }
61486146
61496147 // load tensor data
6150- for (auto & it : ctx_bufs) {
6151- ggml_context * ctx = it.first;
6152- auto & bufs = it.second;
6153- if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
6148+ for (auto & [ctx, buf_map] : ctx_buf_maps) {
6149+ if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
61546150 return false;
61556151 }
61566152 }
@@ -6190,8 +6186,8 @@ size_t llama_model::n_devices() const {
61906186
61916187std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
61926188 std::map<ggml_backend_buffer_type_t, size_t> ret;
6193- for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
6194- ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
6189+ for (const auto & [_, buf] : pimpl->ctxs_bufs) {
6190+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
61956191 }
61966192 return ret;
61976193}
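
memory_breakdown() keeps the same aggregation as before, now sourced from the pair vector: each buffer's size is added to a running total per buffer type. A compact sketch of that accumulation pattern, with string keys and literal sizes standing in for the real ggml_backend_buffer_type_t handles and buffer objects:

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>

    int main() {
        // Hypothetical (type name, size) pairs standing in for the per-buffer
        // data that memory_breakdown() reads out of pimpl->ctxs_bufs.
        const std::pair<const char *, size_t> bufs[] = {
            {"CPU", 512}, {"Metal", 4096}, {"CPU", 256},
        };

        // Sum sizes per buffer type; operator[] value-initializes missing entries to 0.
        std::map<std::string, size_t> ret;
        for (const auto & [type, size] : bufs) {
            ret[type] += size;
        }

        for (const auto & [type, total] : ret) {
            std::printf("%s: %zu bytes\n", type.c_str(), total); // CPU: 768, Metal: 4096
        }
    }
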