@@ -15,7 +15,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cfloat>
 #include <cstring>
 #include <cmath>
@@ -438,7 +437,7 @@ struct llama_model::impl {
     llama_mlocks mlock_mmaps;
 
     // contexts where the model tensors metadata is stored as well as the corresponding buffers:
-    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
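
The pair's second member changes from a single `ggml_backend_buffer_ptr` to a vector, so one context can now own several backend buffers (e.g. one mmap-backed buffer per model file, as the hunks below set up). A minimal sketch of the new ownership shape, using hypothetical `dummy_*` stand-ins for the opaque ggml types (the real `ggml_context_ptr` / `ggml_backend_buffer_ptr` are smart-pointer wrappers with custom deleters):

```cpp
#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the opaque ggml types; the real wrappers
// are unique_ptr-style types whose deleters free the ggml objects.
struct dummy_context { int id; };
struct dummy_buffer  { size_t size; };

using context_ptr = std::unique_ptr<dummy_context>;
using buffer_ptr  = std::unique_ptr<dummy_buffer>;

int main() {
    // after this change: one context can own several backend buffers,
    // e.g. one mmap-backed buffer per model file
    std::vector<std::pair<context_ptr, std::vector<buffer_ptr>>> ctxs_bufs;

    std::vector<buffer_ptr> bufs;
    bufs.emplace_back(new dummy_buffer{512u << 20}); // file 0: 512 MiB
    bufs.emplace_back(new dummy_buffer{256u << 20}); // file 1: 256 MiB

    context_ptr ctx(new dummy_context{0});
    ctxs_bufs.emplace_back(std::move(ctx), std::move(bufs));

    // nested iteration, mirroring the logging loop further down
    for (const auto & [c, cbufs] : ctxs_bufs) {
        for (const auto & buf : cbufs) {
            std::printf("ctx %d: buffer %8.2f MiB\n",
                        c->id, buf->size / 1024.0 / 1024.0);
        }
    }
    return 0;
}
```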
@@ -6186,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-        ggml_backend_buffer_t buf = nullptr;
+        std::vector<ggml_backend_buffer_ptr> bufs;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6199,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
+                bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
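
In the mmap branch, `buf` is now a loop-local handle: one buffer is created per model file from its mapped region, recorded both in the owning `bufs` vector and in the per-file `buf_map` lookup. A minimal sketch of that dual bookkeeping, with a hypothetical `dummy_buffer` stand-in (the real map stores raw `ggml_backend_buffer_t` handles while ownership lives in the smart-pointer vector):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <vector>

struct dummy_buffer { size_t size; };            // stand-in for a backend buffer
using buffer_ptr = std::unique_ptr<dummy_buffer>;

int main() {
    std::vector<buffer_ptr>            bufs;     // owning: frees buffers on scope exit
    std::map<uint32_t, dummy_buffer *> buf_map;  // non-owning: file index -> raw handle

    const uint32_t n_files = 3;
    for (uint32_t idx = 0; idx < n_files; idx++) {
        // one buffer per mapped file; ownership goes to `bufs`,
        // `buf_map` only keeps the handle for loading that file's tensors
        dummy_buffer * buf = new dummy_buffer{64u << 20};
        bufs.emplace_back(buf);
        buf_map.emplace(idx, buf);
    }

    std::printf("%zu files, %zu owned buffers\n", buf_map.size(), bufs.size());
    return 0;
}
```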
@@ -6217,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
+            bufs.emplace_back(buf);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 buf_map.emplace(idx, buf);
             }
         }
-        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
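
A vector of `unique_ptr`-style wrappers is move-only, which is why the changed line must `std::move(bufs)` into `ctxs_bufs` rather than copy it; afterwards the function keeps working through the raw handles in `buf_map`. A tiny illustration of the constraint, assuming the wrapper behaves like `std::unique_ptr`:

```cpp
#include <memory>
#include <utility>
#include <vector>

using buffer_ptr = std::unique_ptr<int>; // stand-in for the move-only buffer wrapper

int main() {
    std::vector<buffer_ptr> bufs;
    bufs.emplace_back(new int(42));

    std::vector<std::pair<int, std::vector<buffer_ptr>>> ctxs_bufs;
    // ctxs_bufs.emplace_back(0, bufs);         // would not compile: copies a move-only vector
    ctxs_bufs.emplace_back(0, std::move(bufs)); // OK: ownership transferred
    return 0;
}
```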
@@ -6247,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & [_, buf] : pimpl->ctxs_bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (auto & buf : bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
     }
 
     // populate tensors_by_name
@@ -6300,8 +6304,10 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (const auto & buf : bufs) {
+            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
     return ret;
 }
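
`memory_breakdown()` now sums over every buffer of every context; the aggregation works because `std::map::operator[]` value-initializes a missing total to zero before `+=`. A toy version with string keys standing in for `ggml_backend_buffer_type_t`:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (buffer type, size in bytes) pairs, as if collected from ctxs_bufs
    std::vector<std::pair<std::string, size_t>> buffers = {
        {"CPU", 100}, {"CUDA0", 400}, {"CPU", 50},
    };

    // operator[] value-initializes the total to 0 on first access,
    // so += accumulates per buffer type
    std::map<std::string, size_t> ret;
    for (const auto & [type, size] : buffers) {
        ret[type] += size;
    }

    for (const auto & [type, total] : ret) {
        std::printf("%s: %zu bytes\n", type.c_str(), total); // CPU: 150, CUDA0: 400
    }
    return 0;
}
```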