 #include <cassert>
 #include <cmath>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
 #include <cmath>
 #include <functional>
-#include <map>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
@@ -1589,6 +1589,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    std::set<uint16_t> created_backend_buffer_splits;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
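The two declarations above drive the incremental path: ctx_map lazily creates one ggml_context per backend buffer type (via the ctx_for_buft helper), while created_backend_buffer_splits records which GGUF splits already had their backend buffers allocated. Below is a minimal, self-contained sketch of that get-or-create pattern using placeholder types (a std::string for the buffer type, a plain struct for the context); it illustrates the idea and is not the llama.cpp/ggml API.

// Minimal sketch of the get-or-create pattern used by ctx_for_buft,
// with placeholder types (NOT the llama.cpp/ggml API).
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Context {                 // stand-in for ggml_context
    std::string buft_name;
};

int main() {
    // one context per backend buffer type, created on first use
    std::map<std::string, std::unique_ptr<Context>> ctx_map;

    auto ctx_for_buft = [&](const std::string & buft) -> Context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            it = ctx_map.emplace(buft, std::make_unique<Context>(Context{buft})).first;
        }
        return it->second.get();
    };

    Context * a = ctx_for_buft("CPU");
    Context * b = ctx_for_buft("CPU");   // same object, nothing re-created
    std::cout << (a == b) << " " << ctx_map.size() << "\n";   // prints: 1 1
}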
@@ -1643,9 +1645,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;

     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
-        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
+        const std::string & tensor_name = tn.str();
+        ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str());
+        std::optional<uint16_t> split_idx;
+        if (!t_meta && (flags & TENSOR_NOT_REQUIRED) &&
+            IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) {
+            return nullptr;
+        }
+        if (ml.incremental_splits_tensor_load.has_value()) {
+            split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tensor_name.c_str(), &t_meta);
+            LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tensor_name.c_str(), *split_idx);
+        }
         if (!t_meta) {
+            LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__, tensor_name.c_str());
             if (flags & TENSOR_NOT_REQUIRED) {
                 return nullptr;
             }
@@ -1758,7 +1770,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }

-        ggml_context * ctx = ctx_for_buft(buft);
+        ggml_context * ctx =
+            split_idx.has_value() ?
+            ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) :
+            ctx_for_buft(buft);

         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
         if (flags & TENSOR_DUPLICATED) {
@@ -1767,7 +1782,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 return t;
             }
         }
-        return ml.create_tensor(ctx, tn, ne, flags);
+        struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags);
+
+        if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx) &&
+            created_backend_buffer_splits.find(*split_idx) == created_backend_buffer_splits.end()) {
+            // All tensors of this split have been created: build its backend buffers and upload right now.
+            if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml,
+                                              use_mmap_buffer, use_mlock, n_gpu_layers)) {
+                throw std::runtime_error("Failed to create incremental backend buffers");
+            }
+            IncrementalSplitsTensorLoad::release_split(ml, *split_idx);
+            created_backend_buffer_splits.insert(*split_idx);
+        }
+
+        return tensor;
     };

     layers.resize(n_layer);
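With this change, create_tensor knows which split each tensor comes from; as soon as the last tensor of a split has been created, that split's backend buffers are allocated and the split is released, instead of waiting until the whole model has been walked. A minimal sketch of that bookkeeping, with hypothetical names and counts rather than the llama.cpp API, is:

// Minimal sketch: flush a split as soon as its last tensor arrives, and use a
// std::set to make sure each split is flushed only once (hypothetical example,
// NOT the llama.cpp API).
#include <cstdint>
#include <iostream>
#include <map>
#include <set>

int main() {
    // tensors still expected per split (the real code gets this from the GGUF split metadata)
    std::map<uint16_t, int> remaining = { {0, 2}, {1, 1} };
    std::set<uint16_t>      flushed;   // splits whose backend buffers were already created

    auto on_tensor_created = [&](uint16_t split_idx) {
        if (--remaining[split_idx] == 0 && flushed.find(split_idx) == flushed.end()) {
            // real code: create_split_backend_buffers(...) then release_split(...)
            std::cout << "flushing split " << split_idx << "\n";
            flushed.insert(split_idx);
        }
    };

    on_tensor_created(0);   // split 0 still has one tensor left
    on_tensor_created(1);   // prints: flushing split 1
    on_tensor_created(0);   // prints: flushing split 0
}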
@@ -4285,12 +4313,49 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     ml.done_getting_tensors();

+    if (ml.incremental_splits_tensor_load.has_value()) {
+        // Backend buffers were already created incrementally, one split at a time.
+        print_backend_buffers_info(n_gpu_layers);
+        return true;
+    }
+
     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());

     return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
 }

+bool llama_model::create_split_backend_buffers(
+    const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+    llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
+    // Extract contexts for the given split index from ctx_split_map into a new map.
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
+        const auto & [buft, split_idx] = buft_split_idx;
+        if (split_idx == idx) {
+            ctx_map[buft] = ctx;
+        }
+    }
+
+    const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx);
+    LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size);
+    constexpr bool do_print_backend_buffers_info = false;
+    const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
+                                                         n_gpu_layers, do_print_backend_buffers_info);
+
+    if (creation_success) {
+        // Remove this split's entries so its buffers are not created again.
+        for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
+            if (it->first.second == idx) {
+                it = ctx_split_map.erase(it);
+            } else {
+                ++it;
+            }
+        }
+    }
+
+    return creation_success;
+}
+
 bool llama_model::create_backend_buffers(std::size_t size_data,
                                          const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
                                          llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
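create_split_backend_buffers reuses the whole-model create_backend_buffers path for a single split: it picks out of ctx_split_map (keyed by buffer type and split index) only the contexts that belong to the requested split, allocates buffers for that subset, and then erases the consumed entries so the split is never allocated twice. A self-contained sketch of that filter-then-erase pattern, with placeholder types instead of ggml handles, is:

// Minimal sketch of the filter-then-erase pattern over a map keyed by
// (buffer type, split index); placeholder types, NOT the llama.cpp/ggml API.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

int main() {
    using Buft = std::string;   // stand-in for ggml_backend_buffer_type_t
    using Ctx  = int;           // stand-in for ggml_context *

    std::map<std::pair<Buft, uint16_t>, Ctx> ctx_split_map = {
        {{"CPU", 0}, 10}, {{"CUDA0", 0}, 11}, {{"CPU", 1}, 12},
    };
    const uint16_t idx = 0;

    // 1. collect the contexts that belong to split `idx`, keyed by buffer type only
    std::map<Buft, Ctx> ctx_map;
    for (const auto & [key, ctx] : ctx_split_map) {
        if (key.second == idx) {
            ctx_map[key.first] = ctx;
        }
    }

    // 2. ... here the real code would create the backend buffers for ctx_map ...

    // 3. drop the consumed entries; map::erase returns the next valid iterator
    for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
        if (it->first.second == idx) {
            it = ctx_split_map.erase(it);
        } else {
            ++it;
        }
    }

    std::cout << ctx_map.size() << " buffer types in split 0, "
              << ctx_split_map.size() << " entry left\n";   // prints: 2 buffer types in split 0, 1 entry left
}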