@@ -17,10 +17,10 @@
 #include <cassert>
 #include <cmath>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
 #include <cmath>
 #include <functional>
-#include <map>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
@@ -1589,6 +1589,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -1643,9 +1644,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
 
     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
-        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
+        const std::string & tensor_name = tn.str();
+        ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str());
+        std::optional<uint16_t> split_idx;
+        if (!t_meta && (flags & TENSOR_NOT_REQUIRED) &&
+            IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) {
+            return nullptr;
+        }
+        if (ml.incremental_splits_tensor_load.has_value()) {
+            split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tn.str().c_str(), &t_meta);
+            LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tn.str().c_str(), *split_idx);
+        }
         if (!t_meta) {
+            LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__, tn.str().c_str());
             if (flags & TENSOR_NOT_REQUIRED) {
                 return nullptr;
             }
@@ -1758,7 +1769,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }
 
-        ggml_context * ctx = ctx_for_buft(buft);
+        ggml_context * ctx =
+            split_idx.has_value() ?
+                ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) :
+                ctx_for_buft(buft);
 
         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
         if (flags & TENSOR_DUPLICATED) {
@@ -1767,7 +1781,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 return t;
             }
         }
-        return ml.create_tensor(ctx, tn, ne, flags);
+        struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags);
+
+        if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx)) {
+            // Upload right now.
+            if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml,
+                                              use_mmap_buffer, use_mlock, n_gpu_layers)) {
+                throw std::runtime_error("Failed to create incremental backend buffers");
+            }
+            IncrementalSplitsTensorLoad::release_split(ml, *split_idx);
+        }
+
+        return tensor;
     };
 
     layers.resize(n_layer);
@@ -4285,12 +4310,49 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     ml.done_getting_tensors();
 
+    if (ml.incremental_splits_tensor_load.has_value()) {
+        // Already did incremental load.
+        print_backend_buffers_info(n_gpu_layers);
+        return true;
+    }
+
     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
     pimpl->mappings.reserve(ml.mappings.size());
 
     return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
 }
 
+bool llama_model::create_split_backend_buffers(
+    const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
+    llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
+    // Extract contexts for the given split index from ctx_split_map into a new map
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
+        const auto & [buft, split_idx] = buft_split_idx;
+        if (split_idx == idx) {
+            ctx_map[buft] = ctx;
+        }
+    }
+
+    const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx);
+    LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size);
+    constexpr bool do_print_backend_buffers_info = false;
+    const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
+                                                         n_gpu_layers, do_print_backend_buffers_info);
+
+    if (creation_success) {
+        for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
+            if (it->first.second == idx) {
+                it = ctx_split_map.erase(it);
+            } else {
+                ++it;
+            }
+        }
+    }
+
+    return creation_success;
+}
+
 bool llama_model::create_backend_buffers(std::size_t size_data,
                                          const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
                                          llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
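
Taken together, these hunks wire incremental per-split loading into llama_model::load_tensors: create_tensor resolves each tensor to a split index, tensor contexts are keyed by (buffer type, split index), and once every tensor of a split has been created, create_split_backend_buffers allocates that split's backend buffers and the split is released. The standalone sketch below illustrates only that (buffer type, split index) bookkeeping pattern; BufferType, Context, flush_split and the always-succeeding allocation stand-in are illustrative assumptions, not llama.cpp/ggml APIs.

// Minimal sketch of the per-split context bookkeeping (assumed names, not llama.cpp API).
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>

using BufferType = int;      // placeholder for ggml_backend_buffer_type_t
struct Context { int id; };  // placeholder for ggml_context

// Contexts keyed by (buffer type, split index), mirroring ctx_split_map above.
using SplitCtxMap = std::map<std::pair<BufferType, uint16_t>, Context *>;

// Gather the contexts of one finished split, "allocate" its buffers, then erase
// those entries so the split's resources can be released early.
bool flush_split(SplitCtxMap & ctx_split_map, uint16_t split_idx) {
    std::map<BufferType, Context *> ctx_map;
    for (const auto & [key, ctx] : ctx_split_map) {
        if (key.second == split_idx) {
            ctx_map[key.first] = ctx;
        }
    }

    const bool ok = !ctx_map.empty();  // stand-in for create_backend_buffers(...)

    if (ok) {
        for (auto it = ctx_split_map.begin(); it != ctx_split_map.end();) {
            if (it->first.second == split_idx) {
                it = ctx_split_map.erase(it);
            } else {
                ++it;
            }
        }
    }
    return ok;
}

int main() {
    Context c0{0}, c1{1};
    SplitCtxMap m;
    m[{0, 0}] = &c0;  // buffer type 0, split 0
    m[{0, 1}] = &c1;  // buffer type 0, split 1

    std::printf("flush split 0: %s, remaining entries: %zu\n",
                flush_split(m, 0) ? "ok" : "failed", m.size());
    return 0;
}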