17 | 17 | #include <cassert>
18 | 18 | #include <cmath>
19 | 19 | #include <cfloat>
| 20 | +#include <cstdint>
20 | 21 | #include <cstring>
21 | 22 | #include <cmath>
22 | 23 | #include <functional>
23 | | -#include <map>
24 | 24 | #include <regex>
25 | 25 | #include <sstream>
26 | 26 | #include <stdexcept>
@@ -1589,6 +1589,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1589 | 1589 |     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
1590 | 1590 |
1591 | 1591 |     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
| 1592 | +    std::set<uint16_t> created_backend_buffer_splits;
| 1593 | +
1592 | 1594 |     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
1593 | 1595 |         auto it = ctx_map.find(buft);
1594 | 1596 |         if (it == ctx_map.end()) {
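Context for the hunk above: `ctx_for_buft` is the existing lazy lookup that creates one `ggml_context` per backend buffer type on first use, and the new `created_backend_buffer_splits` set records which split indices have already had their backend buffers allocated. A minimal stand-in sketch of that create-on-first-lookup pattern, with plain types in place of `ggml_backend_buffer_type_t` and `ggml_context` (illustration only, not the upstream lambda body):

```cpp
// Illustration only: the create-on-first-lookup pattern behind ctx_for_buft,
// shown with stand-in types instead of ggml_backend_buffer_type_t / ggml_context.
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Ctx { std::string name; };  // stand-in for ggml_context

int main() {
    std::map<std::string, std::unique_ptr<Ctx>> ctx_map;  // stand-in for buffer type -> context

    auto ctx_for_buft = [&](const std::string & buft) -> Ctx * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            auto ctx = std::make_unique<Ctx>(Ctx{buft});   // create lazily on first use
            Ctx * raw = ctx.get();
            ctx_map.emplace(buft, std::move(ctx));
            return raw;
        }
        return it->second.get();
    };

    ctx_for_buft("cpu");
    ctx_for_buft("cuda0");
    ctx_for_buft("cpu");  // second lookup reuses the existing context
    std::cout << "contexts created: " << ctx_map.size() << "\n";  // prints 2
}
```

Keeping one context per buffer type is what later lets the loader group tensors per backend when the buffers are allocated.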
@@ -1643,9 +1645,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1643 | 1645 |     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
1644 | 1646 |
1645 | 1647 |     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
1646 | | -        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
1647 | | -
| 1648 | +        const std::string & tensor_name = tn.str();
| 1649 | +        ggml_tensor * t_meta = ml.get_tensor_meta(tensor_name.c_str());
| 1650 | +        std::optional<uint16_t> split_idx;
| 1651 | +        if (!t_meta && (flags & TENSOR_NOT_REQUIRED) &&
| 1652 | +            IncrementalSplitsTensorLoad::tensor_ignored(ml.incremental_splits_tensor_load, tensor_name.c_str())) {
| 1653 | +            return nullptr;
| 1654 | +        }
| 1655 | +        if (ml.incremental_splits_tensor_load.has_value()) {
| 1656 | +            split_idx = ml.incremental_splits_tensor_load->load_tensor_metadata(ml, tn.str().c_str(), &t_meta);
| 1657 | +            LLAMA_LOG_CMAKE_DEBUG("split idx for tensor %s: %d\n", tn.str().c_str(), *split_idx);
| 1658 | +        }
1648 | 1659 |         if (!t_meta) {
| 1660 | +            LLAMA_LOG_ERROR("%s: missing tensor %s\n", __func__, tn.str().c_str());
1649 | 1661 |             if (flags & TENSOR_NOT_REQUIRED) {
1650 | 1662 |                 return nullptr;
1651 | 1663 |             }
@@ -1758,16 +1770,33 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1758 | 1770 |             }
1759 | 1771 |         }
1760 | 1772 |
1761 | | -        ggml_context * ctx = ctx_for_buft(buft);
| 1773 | +        ggml_context * ctx =
| 1774 | +            split_idx.has_value() ?
| 1775 | +            ml.incremental_splits_tensor_load->get_model_ctx_for_split_buft(buft, *split_idx, pimpl.get()) :
| 1776 | +            ctx_for_buft(buft);
1762 | 1777 |
1763 | 1778 |         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
1764 | 1779 |         if (flags & TENSOR_DUPLICATED) {
1765 | | -            ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
| 1780 | +            auto tn_str = tn.str();
| 1781 | +            ggml_tensor * t = ggml_get_tensor(ctx, tn_str.c_str());
1766 | 1782 |             if (t) {
1767 | 1783 |                 return t;
1768 | 1784 |             }
| 1785 | +            LLAMA_LOG_WARN("%s: duplicated tensor %s not found in an existing context\n", __func__, tn_str.c_str());
1769 | 1786 |         }
1770 | | -        return ml.create_tensor(ctx, tn, ne, flags);
| 1787 | +        struct ggml_tensor * tensor = ml.create_tensor(ctx, tn, ne, flags);
| 1788 | +
| 1789 | +        if (split_idx.has_value() && ml.incremental_splits_tensor_load->all_tensors_are_loaded(*split_idx)) {
| 1790 | +            // All tensors of this split have been created: upload it right now.
| 1791 | +            if (!create_split_backend_buffers(*split_idx, ml.incremental_splits_tensor_load->ctx_split_map, ml,
| 1792 | +                                              use_mmap_buffer, use_mlock, n_gpu_layers)) {
| 1793 | +                throw std::runtime_error("Failed to create incremental backend buffers");
| 1794 | +            }
| 1795 | +            IncrementalSplitsTensorLoad::release_split(ml, *split_idx);
| 1796 | +            created_backend_buffer_splits.insert(*split_idx);
| 1797 | +        }
| 1798 | +
| 1799 | +        return tensor;
1771 | 1800 |     };
1772 | 1801 |
1773 | 1802 |     layers.resize(n_layer);
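The hunk above is the heart of the change: each time `create_tensor` materializes a tensor, the loader checks whether that tensor's split is now complete (`all_tensors_are_loaded`), and if so it allocates and fills that split's backend buffers immediately (`create_split_backend_buffers`), releases the split (`release_split`), and records the split index, instead of waiting for the whole model to be described. A compact stand-alone sketch of that bookkeeping follows; the split sizes, tensor names, and the `upload()` stand-in are invented for the example and are not part of the PR:

```cpp
// Illustration of the "upload a split as soon as its last tensor is created" idea.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Invented inventory: how many tensors each split contains.
    std::map<uint16_t, int> tensors_per_split = {{0, 2}, {1, 3}};
    std::map<uint16_t, int> created_so_far;
    std::set<uint16_t> uploaded_splits;

    // Invented creation order: (tensor name, split index) pairs.
    std::vector<std::pair<std::string, uint16_t>> creation_order = {
        {"tok_embd", 0}, {"blk.0.attn_q", 1}, {"output_norm", 0},
        {"blk.0.attn_k", 1}, {"blk.0.attn_v", 1},
    };

    auto upload = [&](uint16_t split) {        // stand-in for create_split_backend_buffers()
        std::cout << "uploading split " << split << "\n";
        uploaded_splits.insert(split);         // mirrors created_backend_buffer_splits
    };

    for (const auto & [name, split] : creation_order) {
        ++created_so_far[split];               // a tensor of this split was just created
        if (created_so_far[split] == tensors_per_split[split]) {
            upload(split);                     // last tensor of the split: upload right away
        }
    }
    // Prints "uploading split 0" after output_norm and "uploading split 1" after blk.0.attn_v.
}
```

The payoff of this ordering is that a split's file data can be consumed and released as soon as its last tensor is seen, rather than held until the end of loading.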
@@ -4285,12 +4314,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4285 | 4314 |
4286 | 4315 |     ml.done_getting_tensors();
4287 | 4316 |
| 4317 | +    if (ml.incremental_splits_tensor_load.has_value()) {
| 4318 | +        // Already did incremental load.
| 4319 | +        print_backend_buffers_info(n_gpu_layers);
| 4320 | +        return true;
| 4321 | +    }
| 4322 | +
4288 | 4323 |     ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
4289 | 4324 |     pimpl->mappings.reserve(ml.mappings.size());
4290 | 4325 |
4291 | 4326 |     return create_backend_buffers(ml.size_data, ctx_map, ml, use_mmap_buffer, use_mlock, n_gpu_layers);
4292 | 4327 | }
4293 | 4328 |
| 4329 | +bool llama_model::create_split_backend_buffers(
| 4330 | +        const uint16_t idx, std::map<std::pair<ggml_backend_buffer_type_t, uint16_t>, ggml_context *> & ctx_split_map,
| 4331 | +        llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock, const int32_t n_gpu_layers) {
| 4332 | +    // Extract contexts for the given split index from ctx_split_map into a new map
| 4333 | +    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
| 4334 | +    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
| 4335 | +        const auto & [buft, split_idx] = buft_split_idx;
| 4336 | +        if (split_idx == idx) {
| 4337 | +            ctx_map[buft] = ctx;
| 4338 | +        }
| 4339 | +    }
| 4340 | +
| 4341 | +    const std::size_t split_data_size = ml.incremental_splits_tensor_load->get_split_data_size(idx);
| 4342 | +    LLAMA_LOG_CMAKE_DEBUG("%s: creating backend buffers for split %d with size %zu\n", __func__, idx, split_data_size);
| 4343 | +    constexpr bool do_print_backend_buffers_info = false;
| 4344 | +    const bool creation_success = create_backend_buffers(split_data_size, ctx_map, ml, use_mmap_buffer, use_mlock,
| 4345 | +                                                         n_gpu_layers, do_print_backend_buffers_info);
| 4346 | +
| 4347 | +    return creation_success;
| 4348 | +}
| 4349 | +
4294 | 4350 | bool llama_model::create_backend_buffers(std::size_t size_data,
4295 | 4351 |         const std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
4296 | 4352 |         llama_model_loader & ml, const bool use_mmap_buffer, const bool use_mlock,
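`create_split_backend_buffers` reuses the existing `create_backend_buffers` path by first narrowing the pair-keyed `ctx_split_map` down to the contexts belonging to one split, then asking the incremental loader for that split's data size. The narrowing step in isolation, with stand-in types (illustration only; the names "cpu"/"cuda0" and the integer contexts are invented):

```cpp
// Filtering a map keyed by (buffer type, split index) down to one split's contexts.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

int main() {
    using buft_t = std::string;   // stand-in for ggml_backend_buffer_type_t
    using ctx_t  = int;           // stand-in for ggml_context *

    std::map<std::pair<buft_t, uint16_t>, ctx_t> ctx_split_map = {
        {{"cpu",   0}, 100}, {{"cuda0", 0}, 101},
        {{"cpu",   1}, 200}, {{"cuda0", 1}, 201},
    };

    const uint16_t idx = 1;          // the split whose buffers are about to be created
    std::map<buft_t, ctx_t> ctx_map; // the per-buffer-type map create_backend_buffers() expects
    for (const auto & [buft_split_idx, ctx] : ctx_split_map) {
        const auto & [buft, split_idx] = buft_split_idx;
        if (split_idx == idx) {
            ctx_map[buft] = ctx;     // keep only this split's context per buffer type
        }
    }

    for (const auto & [buft, ctx] : ctx_map) {
        std::cout << buft << " -> " << ctx << "\n";  // prints cpu -> 200, cuda0 -> 201
    }
}
```

Passing `do_print_backend_buffers_info = false` for the per-split calls presumably suppresses the buffer summary each time, since the incremental path prints it once via `print_backend_buffers_info` at the end of `load_tensors` (new line 4319 above).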