|
2 | 2 | #include <atomic> |
3 | 3 | #include <chrono> |
4 | 4 | #include <cstdarg> |
| 5 | +#include <cstdint> |
5 | 6 | #include <fstream> |
6 | 7 | #include <functional> |
7 | 8 | #include <mutex> |
@@ -768,6 +769,99 @@ void ModelLoader::process_model_files(bool enable_mmap) { |
768 | 769 | LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f); |
769 | 770 | } |
770 | 771 |
|
| 772 | +std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors, |
| 773 | + std::set<std::string> ignore_tensors) |
| 774 | +{ |
| 775 | + process_model_files(true); |
| 776 | + |
| 777 | + std::vector<MmapTensorStore> result; |
| 778 | + uint64_t mapped_bytes = 0; |
| 779 | + size_t mapped_tensors = 0; |
| 780 | + |
| 781 | + LOG_DEBUG("memory-mapping tensors..."); |
| 782 | + |
| 783 | + int64_t t_start = ggml_time_ms(); |
| 784 | + |
| 785 | + for (const auto& fdata : file_data) { |
| 786 | + if (!fdata.mmapped) continue; |
| 787 | + |
| 788 | + const std::vector<TensorStorage>& file_tensors = fdata.tensors; |
| 789 | + std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped; |
| 790 | + |
| 791 | + uint8_t * mmap_data = const_cast<uint8_t*>(mmapped->data()); |
| 792 | + |
| 793 | + ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size()); |
| 794 | + if (!buf_mmap) { |
| 795 | + LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str()); |
| 796 | + continue; |
| 797 | + } |
| 798 | + ggml_backend_buffer_set_usage(buf_mmap, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); |
| 799 | + |
| 800 | + size_t file_mapped_bytes = 0; |
| 801 | + size_t file_mapped_tensors = 0; |
| 802 | + |
| 803 | + for (const auto& tensor_storage : file_tensors) { |
| 804 | + const std::string& name = tensor_storage.name; |
| 805 | + |
| 806 | + bool is_ignored = false; |
| 807 | + for (const auto& ignore_prefix : ignore_tensors) { |
| 808 | + if (starts_with(name, ignore_prefix)) { |
| 809 | + is_ignored = true; |
| 810 | + break; |
| 811 | + } |
| 812 | + } |
| 813 | + if (is_ignored) |
| 814 | + continue; |
| 815 | + |
| 816 | + auto it = tensors.find(name); |
| 817 | + if (it == tensors.end()) |
| 818 | + continue; |
| 819 | + |
| 820 | + ggml_tensor* dst_tensor = it->second; |
| 821 | + if (dst_tensor == nullptr) |
| 822 | + continue; |
| 823 | + |
| 824 | + if (tensor_storage.type != dst_tensor->type) |
| 825 | + continue; |
| 826 | + |
| 827 | + size_t tensor_size = tensor_storage.nbytes(); |
| 828 | + size_t tensor_offset = tensor_storage.offset; |
| 829 | + |
| 830 | + if (tensor_storage.ne[0] != dst_tensor->ne[0] || |
| 831 | + tensor_storage.ne[1] != dst_tensor->ne[1] || |
| 832 | + tensor_storage.ne[2] != dst_tensor->ne[2] || |
| 833 | + tensor_storage.ne[3] != dst_tensor->ne[3] || |
| 834 | + tensor_size != ggml_nbytes(dst_tensor)) { |
| 835 | + // let load_tensors worry about this |
| 836 | + continue; |
| 837 | + } |
| 838 | + |
| 839 | + dst_tensor->buffer = buf_mmap; |
| 840 | + dst_tensor->data = mmap_data + tensor_offset; |
| 841 | + |
| 842 | + file_mapped_bytes += tensor_size; |
| 843 | + file_mapped_tensors++; |
| 844 | + } |
| 845 | + |
| 846 | + if (file_mapped_bytes > 0) { |
| 847 | + mapped_tensors += file_mapped_tensors; |
| 848 | + mapped_bytes += file_mapped_bytes; |
| 849 | + result.push_back({mmapped, buf_mmap}); |
| 850 | + } |
| 851 | + } |
| 852 | + |
| 853 | + int64_t t_end = ggml_time_ms(); |
| 854 | + int64_t duration_ms = t_end - t_start; |
| 855 | + |
| 856 | + LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs", |
| 857 | + mapped_tensors, |
| 858 | + result.size(), |
| 859 | + mapped_bytes / (1024.0 * 1024.0), |
| 860 | + duration_ms / 1000.0); |
| 861 | + |
| 862 | + return result; |
| 863 | +} |
| 864 | + |
771 | 865 | bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) { |
772 | 866 |
|
773 | 867 | process_model_files(enable_mmap); |
@@ -860,6 +954,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread |
860 | 954 | continue; |
861 | 955 | } |
862 | 956 |
|
| 957 | + // skip mmapped tensors |
| 958 | + if (dst_tensor->buffer != nullptr |
| 959 | + && ggml_backend_buffer_get_usage(dst_tensor->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { |
| 960 | + continue; |
| 961 | + } |
| 962 | + |
863 | 963 | size_t nbytes_to_read = tensor_storage.nbytes_to_read(); |
864 | 964 |
|
865 | 965 | auto read_data = [&](char* buf, size_t n) { |
|
0 commit comments