Skip to content

Commit 4375578

Browse files
committed
feat: initial support for memory-mapping model weights
1 parent d3b89f0 commit 4375578

3 files changed

Lines changed: 118 additions & 0 deletions

File tree

src/model.cpp

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <atomic>
33
#include <chrono>
44
#include <cstdarg>
5+
#include <cstdint>
56
#include <fstream>
67
#include <functional>
78
#include <mutex>
@@ -768,6 +769,99 @@ void ModelLoader::process_model_files(bool enable_mmap) {
768769
LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
769770
}
770771

772+
// Memory-map model weight tensors directly from their backing files instead
// of copying them into freshly allocated buffers.
//
// For each memory-mapped model file, a CPU backend buffer is wrapped around
// the mapping; every tensor in `tensors` whose name, type, shape and byte
// size exactly match the on-disk storage gets its data pointer aimed at the
// mapped region. Tensors whose name starts with any prefix in
// `ignore_tensors`, or that mismatch in type/shape/size, are skipped and
// left for load_tensors() to handle.
//
// @param tensors        name -> destination tensor map to wire up.
// @param ignore_tensors set of tensor-name prefixes to exclude from mapping.
// @return one MmapTensorStore per file that contributed at least one mapped
//         tensor; the caller must keep these alive as long as the tensors
//         are in use, since tensor data points into the mappings.
std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
                                                       std::set<std::string> ignore_tensors)
{
    process_model_files(true);

    std::vector<MmapTensorStore> result;
    uint64_t mapped_bytes = 0;
    size_t mapped_tensors = 0;

    LOG_DEBUG("memory-mapping tensors...");

    int64_t t_start = ggml_time_ms();

    for (const auto& fdata : file_data) {
        if (!fdata.mmapped) continue;

        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
        std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;

        // ggml wants a mutable pointer; we never write through the mapping,
        // so dropping const here is safe in practice.
        uint8_t* mmap_data = const_cast<uint8_t*>(mmapped->data());

        ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
        if (!buf_mmap) {
            LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str());
            continue;
        }
        // Mark as weights so load_tensors() can recognize and skip tensors
        // that are already backed by a mapping.
        ggml_backend_buffer_set_usage(buf_mmap, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        size_t file_mapped_bytes = 0;
        size_t file_mapped_tensors = 0;

        for (const auto& tensor_storage : file_tensors) {
            const std::string& name = tensor_storage.name;

            bool is_ignored = false;
            for (const auto& ignore_prefix : ignore_tensors) {
                if (starts_with(name, ignore_prefix)) {
                    is_ignored = true;
                    break;
                }
            }
            if (is_ignored)
                continue;

            auto it = tensors.find(name);
            if (it == tensors.end())
                continue;

            ggml_tensor* dst_tensor = it->second;
            if (dst_tensor == nullptr)
                continue;

            // Only map when on-disk layout matches the destination exactly;
            // anything needing conversion falls through to load_tensors().
            if (tensor_storage.type != dst_tensor->type)
                continue;

            size_t tensor_size = tensor_storage.nbytes();
            size_t tensor_offset = tensor_storage.offset;

            if (tensor_storage.ne[0] != dst_tensor->ne[0] ||
                tensor_storage.ne[1] != dst_tensor->ne[1] ||
                tensor_storage.ne[2] != dst_tensor->ne[2] ||
                tensor_storage.ne[3] != dst_tensor->ne[3] ||
                tensor_size != ggml_nbytes(dst_tensor)) {
                // let load_tensors worry about this
                continue;
            }

            dst_tensor->buffer = buf_mmap;
            dst_tensor->data = mmap_data + tensor_offset;

            file_mapped_bytes += tensor_size;
            file_mapped_tensors++;
        }

        // Gate on the tensor count (not byte count) so that even a zero-byte
        // mapped tensor keeps its backing buffer alive.
        if (file_mapped_tensors > 0) {
            mapped_tensors += file_mapped_tensors;
            mapped_bytes += file_mapped_bytes;
            result.push_back({mmapped, buf_mmap});
        } else {
            // Nothing from this file was mapped: release the wrapper buffer
            // instead of leaking it (the original neither stored nor freed
            // buf_mmap on this path).
            ggml_backend_buffer_free(buf_mmap);
        }
    }

    int64_t t_end = ggml_time_ms();
    int64_t duration_ms = t_end - t_start;

    LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs",
             mapped_tensors,
             result.size(),
             mapped_bytes / (1024.0 * 1024.0),
             duration_ms / 1000.0);

    return result;
}
864+
771865
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
772866

773867
process_model_files(enable_mmap);
@@ -860,6 +954,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
860954
continue;
861955
}
862956

957+
// skip mmapped tensors
958+
if (dst_tensor->buffer != nullptr
959+
&& ggml_backend_buffer_get_usage(dst_tensor->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
960+
continue;
961+
}
962+
863963
size_t nbytes_to_read = tensor_storage.nbytes_to_read();
864964

865965
auto read_data = [&](char* buf, size_t n) {

src/model.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,11 @@ struct ModelFileData {
199199
bool is_zip;
200200
};
201201

202+
// Keeps a memory-mapped model file and the ggml backend buffer that wraps it
// alive for as long as tensors point into the mapping. Destroying this
// object while mapped tensors are still in use would leave them dangling.
struct MmapTensorStore {
    std::shared_ptr<MmapWrapper> mmapped;  // shared ownership of the underlying file mapping
    ggml_backend_buffer_t buffer;          // backend buffer wrapping the mapped bytes (usage: WEIGHTS)
};
206+
202207
class ModelLoader {
203208
protected:
204209
SDVersion version_ = VERSION_COUNT;
@@ -228,6 +233,8 @@ class ModelLoader {
228233
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
229234
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
230235
void process_model_files(bool enable_mmap = false);
236+
std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
237+
std::set<std::string> ignore_tensors = {});
231238
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
232239
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
233240
std::set<std::string> ignore_tensors = {},

src/stable-diffusion.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
107107

108108
class StableDiffusionGGML {
109109
public:
110+
std::vector<MmapTensorStore> mmap_tensor_store;
110111
ggml_backend_t backend = nullptr; // general backend
111112
ggml_backend_t clip_backend = nullptr;
112113
ggml_backend_t control_net_backend = nullptr;
@@ -825,6 +826,16 @@ class StableDiffusionGGML {
825826
ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
826827
}
827828

829+
if (sd_ctx_params->enable_mmap) {
830+
if (!(offload_params_to_cpu || ggml_backend_is_cpu(backend))) {
831+
LOG_DEBUG("cannot memory-map model weights: only supported with CPU or --offload-to-cpu");
832+
} else if (apply_lora_immediately) {
833+
LOG_DEBUG("cannot memory-map model weights: only supported with --lora-apply-mode at_runtime");
834+
} else {
835+
mmap_tensor_store = model_loader.mmap_tensors(tensors, ignore_tensors);
836+
}
837+
}
838+
828839
if (clip_vision) {
829840
clip_vision->alloc_params_buffer();
830841
}

0 commit comments

Comments (0)