Skip to content

Commit 97190f6

Browse files
committed
feat: enable memory-mapped tensors with --lora-apply-mode immediately
In this case, instead of disabling mmap, we turn the mapping writable.
1 parent db5305b commit 97190f6

5 files changed

Lines changed: 34 additions & 25 deletions

File tree

src/model.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,7 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
706706
}
707707
}
708708

709-
void ModelLoader::process_model_files(bool enable_mmap) {
709+
void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) {
710710

711711
if (model_files_processed) {
712712
return;
@@ -746,7 +746,7 @@ void ModelLoader::process_model_files(bool enable_mmap) {
746746
std::unique_ptr<MmapWrapper> mmapped;
747747
if (enable_mmap && !is_zip) {
748748
LOG_DEBUG("using mmap for I/O");
749-
mmapped = MmapWrapper::create(file_path);
749+
mmapped = MmapWrapper::create(file_path, writable_mmap);
750750
if (!mmapped) {
751751
LOG_WARN("failed to memory-map '%s'", file_path.c_str());
752752
}
@@ -770,9 +770,9 @@ void ModelLoader::process_model_files(bool enable_mmap) {
770770
}
771771

772772
std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
773-
std::set<std::string> ignore_tensors)
773+
std::set<std::string> ignore_tensors, bool writable_mmap)
774774
{
775-
process_model_files(true);
775+
process_model_files(true, writable_mmap);
776776

777777
std::vector<MmapTensorStore> result;
778778
uint64_t mapped_bytes = 0;
@@ -788,7 +788,7 @@ std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggm
788788
const std::vector<TensorStorage>& file_tensors = fdata.tensors;
789789
std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;
790790

791-
uint8_t * mmap_data = const_cast<uint8_t*>(mmapped->data());
791+
uint8_t * mmap_data = mmapped->writable_data();
792792

793793
ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
794794
if (!buf_mmap) {
@@ -864,7 +864,7 @@ std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggm
864864

865865
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
866866

867-
process_model_files(enable_mmap);
867+
process_model_files(enable_mmap, false);
868868

869869
std::atomic<int64_t> read_time_ms(0);
870870
std::atomic<int64_t> memcpy_time_ms(0);

src/model.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,10 @@ class ModelLoader {
232232
std::map<ggml_type, uint32_t> get_vae_wtype_stat();
233233
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
234234
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
235-
void process_model_files(bool enable_mmap = false);
235+
void process_model_files(bool enable_mmap = false, bool writable_mmap = true);
236236
std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
237-
std::set<std::string> ignore_tensors = {});
237+
std::set<std::string> ignore_tensors = {},
238+
bool writable = true);
238239
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
239240
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
240241
std::set<std::string> ignore_tensors = {},

src/stable-diffusion.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -414,19 +414,20 @@ class StableDiffusionGGML {
414414
std::map<std::string, ggml_tensor*> mmap_able_tensors;
415415
bool enable_mmap_tensors = false;
416416
bool main_backend_mmap = false;
417+
bool needs_writable_mmap = false;
417418
if (sd_ctx_params->enable_mmap) {
418419
if (apply_lora_immediately) {
419-
LOG_DEBUG("cannot memory-map model weights: only supported with --lora-apply-mode at_runtime");
420+
needs_writable_mmap = true;
421+
LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
422+
}
423+
enable_mmap_tensors = true;
424+
if (offload_params_to_cpu) {
425+
main_backend_mmap = true;
420426
} else {
421-
enable_mmap_tensors = true;
422-
if (offload_params_to_cpu) {
423-
main_backend_mmap = true;
424-
} else {
425-
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
426-
struct ggml_backend_dev_props props;
427-
ggml_backend_dev_get_props(dev, &props);
428-
main_backend_mmap = props.caps.buffer_from_host_ptr;
429-
}
427+
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
428+
struct ggml_backend_dev_props props;
429+
ggml_backend_dev_get_props(dev, &props);
430+
main_backend_mmap = props.caps.buffer_from_host_ptr;
430431
}
431432
}
432433

@@ -876,7 +877,7 @@ class StableDiffusionGGML {
876877
if (mmap_able_tensors.empty()) {
877878
LOG_DEBUG("no tensors could be memory-mapped");
878879
} else {
879-
mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors);
880+
mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors, needs_writable_mmap);
880881
}
881882
}
882883

src/util.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ class MmapWrapperImpl : public MmapWrapper {
111111
HANDLE hmapping_;
112112
};
113113

114-
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
114+
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
115115
void* mapped_data = nullptr;
116116
size_t file_size = 0;
117117

@@ -136,14 +136,18 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
136136

137137
file_size = static_cast<size_t>(size.QuadPart);
138138

139-
HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
139+
DWORD page_prot = writable ? PAGE_WRITECOPY : PAGE_READONLY;
140+
141+
HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, page_prot, 0, 0, NULL);
140142

141143
if (mapping_handle == NULL) {
142144
CloseHandle(file_handle);
143145
return nullptr;
144146
}
145147

146-
mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
148+
DWORD view_access = writable ? FILE_MAP_COPY : FILE_MAP_READ;
149+
150+
mapped_data = MapViewOfFile(mapping_handle, view_access, 0, 0, file_size);
147151

148152
if (mapped_data == NULL) {
149153
CloseHandle(mapping_handle);
@@ -181,7 +185,7 @@ class MmapWrapperImpl : public MmapWrapper {
181185
}
182186
};
183187

184-
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
188+
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
185189
int file_descriptor = open(filename.c_str(), O_RDONLY);
186190
if (file_descriptor == -1) {
187191
return nullptr;
@@ -203,7 +207,9 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
203207

204208
size_t file_size = sb.st_size;
205209

206-
void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
210+
int mmap_prot = PROT_READ | (writable ? PROT_WRITE : 0);
211+
212+
void* mapped_data = mmap(NULL, file_size, mmap_prot, mmap_flags, file_descriptor, 0);
207213

208214
close(file_descriptor);
209215

src/util.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_wid
4141

4242
class MmapWrapper {
4343
public:
44-
static std::unique_ptr<MmapWrapper> create(const std::string& filename);
44+
static std::unique_ptr<MmapWrapper> create(const std::string& filename, bool writable = false);
4545

4646
virtual ~MmapWrapper() = default;
4747

@@ -51,6 +51,7 @@ class MmapWrapper {
5151
MmapWrapper& operator=(MmapWrapper&&) = delete;
5252

5353
const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
54+
uint8_t* writable_data() { return static_cast<uint8_t*>(data_); }
5455
size_t size() const { return size_; }
5556
bool copy_data(void* buf, size_t n, size_t offset) const;
5657

0 commit comments

Comments
 (0)