Skip to content

Commit 6e4f647

Browse files
committed
feat: initial support for memory-mapping model weights
1 parent 8c1e30d commit 6e4f647

3 files changed

Lines changed: 118 additions & 0 deletions

File tree

src/model.cpp

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <atomic>
33
#include <chrono>
44
#include <cstdarg>
5+
#include <cstdint>
56
#include <fstream>
67
#include <functional>
78
#include <mutex>
@@ -1381,6 +1382,99 @@ void ModelLoader::process_model_files(bool enable_mmap) {
13811382
LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
13821383
}
13831384

1385+
/**
 * Memory-map model weight tensors directly from their on-disk files.
 *
 * Calls process_model_files(true) so every eligible file is mmapped, then
 * for each mmapped file creates a CPU backend buffer over the mapping and
 * points matching destination tensors straight at the mapped bytes —
 * avoiding a read/copy in load_tensors (which skips tensors whose buffer
 * usage is GGML_BACKEND_BUFFER_USAGE_WEIGHTS).
 *
 * @param tensors        name -> destination ggml_tensor map; matching
 *                       entries get their buffer/data rebound to the mapping.
 * @param ignore_tensors set of name prefixes to skip entirely.
 * @return one MmapTensorStore per file that contributed at least one mapped
 *         tensor; the caller must keep these alive for as long as the
 *         tensors are in use (they own both the mapping and the buffer).
 */
std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
                                                       std::set<std::string> ignore_tensors) {
    process_model_files(true);

    std::vector<MmapTensorStore> result;
    uint64_t mapped_bytes   = 0;
    size_t mapped_tensors   = 0;

    LOG_DEBUG("memory-mapping tensors...");

    int64_t t_start = ggml_time_ms();

    for (const auto& fdata : file_data) {
        // Files that could not be mmapped (e.g. zip members) fall back to load_tensors.
        if (!fdata.mmapped)
            continue;

        const std::vector<TensorStorage>& file_tensors = fdata.tensors;
        std::shared_ptr<MmapWrapper> mmapped           = fdata.mmapped;

        // ggml buffer APIs want a mutable pointer; the mapping itself is read-only data
        // we never write through (tensors are used as weights only).
        uint8_t* mmap_data = const_cast<uint8_t*>(mmapped->data());

        ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
        if (!buf_mmap) {
            LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str());
            continue;
        }
        // WEIGHTS usage is the marker load_tensors checks to skip already-mapped tensors.
        ggml_backend_buffer_set_usage(buf_mmap, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        size_t file_mapped_bytes   = 0;
        size_t file_mapped_tensors = 0;

        for (const auto& tensor_storage : file_tensors) {
            const std::string& name = tensor_storage.name;

            bool is_ignored = false;
            for (const auto& ignore_prefix : ignore_tensors) {
                if (starts_with(name, ignore_prefix)) {
                    is_ignored = true;
                    break;
                }
            }
            if (is_ignored)
                continue;

            auto it = tensors.find(name);
            if (it == tensors.end())
                continue;

            ggml_tensor* dst_tensor = it->second;
            if (dst_tensor == nullptr)
                continue;

            // Mapping is only valid when no conversion is required: the stored
            // type and every dimension must match the destination exactly.
            if (tensor_storage.type != dst_tensor->type)
                continue;

            size_t tensor_size   = tensor_storage.nbytes();
            size_t tensor_offset = tensor_storage.offset;

            if (tensor_storage.ne[0] != dst_tensor->ne[0] ||
                tensor_storage.ne[1] != dst_tensor->ne[1] ||
                tensor_storage.ne[2] != dst_tensor->ne[2] ||
                tensor_storage.ne[3] != dst_tensor->ne[3] ||
                tensor_size != ggml_nbytes(dst_tensor)) {
                // let load_tensors worry about this
                continue;
            }

            // Rebind the tensor to the mapped file bytes; no copy is made.
            dst_tensor->buffer = buf_mmap;
            dst_tensor->data   = mmap_data + tensor_offset;

            file_mapped_bytes += tensor_size;
            file_mapped_tensors++;
        }

        if (file_mapped_tensors > 0) {
            // Keep the mapping + buffer alive via the returned store.
            // Gate on the tensor count (not bytes) so a hypothetical
            // zero-byte mapped tensor cannot end up pointing into a freed buffer.
            mapped_tensors += file_mapped_tensors;
            mapped_bytes += file_mapped_bytes;
            result.push_back({mmapped, buf_mmap});
        } else {
            // Nothing from this file was mapped: release the backend buffer,
            // otherwise it leaks (it is only freed through MmapTensorStore).
            ggml_backend_buffer_free(buf_mmap);
        }
    }

    int64_t t_end       = ggml_time_ms();
    int64_t duration_ms = t_end - t_start;

    LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs",
             mapped_tensors,
             result.size(),
             mapped_bytes / (1024.0 * 1024.0),
             duration_ms / 1000.0);

    return result;
}
1477+
13841478
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
13851479

13861480
process_model_files(enable_mmap);
@@ -1473,6 +1567,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
14731567
continue;
14741568
}
14751569

1570+
// skip mmapped tensors
1571+
if (dst_tensor->buffer != nullptr
1572+
&& ggml_backend_buffer_get_usage(dst_tensor->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1573+
continue;
1574+
}
1575+
14761576
size_t nbytes_to_read = tensor_storage.nbytes_to_read();
14771577

14781578
auto read_data = [&](char* buf, size_t n) {

src/model.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ struct ModelFileData {
298298
bool is_zip;
299299
};
300300

301+
// Keeps a memory-mapped weight file and its ggml backend buffer alive for
// as long as tensors pointing into the mapping are in use. Produced by
// ModelLoader::mmap_tensors(); the caller must retain these stores for the
// lifetime of the mapped tensors.
struct MmapTensorStore {
    std::shared_ptr<MmapWrapper> mmapped;  // shared ownership of the file mapping
    ggml_backend_buffer_t buffer;          // backend buffer wrapping the mapped bytes
};
305+
301306
class ModelLoader {
302307
protected:
303308
SDVersion version_ = VERSION_COUNT;
@@ -334,6 +339,8 @@ class ModelLoader {
334339
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
335340
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
336341
void process_model_files(bool enable_mmap = false);
342+
std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
343+
std::set<std::string> ignore_tensors = {});
337344
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
338345
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
339346
std::set<std::string> ignore_tensors = {},

src/stable-diffusion.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
104104

105105
class StableDiffusionGGML {
106106
public:
107+
std::vector<MmapTensorStore> mmap_tensor_store;
107108
ggml_backend_t backend = nullptr; // general backend
108109
ggml_backend_t clip_backend = nullptr;
109110
ggml_backend_t control_net_backend = nullptr;
@@ -809,6 +810,16 @@ class StableDiffusionGGML {
809810
ignore_tensors.insert("conditioner.embedders.3");
810811
}
811812

813+
if (sd_ctx_params->enable_mmap) {
814+
if (!(offload_params_to_cpu || ggml_backend_is_cpu(backend))) {
815+
LOG_DEBUG("cannot memory-map model weights: only supported with CPU or --offload-to-cpu");
816+
} else if (apply_lora_immediately) {
817+
LOG_DEBUG("cannot memory-map model weights: only supported with --lora-apply-mode at_runtime");
818+
} else {
819+
mmap_tensor_store = model_loader.mmap_tensors(tensors, ignore_tensors);
820+
}
821+
}
822+
812823
if (clip_vision) {
813824
clip_vision->alloc_params_buffer();
814825
}

0 commit comments

Comments
 (0)