Skip to content

Commit db5305b

Browse files
committed
feat: enable memory-mapped tensors for all compatible backends
1 parent 4375578 commit db5305b

1 file changed

Lines changed: 58 additions & 14 deletions

File tree

src/stable-diffusion.cpp

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,50 @@ class StableDiffusionGGML {
411411
apply_lora_immediately = false;
412412
}
413413

414+
std::map<std::string, ggml_tensor*> mmap_able_tensors;
415+
bool enable_mmap_tensors = false;
416+
bool main_backend_mmap = false;
417+
if (sd_ctx_params->enable_mmap) {
418+
if (apply_lora_immediately) {
419+
LOG_DEBUG("cannot memory-map model weights: only supported with --lora-apply-mode at_runtime");
420+
} else {
421+
enable_mmap_tensors = true;
422+
if (offload_params_to_cpu) {
423+
main_backend_mmap = true;
424+
} else {
425+
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
426+
struct ggml_backend_dev_props props;
427+
ggml_backend_dev_get_props(dev, &props);
428+
main_backend_mmap = props.caps.buffer_from_host_ptr;
429+
}
430+
}
431+
}
432+
433+
// split definition to avoid msvc choking on the extra parameter handling
434+
auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) {
435+
std::map<std::string, ggml_tensor*> temp;
436+
model->get_param_tensors(temp, prefix);
437+
bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
438+
for (const auto& [key, tensor] : temp) {
439+
tensors[key] = tensor;
440+
if (do_mmap) {
441+
mmap_able_tensors[key] = tensor;
442+
}
443+
}
444+
};
445+
446+
auto get_param_tensors = [&](auto&& model, bool force_cpu = false) {
447+
std::map<std::string, ggml_tensor*> temp;
448+
model->get_param_tensors(temp);
449+
bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
450+
for (const auto& [key, tensor] : temp) {
451+
tensors[key] = tensor;
452+
if (do_mmap) {
453+
mmap_able_tensors[key] = tensor;
454+
}
455+
}
456+
};
457+
414458
if (sd_version_is_control(version)) {
415459
// Might need vae encode for control cond
416460
vae_decode_only = false;
@@ -517,7 +561,7 @@ class StableDiffusionGGML {
517561
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
518562
offload_params_to_cpu,
519563
tensor_storage_map);
520-
clip_vision->get_param_tensors(tensors);
564+
get_param_tensors(clip_vision);
521565
}
522566
} else if (sd_version_is_qwen_image(version)) {
523567
bool enable_vision = false;
@@ -592,16 +636,16 @@ class StableDiffusionGGML {
592636
}
593637
}
594638

595-
cond_stage_model->get_param_tensors(tensors);
639+
get_param_tensors(cond_stage_model, clip_on_cpu);
596640

597-
diffusion_model->get_param_tensors(tensors);
641+
get_param_tensors(diffusion_model);
598642

599643
if (sd_version_is_unet_edit(version)) {
600644
vae_decode_only = false;
601645
}
602646

603647
if (high_noise_diffusion_model) {
604-
high_noise_diffusion_model->get_param_tensors(tensors);
648+
get_param_tensors(high_noise_diffusion_model);
605649
}
606650

607651
if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
@@ -664,6 +708,8 @@ class StableDiffusionGGML {
664708
}
665709
};
666710

711+
bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu;
712+
667713
if (version == VERSION_CHROMA_RADIANCE) {
668714
LOG_INFO("using FakeVAE");
669715
first_stage_model = std::make_shared<FakeVAE>(version,
@@ -672,15 +718,15 @@ class StableDiffusionGGML {
672718
} else if (use_tae && !tae_preview_only) {
673719
LOG_INFO("using TAE for encoding / decoding");
674720
first_stage_model = create_tae();
675-
first_stage_model->get_param_tensors(tensors, "tae");
721+
get_param_tensors_p(first_stage_model, force_vae_cpu, "tae");
676722
} else {
677723
LOG_INFO("using VAE for encoding / decoding");
678724
first_stage_model = create_vae();
679-
first_stage_model->get_param_tensors(tensors, "first_stage_model");
725+
get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model");
680726
if (use_tae && tae_preview_only) {
681727
LOG_INFO("using TAE for preview");
682728
preview_vae = create_tae();
683-
preview_vae->get_param_tensors(tensors, "tae");
729+
get_param_tensors_p(preview_vae, force_vae_cpu, "tae");
684730
}
685731
}
686732

@@ -745,7 +791,7 @@ class StableDiffusionGGML {
745791
}
746792
}
747793
if (use_pmid) {
748-
pmid_model->get_param_tensors(tensors, "pmid");
794+
get_param_tensors_p(pmid_model, false, "pmid");
749795
}
750796

751797
if (sd_ctx_params->flash_attn) {
@@ -826,13 +872,11 @@ class StableDiffusionGGML {
826872
ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
827873
}
828874

829-
if (sd_ctx_params->enable_mmap) {
830-
if (!(offload_params_to_cpu || ggml_backend_is_cpu(backend))) {
831-
LOG_DEBUG("cannot memory-map model weights: only supported with CPU or --offload-to-cpu");
832-
} else if (apply_lora_immediately) {
833-
LOG_DEBUG("cannot memory-map model weights: only supported with --lora-apply-mode at_runtime");
875+
if (enable_mmap_tensors) {
876+
if (mmap_able_tensors.empty()) {
877+
LOG_DEBUG("no tensors could be memory-mapped");
834878
} else {
835-
mmap_tensor_store = model_loader.mmap_tensors(tensors, ignore_tensors);
879+
mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors);
836880
}
837881
}
838882

0 commit comments

Comments (0)