Skip to content

Commit 5de7ed3

Browse files
authored
WIP: update stable-diffusion.cpp to 5900ef6605c6 (new API) (#1669)
* Update stable-diffusion.cpp to 5900ef6605c6 (new API)
* Clean up pending LoRA code and simplify LoRA changes to upstream
* Move VAE tiling disabling for TAESD to sdtype_adapter.cpp
* Move auxiliary ctx functions to sdtype_adapter.cpp
* Use ref_images parameter for Kontext images
* Drop clip skip workaround (fixed upstream)
* Workaround for flash attention with img2img leejet/stable-diffusion.cpp#756
* Workaround for Chroma with flash attention, debug prints
* Disable forcing CLIP weights to F32 for reduced memory usage
1 parent 7b5cf71 commit 5de7ed3

26 files changed

+2250
-1865
lines changed

otherarch/sdcpp/clip.hpp

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -545,9 +545,15 @@ class CLIPEmbeddings : public GGMLBlock {
545545
int64_t vocab_size;
546546
int64_t num_positions;
547547

548-
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
549-
enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
550-
enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
548+
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
549+
enum ggml_type token_wtype = GGML_TYPE_F32;
550+
#if 1
551+
// kcpp reduce memory usage (reverts https://github.com/leejet/stable-diffusion.cpp/pull/601)
552+
auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
553+
if (tensor_type != tensor_types.end())
554+
token_wtype = tensor_type->second;
555+
#endif
556+
enum ggml_type position_wtype = GGML_TYPE_F32;
551557

552558
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
553559
params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
@@ -594,10 +600,10 @@ class CLIPVisionEmbeddings : public GGMLBlock {
594600
int64_t image_size;
595601
int64_t num_patches;
596602
int64_t num_positions;
597-
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
598-
enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
599-
enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
600-
enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
603+
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
604+
enum ggml_type patch_wtype = GGML_TYPE_F16;
605+
enum ggml_type class_wtype = GGML_TYPE_F32;
606+
enum ggml_type position_wtype = GGML_TYPE_F32;
601607

602608
params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
603609
params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
@@ -657,9 +663,9 @@ enum CLIPVersion {
657663

658664
class CLIPTextModel : public GGMLBlock {
659665
protected:
660-
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
666+
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
661667
if (version == OPEN_CLIP_VIT_BIGG_14) {
662-
enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
668+
enum ggml_type wtype = GGML_TYPE_F32;
663669
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
664670
}
665671
}
@@ -678,8 +684,8 @@ class CLIPTextModel : public GGMLBlock {
678684
bool with_final_ln = true;
679685

680686
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
681-
int clip_skip_value = -1,
682-
bool with_final_ln = true)
687+
bool with_final_ln = true,
688+
int clip_skip_value = -1)
683689
: version(version), with_final_ln(with_final_ln) {
684690
if (version == OPEN_CLIP_VIT_H_14) {
685691
hidden_size = 1024;
@@ -701,7 +707,7 @@ class CLIPTextModel : public GGMLBlock {
701707

702708
void set_clip_skip(int skip) {
703709
if (skip <= 0) {
704-
return;
710+
skip = -1;
705711
}
706712
clip_skip = skip;
707713
}
@@ -805,8 +811,8 @@ class CLIPProjection : public UnaryBlock {
805811
int64_t out_features;
806812
bool transpose_weight;
807813

808-
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
809-
enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
814+
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
815+
enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
810816
if (transpose_weight) {
811817
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
812818
} else {
@@ -868,12 +874,12 @@ struct CLIPTextModelRunner : public GGMLRunner {
868874
CLIPTextModel model;
869875

870876
CLIPTextModelRunner(ggml_backend_t backend,
871-
std::map<std::string, enum ggml_type>& tensor_types,
877+
const String2GGMLType& tensor_types,
872878
const std::string prefix,
873879
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
874-
int clip_skip_value = 1,
875-
bool with_final_ln = true)
876-
: GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
880+
bool with_final_ln = true,
881+
int clip_skip_value = -1)
882+
: GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
877883
model.init(params_ctx, tensor_types, prefix);
878884
}
879885

@@ -949,4 +955,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
949955
}
950956
};
951957

952-
#endif // __CLIP_HPP__
958+
#endif // __CLIP_HPP__

otherarch/sdcpp/common.hpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ class UpSampleBlock : public GGMLBlock {
5656
// x: [N, channels, h, w]
5757
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
5858

59-
x = ggml_upscale(ctx, x, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
60-
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
59+
x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
60+
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
6161
return x;
6262
}
6363
};
@@ -182,9 +182,9 @@ class GEGLU : public GGMLBlock {
182182
int64_t dim_in;
183183
int64_t dim_out;
184184

185-
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
186-
enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
187-
enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
185+
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
186+
enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
187+
enum ggml_type bias_wtype = GGML_TYPE_F32;
188188
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
189189
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
190190
}
@@ -440,9 +440,9 @@ class SpatialTransformer : public GGMLBlock {
440440

441441
class AlphaBlender : public GGMLBlock {
442442
protected:
443-
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
443+
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
444444
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
445-
enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
445+
enum ggml_type wtype = GGML_TYPE_F32;
446446
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
447447
}
448448

otherarch/sdcpp/conditioner.hpp

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -57,29 +57,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
5757
std::vector<std::string> readed_embeddings;
5858

5959
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
60-
std::map<std::string, enum ggml_type>& tensor_types,
60+
const String2GGMLType& tensor_types,
6161
const std::string& embd_dir,
6262
SDVersion version = VERSION_SD1,
6363
PMVersion pv = PM_VERSION_1,
6464
int clip_skip = -1)
6565
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
66-
if (clip_skip <= 0) {
67-
clip_skip = 1;
68-
if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
69-
clip_skip = 2;
70-
}
71-
}
7266
if (sd_version_is_sd1(version)) {
73-
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
67+
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
7468
} else if (sd_version_is_sd2(version)) {
75-
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
69+
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
7670
} else if (sd_version_is_sdxl(version)) {
77-
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
78-
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
71+
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
72+
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
7973
}
74+
set_clip_skip(clip_skip);
8075
}
8176

8277
void set_clip_skip(int clip_skip) {
78+
if (clip_skip <= 0) {
79+
clip_skip = 1;
80+
if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
81+
clip_skip = 2;
82+
}
83+
}
8384
text_model->set_clip_skip(clip_skip);
8485
if (sd_version_is_sdxl(version)) {
8586
text_model2->set_clip_skip(clip_skip);
@@ -458,8 +459,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
458459
if (sd_version_is_sdxl(version)) {
459460
text_model2->compute(n_threads,
460461
input_ids2,
461-
0,
462-
NULL,
462+
num_custom_embeddings,
463+
token_embed_custom.data(),
463464
max_token_idx,
464465
false,
465466
&chunk_hidden_states2, work_ctx);
@@ -469,8 +470,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
469470
if (chunk_idx == 0) {
470471
text_model2->compute(n_threads,
471472
input_ids2,
472-
0,
473-
NULL,
473+
num_custom_embeddings,
474+
token_embed_custom.data(),
474475
max_token_idx,
475476
true,
476477
&pooled,
@@ -617,7 +618,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
617618
struct FrozenCLIPVisionEmbedder : public GGMLRunner {
618619
CLIPVisionModelProjection vision_model;
619620

620-
FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
621+
FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
621622
: vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
622623
vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
623624
}
@@ -662,18 +663,19 @@ struct SD3CLIPEmbedder : public Conditioner {
662663
std::shared_ptr<T5Runner> t5;
663664

664665
SD3CLIPEmbedder(ggml_backend_t backend,
665-
std::map<std::string, enum ggml_type>& tensor_types,
666-
int clip_skip = -1)
666+
const String2GGMLType& tensor_types = {},
667+
int clip_skip = -1)
667668
: clip_g_tokenizer(0) {
668-
if (clip_skip <= 0) {
669-
clip_skip = 2;
670-
}
671-
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
672-
clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
669+
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
670+
clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
673671
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
672+
set_clip_skip(clip_skip);
674673
}
675674

676675
void set_clip_skip(int clip_skip) {
676+
if (clip_skip <= 0) {
677+
clip_skip = 2;
678+
}
677679
clip_l->set_clip_skip(clip_skip);
678680
clip_g->set_clip_skip(clip_skip);
679681
}
@@ -1008,16 +1010,17 @@ struct FluxCLIPEmbedder : public Conditioner {
10081010
size_t chunk_len = 256;
10091011

10101012
FluxCLIPEmbedder(ggml_backend_t backend,
1011-
std::map<std::string, enum ggml_type>& tensor_types,
1012-
int clip_skip = -1) {
1013-
if (clip_skip <= 0) {
1014-
clip_skip = 2;
1015-
}
1016-
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
1013+
const String2GGMLType& tensor_types = {},
1014+
int clip_skip = -1) {
1015+
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
10171016
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
1017+
set_clip_skip(clip_skip);
10181018
}
10191019

10201020
void set_clip_skip(int clip_skip) {
1021+
if (clip_skip <= 0) {
1022+
clip_skip = 2;
1023+
}
10211024
clip_l->set_clip_skip(clip_skip);
10221025
}
10231026

@@ -1228,10 +1231,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
12281231
int mask_pad = 1;
12291232

12301233
PixArtCLIPEmbedder(ggml_backend_t backend,
1231-
std::map<std::string, enum ggml_type>& tensor_types,
1232-
int clip_skip = -1,
1233-
bool use_mask = false,
1234-
int mask_pad = 1)
1234+
const String2GGMLType& tensor_types = {},
1235+
int clip_skip = -1,
1236+
bool use_mask = false,
1237+
int mask_pad = 1)
12351238
: use_mask(use_mask), mask_pad(mask_pad) {
12361239
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
12371240
}
@@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner {
14221425
}
14231426
};
14241427

1425-
#endif
1428+
#endif

otherarch/sdcpp/control.hpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,12 +317,23 @@ struct ControlNet : public GGMLRunner {
317317
bool guided_hint_cached = false;
318318

319319
ControlNet(ggml_backend_t backend,
320-
std::map<std::string, enum ggml_type>& tensor_types,
321-
SDVersion version = VERSION_SD1)
320+
const String2GGMLType& tensor_types = {},
321+
SDVersion version = VERSION_SD1)
322322
: GGMLRunner(backend), control_net(version) {
323323
control_net.init(params_ctx, tensor_types, "");
324324
}
325325

326+
void enable_conv2d_direct() {
327+
std::vector<GGMLBlock*> blocks;
328+
control_net.get_all_blocks(blocks);
329+
for (auto block : blocks) {
330+
if (block->get_desc() == "Conv2d") {
331+
auto conv_block = (Conv2d*)block;
332+
conv_block->enable_direct();
333+
}
334+
}
335+
}
336+
326337
~ControlNet() {
327338
free_control_ctx();
328339
}

0 commit comments

Comments
 (0)