Commit 186227f

sync with sd.cpp

1 parent e5af9b5, commit 186227f

8 files changed: +233 -81 lines changed

otherarch/sdcpp/conditioner.hpp

Lines changed: 10 additions & 26 deletions

@@ -597,7 +597,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
         tokens.erase(it);
         return decode(tokens);
-        //return prompt; //kcpp we don't care about photomaker trigger words
     }

     SDCondition get_learned_condition(ggml_context* work_ctx,
@@ -903,6 +902,7 @@ struct SD3CLIPEmbedder : public Conditioner {

             t5->compute(n_threads,
                         input_ids,
+                        NULL,
                         &chunk_hidden_states_t5,
                         work_ctx);
             {
@@ -1148,6 +1148,7 @@ struct FluxCLIPEmbedder : public Conditioner {

             t5->compute(n_threads,
                         input_ids,
+                        NULL,
                         &chunk_hidden_states,
                         work_ctx);
             {
@@ -1223,10 +1224,15 @@ struct PixArtCLIPEmbedder : public Conditioner {
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<T5Runner> t5;
     size_t chunk_len = 512;
+    bool use_mask = false;
+    int mask_pad = 1;

     PixArtCLIPEmbedder(ggml_backend_t backend,
                        std::map<std::string, enum ggml_type>& tensor_types,
-                       int clip_skip = -1) {
+                       int clip_skip = -1,
+                       bool use_mask = false,
+                       int mask_pad = 1)
+        : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }

@@ -1323,16 +1329,6 @@ struct PixArtCLIPEmbedder : public Conditioner {

         size_t chunk_count = t5_tokens.size() / chunk_len;

-        bool use_mask = false;
-        const char* SD_CHROMA_USE_T5_MASK = getenv("SD_CHROMA_USE_T5_MASK");
-        if (SD_CHROMA_USE_T5_MASK != nullptr) {
-            std::string sd_chroma_use_t5_mask_str = SD_CHROMA_USE_T5_MASK;
-            if (sd_chroma_use_t5_mask_str == "ON" || sd_chroma_use_t5_mask_str == "TRUE") {
-                use_mask = true;
-            } else if (sd_chroma_use_t5_mask_str != "OFF" && sd_chroma_use_t5_mask_str != "FALSE") {
-                LOG_WARN("SD_CHROMA_USE_T5_MASK environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"OFF\"/\"FALSE\" or\"ON\"/\"TRUE\", got \"%s\")", SD_CHROMA_USE_T5_MASK);
-            }
-        }
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             // t5
             std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
@@ -1347,9 +1343,9 @@ struct PixArtCLIPEmbedder : public Conditioner {

             t5->compute(n_threads,
                         input_ids,
+                        t5_attn_mask_chunk,
                         &chunk_hidden_states,
-                        work_ctx,
-                        t5_attn_mask_chunk);
+                        work_ctx);
             {
                 auto tensor = chunk_hidden_states;
                 float original_mean = ggml_tensor_mean(tensor);
@@ -1391,18 +1387,6 @@ struct PixArtCLIPEmbedder : public Conditioner {
             ggml_set_f32(hidden_states, 0.f);
         }

-        int mask_pad = 1;
-        const char* SD_CHROMA_MASK_PAD_OVERRIDE = getenv("SD_CHROMA_MASK_PAD_OVERRIDE");
-        if (SD_CHROMA_MASK_PAD_OVERRIDE != nullptr) {
-            std::string mask_pad_str = SD_CHROMA_MASK_PAD_OVERRIDE;
-            try {
-                mask_pad = std::stoi(mask_pad_str);
-            } catch (const std::invalid_argument&) {
-                LOG_WARN("SD_CHROMA_MASK_PAD_OVERRIDE environment variable is not a valid integer (%s). Falling back to default (%d)", SD_CHROMA_MASK_PAD_OVERRIDE, mask_pad);
-            } catch (const std::out_of_range&) {
-                LOG_WARN("SD_CHROMA_MASK_PAD_OVERRIDE environment variable value is out of range for `int` type (%s). Falling back to default (%d)", SD_CHROMA_MASK_PAD_OVERRIDE, mask_pad);
-            }
-        }
         modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);

         return SDCondition(hidden_states, t5_attn_mask, NULL);

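Note: the net effect of the conditioner changes above is that the per-process environment toggles (SD_CHROMA_USE_T5_MASK, SD_CHROMA_MASK_PAD_OVERRIDE) become ordinary constructor arguments on PixArtCLIPEmbedder. A minimal caller sketch under the new signature; the wrapper function here is hypothetical, not part of the commit:

    // Hypothetical helper: use_mask / mask_pad are now passed explicitly
    // instead of being read from the environment inside get_learned_condition().
    std::shared_ptr<PixArtCLIPEmbedder> make_chroma_conditioner(
        ggml_backend_t backend,
        std::map<std::string, enum ggml_type>& tensor_types,
        bool use_t5_mask,   // replaces SD_CHROMA_USE_T5_MASK=ON/TRUE
        int t5_mask_pad) {  // replaces SD_CHROMA_MASK_PAD_OVERRIDE=<int>
        return std::make_shared<PixArtCLIPEmbedder>(
            backend, tensor_types, /*clip_skip*/ -1, use_t5_mask, t5_mask_pad);
    }
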
otherarch/sdcpp/diffusion_model.hpp

Lines changed: 3 additions & 2 deletions

@@ -137,8 +137,9 @@ struct FluxModel : public DiffusionModel {
     FluxModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
               SDVersion version = VERSION_FLUX,
-              bool flash_attn = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+              bool flash_attn = false,
+              bool use_mask = false)
+        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
     }

     void alloc_params_buffer() {

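Note: use_mask here is pure plumbing: FluxModel forwards it to the Flux runner in flux.hpp below, where it decides whether the chroma DiT attention mask (y) is kept or nulled. An illustrative construction under the commit's new signature; the backend/tensor_types setup around it is assumed, not shown in this diff:

    // Illustrative only: the two new trailing arguments come from this commit.
    FluxModel model(backend, tensor_types, VERSION_FLUX,
                    /*flash_attn*/ false,
                    /*use_mask*/ true);  // keep y instead of forcing it to NULL
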
otherarch/sdcpp/flux.hpp

Lines changed: 23 additions & 42 deletions

@@ -744,10 +744,10 @@ namespace Flux {
         return ids;
     }

+
     // Generate positional embeddings
     std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
         std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
-
         std::vector<std::vector<float>> trans_ids = transpose(ids);
         size_t pos_len = ids.size();
         int num_axes = axes_dim.size();
@@ -872,7 +872,7 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         struct ggml_tensor* pe,
-                                        struct ggml_tensor* arange = NULL,
+                                        struct ggml_tensor* mod_index_arange = NULL,
                                         std::vector<int> skip_layers = {}) {
         auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
         auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
@@ -887,9 +887,10 @@ namespace Flux {
             auto distill_timestep = ggml_nn_timestep_embedding(ctx, timesteps, 16, 10000, 1000.f);
             auto distill_guidance = ggml_nn_timestep_embedding(ctx, guidance, 16, 10000, 1000.f);

-            // auto arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // Not working on a lot of backends, precomputing it on CPU instead
+            // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1);
+            // ggml_arange not working on a lot of backends, precomputing it on CPU instead
             GGML_ASSERT(arange != NULL);
-            auto modulation_index = ggml_nn_timestep_embedding(ctx, arange, 32, 10000, 1000.f); // [1, 344, 32]
+            auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32]

             // Batch broadcast (will it ever be useful)
             modulation_index = ggml_repeat(ctx, modulation_index, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2])); // [N, 344, 32]
@@ -982,7 +983,7 @@ namespace Flux {
                                    struct ggml_tensor* y,
                                    struct ggml_tensor* guidance,
                                    struct ggml_tensor* pe,
-                                   struct ggml_tensor* arange = NULL,
+                                   struct ggml_tensor* mod_index_arange = NULL,
                                    std::vector<ggml_tensor*> ref_latents = {},
                                    std::vector<int> skip_layers = {}) {
         // Forward pass of DiT.
@@ -1024,7 +1025,7 @@ namespace Flux {
             }
         }

-        auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
+        auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
         if (out->ne[1] > img_tokens) {
             out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
             out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
@@ -1044,15 +1045,18 @@ namespace Flux {
    public:
        FluxParams flux_params;
        Flux flux;
-        std::vector<float> pe_vec, range; // for cache
+        std::vector<float> pe_vec;
+        std::vector<float> mod_index_arange_vec; // for cache
        SDVersion version;
+        bool use_mask = false;

        FluxRunner(ggml_backend_t backend,
                   std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
                   const std::string prefix = "",
                   SDVersion version = VERSION_FLUX,
-                   bool flash_attn = false)
-            : GGMLRunner(backend) {
+                   bool flash_attn = false,
+                   bool use_mask = false)
+            : GGMLRunner(backend), use_mask(use_mask) {
            flux_params.flash_attn = flash_attn;
            flux_params.guidance_embed = false;
            flux_params.depth = 0;
@@ -1116,51 +1120,28 @@ namespace Flux {
                                        struct ggml_tensor* y,
                                        struct ggml_tensor* guidance,
                                        std::vector<ggml_tensor*> ref_latents = {},
-                                        std::vector<int> skip_layers = std::vector<int>()) {
+                                        std::vector<int> skip_layers = {}) {
            GGML_ASSERT(x->ne[3] == 1);
            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);

-            struct ggml_tensor* precompute_arange = NULL;
+            struct ggml_tensor* mod_index_arange = NULL;

            x = to_backend(x);
            context = to_backend(context);
            if (c_concat != NULL) {
                c_concat = to_backend(c_concat);
            }
-
            if (flux_params.is_chroma) {
-                const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
-                bool disable_guidance = true;
-                if (SD_CHROMA_ENABLE_GUIDANCE != NULL) {
-                    std::string enable_guidance_str = SD_CHROMA_ENABLE_GUIDANCE;
-                    if (enable_guidance_str == "ON" || enable_guidance_str == "TRUE") {
-                        LOG_WARN("Chroma guidance has been enabled. Image might be broken. (SD_CHROMA_ENABLE_GUIDANCE env variable to \"OFF\" to disable)", SD_CHROMA_ENABLE_GUIDANCE);
-                        disable_guidance = false;
-                    } else if (enable_guidance_str != "OFF" && enable_guidance_str != "FALSE") {
-                        LOG_WARN("SD_CHROMA_ENABLE_GUIDANCE environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_ENABLE_GUIDANCE);
-                    }
-                }
-                if (disable_guidance) {
-                    // LOG_DEBUG("Forcing guidance to 0 for chroma model (SD_CHROMA_ENABLE_GUIDANCE env variable to \"ON\" to enable)");
-                    guidance = ggml_set_f32(guidance, 0);
-                }
+                guidance = ggml_set_f32(guidance, 0);

-
-                const char* SD_CHROMA_USE_DIT_MASK = getenv("SD_CHROMA_USE_DIT_MASK");
-                if (SD_CHROMA_USE_DIT_MASK != nullptr) {
-                    std::string sd_chroma_use_DiT_mask_str = SD_CHROMA_USE_DIT_MASK;
-                    if (sd_chroma_use_DiT_mask_str == "OFF" || sd_chroma_use_DiT_mask_str == "FALSE") {
-                        y = NULL;
-                    } else if (sd_chroma_use_DiT_mask_str != "ON" && sd_chroma_use_DiT_mask_str != "TRUE") {
-                        LOG_WARN("SD_CHROMA_USE_DIT_MASK environment variable has unexpected value. Assuming default (\"ON\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_USE_DIT_MASK);
-                    }
+                if (!use_mask) {
+                    y = NULL;
                }

-                // ggml_arrange is not working on some backends, and y isn't used, so let's reuse y to precompute it
-                range = arange(0, 344);
-                precompute_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, range.size());
-                set_backend_tensor_data(precompute_arange, range.data());
-                // y = NULL;
+                // ggml_arange is not working on some backends, precompute it
+                mod_index_arange_vec = arange(0, 344);
+                mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
+                set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
            }
            y = to_backend(y);

@@ -1189,7 +1170,7 @@ namespace Flux {
                                  y,
                                  guidance,
                                  pe,
-                                  precompute_arange,
+                                  mod_index_arange,
                                  ref_latents,
                                  skip_layers);

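Note: because ggml_arange is unreliable on many backends, FluxRunner now precomputes the 344-entry modulation-index ramp on the CPU, caches it in the mod_index_arange_vec member, and uploads it with set_backend_tensor_data. A minimal sketch of the kind of arange helper the call above relies on; the real helper lives elsewhere in the tree, so treat this exact signature as an assumption:

    #include <vector>

    // Assumed shape of the arange helper: a float ramp [start, start+1, ..., end-1]
    // built on the CPU, so no ggml_arange op ever has to run on the backend.
    std::vector<float> arange(float start, float end, float step = 1.0f) {
        std::vector<float> result;
        result.reserve(static_cast<size_t>((end - start) / step));
        for (float v = start; v < end; v += step) {
            result.push_back(v);
        }
        return result;
    }

The resulting 344-element tensor then feeds ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f) to produce the [1, 344, 32] modulation index used in forward_orig().
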
otherarch/sdcpp/main.cpp

Lines changed: 14 additions & 1 deletion

@@ -128,6 +128,10 @@ struct SDParams {
     float slg_scale = 0.f;
     float skip_layer_start = 0.01f;
     float skip_layer_end = 0.2f;
+
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask = false;
+    int chroma_t5_mask_pad = 1;
 };

 void print_params(SDParams params) {
@@ -177,6 +181,9 @@ void print_params(SDParams params) {
     printf(" batch_count: %d\n", params.batch_count);
     printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
     printf(" upscale_repeats: %d\n", params.upscale_repeats);
+    printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
+    printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
+    printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad);
 }

 void print_usage(int argc, const char* argv[]) {
@@ -243,6 +250,9 @@ void print_usage(int argc, const char* argv[]) {
     printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
     printf(" --canny apply canny preprocessor (edge detection)\n");
     printf(" --color Colors the logging tags according to level\n");
+    printf(" --chroma-disable-dit-mask disable dit mask for chroma\n");
+    printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n");
+    printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n");
     printf(" -v, --verbose print extra info\n");
     printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n");
 }
@@ -938,7 +948,10 @@ int main(int argc, const char* argv[]) {
                              params.clip_on_cpu,
                              params.control_net_cpu,
                              params.vae_on_cpu,
-                             params.diffusion_flash_attn);
+                             params.diffusion_flash_attn,
+                             params.chroma_use_dit_mask,
+                             params.chroma_use_t5_mask,
+                             params.chroma_t5_mask_pad);

     if (sd_ctx == NULL) {
         printf("new_sd_ctx_t failed\n");

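Note: the diff adds the three chroma options to SDParams and the usage text, but the matching argument-parsing branches fall outside the hunks shown. A hypothetical sketch of what they would look like, mirroring the flag names printed above (not code from this commit; arg, i, and invalid_arg follow the assumed conventions of main.cpp's existing argv loop):

    // Hypothetical parsing branches inside the existing argument loop:
    if (arg == "--chroma-disable-dit-mask") {
        params.chroma_use_dit_mask = false;
    } else if (arg == "--chroma-enable-t5-mask") {
        params.chroma_use_t5_mask = true;
    } else if (arg == "--chroma-t5-mask-pad") {
        if (++i >= argc) {
            invalid_arg = true;  // assumed error convention of the surrounding loop
            break;
        }
        params.chroma_t5_mask_pad = std::stoi(argv[i]);
    }
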
otherarch/sdcpp/sdtype_adapter.cpp

Lines changed: 8 additions & 1 deletion

@@ -104,6 +104,10 @@ struct SDParams {
     float slg_scale = 0.f;
     float skip_layer_start = 0.01f;
     float skip_layer_end = 0.2f;
+
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask = false;
+    int chroma_t5_mask_pad = 1;
 };

 //shared
@@ -272,7 +276,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
                              sd_params->clip_on_cpu,
                              sd_params->control_net_cpu,
                              sd_params->vae_on_cpu,
-                             sd_params->diffusion_flash_attn);
+                             sd_params->diffusion_flash_attn,
+                             sd_params->chroma_use_dit_mask,
+                             sd_params->chroma_use_t5_mask,
+                             sd_params->chroma_t5_mask_pad);

     if (sd_ctx == NULL) {
         printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");

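Note: both call sites (main.cpp and sdtype_adapter.cpp) append the same three arguments after diffusion_flash_attn, which implies the context factory gained matching trailing parameters. The inferred tail of that signature is sketched below; the earlier parameters are not shown in this diff and stay elided, so treat this as an assumption until checked against the updated stable-diffusion.h:

    // Inferred from the two call sites above (assumption, not shown in this diff):
    sd_ctx_t* new_sd_ctx(/* ...existing parameters..., */
                         bool diffusion_flash_attn,
                         bool chroma_use_dit_mask,
                         bool chroma_use_t5_mask,
                         int chroma_t5_mask_pad);
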