Skip to content

Commit 86b9445

Browse files
committed
Sync sd.cpp to 90ef5f8
1 parent fab2ff0 commit 86b9445

File tree

8 files changed

+92
-66
lines changed

8 files changed

+92
-66
lines changed

otherarch/sdcpp/conditioner.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
14571457
const ConditionerParams& conditioner_params) {
14581458
std::string prompt;
14591459
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
1460-
size_t system_prompt_length = 0;
1460+
size_t system_prompt_length = 0;
14611461
int prompt_template_encode_start_idx = 34;
14621462
if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
14631463
LOG_INFO("QwenImageEditPlusPipeline");

otherarch/sdcpp/ggml_extend.hpp

Lines changed: 40 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -381,12 +381,14 @@ __STATIC_INLINE__ float sigmoid(float x) {
381381

382382
// SPECIAL OPERATIONS WITH TENSORS
383383

384-
__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
384+
__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, uint8_t* image_data = nullptr) {
385385
int64_t width = input->ne[0];
386386
int64_t height = input->ne[1];
387387
int64_t channels = input->ne[2];
388388
GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
389-
uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
389+
if (image_data == nullptr) {
390+
image_data = (uint8_t*)malloc(width * height * channels);
391+
}
390392
for (int iy = 0; iy < height; iy++) {
391393
for (int ix = 0; ix < width; ix++) {
392394
for (int k = 0; k < channels; k++) {
@@ -979,38 +981,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
979981
struct ggml_tensor* x,
980982
struct ggml_tensor* w,
981983
struct ggml_tensor* b,
982-
int s0 = 1,
983-
int s1 = 1,
984-
int p0 = 0,
985-
int p1 = 0,
986-
int d0 = 1,
987-
int d1 = 1) {
988-
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
989-
if (b != NULL) {
990-
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
991-
// b = ggml_repeat(ctx, b, x);
992-
x = ggml_add_inplace(ctx, x, b);
984+
int s0 = 1,
985+
int s1 = 1,
986+
int p0 = 0,
987+
int p1 = 0,
988+
int d0 = 1,
989+
int d1 = 1,
990+
bool direct = false,
991+
float scale = 1.f) {
992+
if (scale != 1.f) {
993+
x = ggml_scale(ctx, x, scale);
994+
}
995+
if (direct) {
996+
x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
997+
} else {
998+
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
999+
}
1000+
if (scale != 1.f) {
1001+
x = ggml_scale(ctx, x, 1.f / scale);
9931002
}
994-
return x;
995-
}
996-
997-
// w: [OC*IC, KD, KH, KW]
998-
// x: [N*IC, ID, IH, IW]
999-
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
1000-
struct ggml_tensor* x,
1001-
struct ggml_tensor* w,
1002-
struct ggml_tensor* b,
1003-
int s0 = 1,
1004-
int s1 = 1,
1005-
int p0 = 0,
1006-
int p1 = 0,
1007-
int d0 = 1,
1008-
int d1 = 1) {
1009-
x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
10101003
if (b != NULL) {
10111004
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
1012-
// b = ggml_repeat(ctx, b, x);
1013-
x = ggml_add(ctx, x, b);
1005+
x = ggml_add_inplace(ctx, x, b);
10141006
}
10151007
return x;
10161008
}
@@ -2071,6 +2063,7 @@ class Conv2d : public UnaryBlock {
20712063
std::pair<int, int> dilation;
20722064
bool bias;
20732065
bool direct = false;
2066+
float scale = 1.f;
20742067

20752068
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
20762069
enum ggml_type wtype = GGML_TYPE_F16;
@@ -2101,6 +2094,10 @@ class Conv2d : public UnaryBlock {
21012094
direct = true;
21022095
}
21032096

2097+
void set_scale(float scale_value) {
2098+
scale = scale_value;
2099+
}
2100+
21042101
std::string get_desc() {
21052102
return "Conv2d";
21062103
}
@@ -2111,11 +2108,18 @@ class Conv2d : public UnaryBlock {
21112108
if (bias) {
21122109
b = params["bias"];
21132110
}
2114-
if (direct) {
2115-
return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
2116-
} else {
2117-
return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
2118-
}
2111+
return ggml_nn_conv_2d(ctx,
2112+
x,
2113+
w,
2114+
b,
2115+
stride.second,
2116+
stride.first,
2117+
padding.second,
2118+
padding.first,
2119+
dilation.second,
2120+
dilation.first,
2121+
direct,
2122+
scale);
21192123
}
21202124
};
21212125

otherarch/sdcpp/main.cpp

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,8 @@ struct SDParams {
8080
std::string control_image_path;
8181
std::vector<std::string> ref_image_paths;
8282
std::string control_video_path;
83-
bool increase_ref_index = false;
83+
bool auto_resize_ref_image = true;
84+
bool increase_ref_index = false;
8485

8586
std::string prompt;
8687
std::string negative_prompt;
@@ -131,6 +132,7 @@ struct SDParams {
131132
prediction_t prediction = DEFAULT_PRED;
132133

133134
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
135+
bool force_sdxl_vae_conv_scale = false;
134136

135137
SDParams() {
136138
sd_sample_params_init(&sample_params);
@@ -174,6 +176,7 @@ void print_params(SDParams params) {
174176
printf(" %s\n", path.c_str());
175177
};
176178
printf(" control_video_path: %s\n", params.control_video_path.c_str());
179+
printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false");
177180
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
178181
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
179182
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
@@ -198,6 +201,7 @@ void print_params(SDParams params) {
198201
printf(" seed: %zd\n", params.seed);
199202
printf(" batch_count: %d\n", params.batch_count);
200203
printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false");
204+
printf(" force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false");
201205
printf(" upscale_repeats: %d\n", params.upscale_repeats);
202206
printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
203207
printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
@@ -242,9 +246,10 @@ void print_usage(int argc, const char* argv[]) {
242246
printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
243247
printf(" --control-image [IMAGE] path to image condition, control net\n");
244248
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
249+
printf(" --disable-auto-resize-ref-image disable auto resize of ref images\n");
245250
printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n");
246251
printf(" The video frames inside should be stored as images in lexicographical (character) order\n");
247-
printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, etc.\n");
252+
printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.\n");
248253
printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
249254
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
250255
printf(" -p, --prompt [PROMPT] the prompt to render\n");
@@ -292,6 +297,7 @@ void print_usage(int argc, const char* argv[]) {
292297
printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n");
293298
printf(" --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
294299
printf(" --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
300+
printf(" --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae\n");
295301
printf(" --vae-on-cpu keep vae in cpu (for low vram)\n");
296302
printf(" --clip-on-cpu keep clip in cpu (for low vram)\n");
297303
printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n");
@@ -562,6 +568,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
562568

563569
options.bool_options = {
564570
{"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
571+
{"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
565572
{"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
566573
{"", "--control-net-cpu", "", true, &params.control_net_cpu},
567574
{"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -575,6 +582,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
575582
{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
576583
{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
577584
{"", "--increase-ref-index", "", true, &params.increase_ref_index},
585+
{"", "--disable-auto-resize-ref-image", "", false, &params.auto_resize_ref_image},
578586
};
579587

580588
auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1382,6 +1390,7 @@ int main(int argc, const char* argv[]) {
13821390
params.diffusion_flash_attn,
13831391
params.diffusion_conv_direct,
13841392
params.vae_conv_direct,
1393+
params.force_sdxl_vae_conv_scale,
13851394
params.chroma_use_dit_mask,
13861395
params.chroma_use_t5_mask,
13871396
params.chroma_t5_mask_pad,
@@ -1423,6 +1432,7 @@ int main(int argc, const char* argv[]) {
14231432
init_image,
14241433
ref_images.data(),
14251434
(int)ref_images.size(),
1435+
params.auto_resize_ref_image,
14261436
params.increase_ref_index,
14271437
mask_image,
14281438
params.width,

otherarch/sdcpp/preprocessing.hpp

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
88
struct ggml_init_params params;
9-
params.mem_size = 20 * 1024 * 1024; // 10
9+
params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512
1010
params.mem_buffer = NULL;
1111
params.no_alloc = false;
1212
struct ggml_context* ctx0 = ggml_init(params);
@@ -164,7 +164,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
164164

165165
bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
166166
struct ggml_init_params params;
167-
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10MB
167+
params.mem_size = static_cast<size_t>(40 * img.width * img.height); // 10MB for 512x512
168168
params.mem_buffer = NULL;
169169
params.no_alloc = false;
170170
struct ggml_context* work_ctx = ggml_init(params);
@@ -218,9 +218,7 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
218218
ggml_tensor_set_f32(image, gray, ix, iy, 2);
219219
}
220220
}
221-
uint8_t* output = sd_tensor_to_image(image);
222-
free(img.data);
223-
img.data = output;
221+
sd_tensor_to_image(image, img.data);
224222
ggml_free(work_ctx);
225223
return true;
226224
}

otherarch/sdcpp/qwen_image.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -535,7 +535,7 @@ namespace Qwen {
535535
}
536536
}
537537
LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
538-
qwen_image = QwenImageModel(qwen_image_params);
538+
qwen_image = QwenImageModel(qwen_image_params);
539539
qwen_image.init(params_ctx, tensor_types, prefix);
540540
}
541541

otherarch/sdcpp/stable-diffusion.cpp

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -452,13 +452,6 @@ class StableDiffusionGGML {
452452

453453
if (sd_version_is_sdxl(version)) {
454454
scale_factor = 0.13025f;
455-
if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && taesd_path_fixed.size() == 0) {
456-
LOG_WARN(
457-
"!!!It looks like you are using SDXL model. "
458-
"If you find that the generated images are completely black, "
459-
"try specifying a different VAE. "
460-
"You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
461-
}
462455
} else if (sd_version_is_sd3(version)) {
463456
scale_factor = 1.5305f;
464457
} else if (sd_version_is_flux(version)) {
@@ -476,17 +469,7 @@ class StableDiffusionGGML {
476469
bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
477470

478471
{
479-
clip_backend = backend;
480-
bool use_t5xxl = false;
481-
if (sd_version_is_dit(version) && !sd_version_is_qwen_image(version)) {
482-
use_t5xxl = true;
483-
}
484-
if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
485-
LOG_WARN(
486-
"!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
487-
"If you notice that the generated images are completely black,"
488-
"try running the T5 model on the CPU using the --clip-on-cpu parameter.");
489-
}
472+
clip_backend = backend;
490473
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
491474
LOG_INFO("CLIP: Using CPU backend");
492475
clip_backend = ggml_backend_cpu_init();
@@ -649,6 +632,15 @@ class StableDiffusionGGML {
649632
LOG_INFO("Using Conv2d direct in the vae model");
650633
first_stage_model->enable_conv2d_direct();
651634
}
635+
if (version == VERSION_SDXL &&
636+
(strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
637+
float vae_conv_2d_scale = 1.f / 32.f;
638+
LOG_WARN(
639+
"No VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
640+
"using Conv2D scale %.3f",
641+
vae_conv_2d_scale);
642+
first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
643+
}
652644
first_stage_model->alloc_params_buffer();
653645
first_stage_model->get_param_tensors(tensors, "first_stage_model");
654646
} else {
@@ -2150,6 +2142,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
21502142
"seed: %" PRId64
21512143
"batch_count: %d\n"
21522144
"ref_images_count: %d\n"
2145+
"auto_resize_ref_image: %s\n"
21532146
"increase_ref_index: %s\n"
21542147
"control_strength: %.2f\n"
21552148
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@@ -2164,6 +2157,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
21642157
sd_img_gen_params->seed,
21652158
sd_img_gen_params->batch_count,
21662159
sd_img_gen_params->ref_images_count,
2160+
BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
21672161
BOOL_STR(sd_img_gen_params->increase_ref_index),
21682162
sd_img_gen_params->control_strength,
21692163
sd_img_gen_params->pm_params.style_strength,
@@ -2804,14 +2798,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
28042798
std::vector<ggml_tensor*> ref_latents;
28052799
for (int i = 0; i < ref_images.size(); i++) {
28062800
ggml_tensor* img;
2807-
if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
2801+
if (sd_img_gen_params->auto_resize_ref_image) {
2802+
LOG_DEBUG("auto resize ref images");
28082803
sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
28092804
int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
28102805
double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
28112806
double vae_height = vae_width * ref_image.height / ref_image.width;
28122807

2813-
vae_height = round(vae_height / 32) * 32;
2814-
vae_width = round(vae_width / 32) * 32;
2808+
int factor = 16;
2809+
if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
2810+
factor = 32;
2811+
}
2812+
2813+
vae_height = round(vae_height / factor) * factor;
2814+
vae_width = round(vae_width / factor) * factor;
28152815

28162816
sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
28172817
free(ref_image.data);

otherarch/sdcpp/stable-diffusion.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,7 @@ typedef struct {
164164
bool diffusion_flash_attn;
165165
bool diffusion_conv_direct;
166166
bool vae_conv_direct;
167+
bool force_sdxl_vae_conv_scale;
167168
bool chroma_use_dit_mask;
168169
bool chroma_use_t5_mask;
169170
int chroma_t5_mask_pad;
@@ -215,6 +216,7 @@ typedef struct {
215216
sd_image_t init_image;
216217
sd_image_t* ref_images;
217218
int ref_images_count;
219+
bool auto_resize_ref_image;
218220
bool increase_ref_index;
219221
sd_image_t mask_image;
220222
int width;

otherarch/sdcpp/vae.hpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -530,6 +530,7 @@ struct VAE : public GGMLRunner {
530530
struct ggml_context* output_ctx) = 0;
531531
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
532532
virtual void enable_conv2d_direct(){};
533+
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
533534
};
534535

535536
struct AutoEncoderKL : public VAE {
@@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE {
558559
}
559560
}
560561

562+
void set_conv2d_scale(float scale) {
563+
std::vector<GGMLBlock*> blocks;
564+
ae.get_all_blocks(blocks);
565+
for (auto block : blocks) {
566+
if (block->get_desc() == "Conv2d") {
567+
auto conv_block = (Conv2d*)block;
568+
conv_block->set_scale(scale);
569+
}
570+
}
571+
}
572+
561573
std::string get_desc() {
562574
return "vae";
563575
}

0 commit comments

Comments
 (0)