Skip to content

Commit 86b9445

Browse files
committed
Sync sd.cpp to 90ef5f8
1 parent fab2ff0 commit 86b9445

File tree

8 files changed

+92
-66
lines changed

8 files changed

+92
-66
lines changed

otherarch/sdcpp/conditioner.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
14571457
const ConditionerParams& conditioner_params) {
14581458
std::string prompt;
14591459
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
1460-
size_t system_prompt_length = 0;
1460+
size_t system_prompt_length = 0;
14611461
int prompt_template_encode_start_idx = 34;
14621462
if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
14631463
LOG_INFO("QwenImageEditPlusPipeline");

otherarch/sdcpp/ggml_extend.hpp

Lines changed: 40 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -381,12 +381,14 @@ __STATIC_INLINE__ float sigmoid(float x) {
381381

382382
// SPECIAL OPERATIONS WITH TENSORS
383383

384-
__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
384+
__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, uint8_t* image_data = nullptr) {
385385
int64_t width = input->ne[0];
386386
int64_t height = input->ne[1];
387387
int64_t channels = input->ne[2];
388388
GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
389-
uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
389+
if (image_data == nullptr) {
390+
image_data = (uint8_t*)malloc(width * height * channels);
391+
}
390392
for (int iy = 0; iy < height; iy++) {
391393
for (int ix = 0; ix < width; ix++) {
392394
for (int k = 0; k < channels; k++) {
@@ -979,38 +981,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
979981
struct ggml_tensor* x,
980982
struct ggml_tensor* w,
981983
struct ggml_tensor* b,
982-
int s0 = 1,
983-
int s1 = 1,
984-
int p0 = 0,
985-
int p1 = 0,
986-
int d0 = 1,
987-
int d1 = 1) {
988-
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
989-
if (b != NULL) {
990-
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
991-
// b = ggml_repeat(ctx, b, x);
992-
x = ggml_add_inplace(ctx, x, b);
984+
int s0 = 1,
985+
int s1 = 1,
986+
int p0 = 0,
987+
int p1 = 0,
988+
int d0 = 1,
989+
int d1 = 1,
990+
bool direct = false,
991+
float scale = 1.f) {
992+
if (scale != 1.f) {
993+
x = ggml_scale(ctx, x, scale);
994+
}
995+
if (direct) {
996+
x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
997+
} else {
998+
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
999+
}
1000+
if (scale != 1.f) {
1001+
x = ggml_scale(ctx, x, 1.f / scale);
9931002
}
994-
return x;
995-
}
996-
997-
// w: [OC*IC, KD, KH, KW]
998-
// x: [N*IC, ID, IH, IW]
999-
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
1000-
struct ggml_tensor* x,
1001-
struct ggml_tensor* w,
1002-
struct ggml_tensor* b,
1003-
int s0 = 1,
1004-
int s1 = 1,
1005-
int p0 = 0,
1006-
int p1 = 0,
1007-
int d0 = 1,
1008-
int d1 = 1) {
1009-
x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
10101003
if (b != NULL) {
10111004
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
1012-
// b = ggml_repeat(ctx, b, x);
1013-
x = ggml_add(ctx, x, b);
1005+
x = ggml_add_inplace(ctx, x, b);
10141006
}
10151007
return x;
10161008
}
@@ -2071,6 +2063,7 @@ class Conv2d : public UnaryBlock {
20712063
std::pair<int, int> dilation;
20722064
bool bias;
20732065
bool direct = false;
2066+
float scale = 1.f;
20742067

20752068
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
20762069
enum ggml_type wtype = GGML_TYPE_F16;
@@ -2101,6 +2094,10 @@ class Conv2d : public UnaryBlock {
21012094
direct = true;
21022095
}
21032096

2097+
void set_scale(float scale_value) {
2098+
scale = scale_value;
2099+
}
2100+
21042101
std::string get_desc() {
21052102
return "Conv2d";
21062103
}
@@ -2111,11 +2108,18 @@ class Conv2d : public UnaryBlock {
21112108
if (bias) {
21122109
b = params["bias"];
21132110
}
2114-
if (direct) {
2115-
return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
2116-
} else {
2117-
return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
2118-
}
2111+
return ggml_nn_conv_2d(ctx,
2112+
x,
2113+
w,
2114+
b,
2115+
stride.second,
2116+
stride.first,
2117+
padding.second,
2118+
padding.first,
2119+
dilation.second,
2120+
dilation.first,
2121+
direct,
2122+
scale);
21192123
}
21202124
};
21212125

otherarch/sdcpp/main.cpp

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,8 @@ struct SDParams {
8080
std::string control_image_path;
8181
std::vector<std::string> ref_image_paths;
8282
std::string control_video_path;
83-
bool increase_ref_index = false;
83+
bool auto_resize_ref_image = true;
84+
bool increase_ref_index = false;
8485

8586
std::string prompt;
8687
std::string negative_prompt;
@@ -131,6 +132,7 @@ struct SDParams {
131132
prediction_t prediction = DEFAULT_PRED;
132133

133134
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
135+
bool force_sdxl_vae_conv_scale = false;
134136

135137
SDParams() {
136138
sd_sample_params_init(&sample_params);
@@ -174,6 +176,7 @@ void print_params(SDParams params) {
174176
printf(" %s\n", path.c_str());
175177
};
176178
printf(" control_video_path: %s\n", params.control_video_path.c_str());
179+
printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false");
177180
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
178181
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
179182
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
@@ -198,6 +201,7 @@ void print_params(SDParams params) {
198201
printf(" seed: %zd\n", params.seed);
199202
printf(" batch_count: %d\n", params.batch_count);
200203
printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false");
204+
printf(" force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false");
201205
printf(" upscale_repeats: %d\n", params.upscale_repeats);
202206
printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
203207
printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
@@ -242,9 +246,10 @@ void print_usage(int argc, const char* argv[]) {
242246
printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
243247
printf(" --control-image [IMAGE] path to image condition, control net\n");
244248
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
249+
printf(" --disable-auto-resize-ref-image disable auto resize of ref images\n");
245250
printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n");
246251
printf(" The video frames inside should be stored as images in lexicographical (character) order\n");
247-
printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, etc.\n");
252+
printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.\n");
248253
printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
249254
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
250255
printf(" -p, --prompt [PROMPT] the prompt to render\n");
@@ -292,6 +297,7 @@ void print_usage(int argc, const char* argv[]) {
292297
printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n");
293298
printf(" --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
294299
printf(" --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
300+
printf(" --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae\n");
295301
printf(" --vae-on-cpu keep vae in cpu (for low vram)\n");
296302
printf(" --clip-on-cpu keep clip in cpu (for low vram)\n");
297303
printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n");
@@ -562,6 +568,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
562568

563569
options.bool_options = {
564570
{"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
571+
{"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
565572
{"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
566573
{"", "--control-net-cpu", "", true, &params.control_net_cpu},
567574
{"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -575,6 +582,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
575582
{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
576583
{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
577584
{"", "--increase-ref-index", "", true, &params.increase_ref_index},
585+
{"", "--disable-auto-resize-ref-image", "", false, &params.auto_resize_ref_image},
578586
};
579587

580588
auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1382,6 +1390,7 @@ int main(int argc, const char* argv[]) {
13821390
params.diffusion_flash_attn,
13831391
params.diffusion_conv_direct,
13841392
params.vae_conv_direct,
1393+
params.force_sdxl_vae_conv_scale,
13851394
params.chroma_use_dit_mask,
13861395
params.chroma_use_t5_mask,
13871396
params.chroma_t5_mask_pad,
@@ -1423,6 +1432,7 @@ int main(int argc, const char* argv[]) {
14231432
init_image,
14241433
ref_images.data(),
14251434
(int)ref_images.size(),
1435+
params.auto_resize_ref_image,
14261436
params.increase_ref_index,
14271437
mask_image,
14281438
params.width,

otherarch/sdcpp/preprocessing.hpp

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
88
struct ggml_init_params params;
9-
params.mem_size = 20 * 1024 * 1024; // 10
9+
params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512
1010
params.mem_buffer = NULL;
1111
params.no_alloc = false;
1212
struct ggml_context* ctx0 = ggml_init(params);
@@ -164,7 +164,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
164164

165165
bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
166166
struct ggml_init_params params;
167-
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10MB
167+
params.mem_size = static_cast<size_t>(40 * img.width * img.height); // 10MB for 512x512
168168
params.mem_buffer = NULL;
169169
params.no_alloc = false;
170170
struct ggml_context* work_ctx = ggml_init(params);
@@ -218,9 +218,7 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
218218
ggml_tensor_set_f32(image, gray, ix, iy, 2);
219219
}
220220
}
221-
uint8_t* output = sd_tensor_to_image(image);
222-
free(img.data);
223-
img.data = output;
221+
sd_tensor_to_image(image, img.data);
224222
ggml_free(work_ctx);
225223
return true;
226224
}

otherarch/sdcpp/qwen_image.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -535,7 +535,7 @@ namespace Qwen {
535535
}
536536
}
537537
LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
538-
qwen_image = QwenImageModel(qwen_image_params);
538+
qwen_image = QwenImageModel(qwen_image_params);
539539
qwen_image.init(params_ctx, tensor_types, prefix);
540540
}
541541

otherarch/sdcpp/stable-diffusion.cpp

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -452,13 +452,6 @@ class StableDiffusionGGML {
452452

453453
if (sd_version_is_sdxl(version)) {
454454
scale_factor = 0.13025f;
455-
if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && taesd_path_fixed.size() == 0) {
456-
LOG_WARN(
457-
"!!!It looks like you are using SDXL model. "
458-
"If you find that the generated images are completely black, "
459-
"try specifying a different VAE. "
460-
"You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
461-
}
462455
} else if (sd_version_is_sd3(version)) {
463456
scale_factor = 1.5305f;
464457
} else if (sd_version_is_flux(version)) {
@@ -476,17 +469,7 @@ class StableDiffusionGGML {
476469
bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
477470

478471
{
479-
clip_backend = backend;
480-
bool use_t5xxl = false;
481-
if (sd_version_is_dit(version) && !sd_version_is_qwen_image(version)) {
482-
use_t5xxl = true;
483-
}
484-
if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
485-
LOG_WARN(
486-
"!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
487-
"If you notice that the generated images are completely black,"
488-
"try running the T5 model on the CPU using the --clip-on-cpu parameter.");
489-
}
472+
clip_backend = backend;
490473
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
491474
LOG_INFO("CLIP: Using CPU backend");
492475
clip_backend = ggml_backend_cpu_init();
@@ -649,6 +632,15 @@ class StableDiffusionGGML {
649632
LOG_INFO("Using Conv2d direct in the vae model");
650633
first_stage_model->enable_conv2d_direct();
651634
}
635+
if (version == VERSION_SDXL &&
636+
(strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
637+
float vae_conv_2d_scale = 1.f / 32.f;
638+
LOG_WARN(
639+
"No VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
640+
"using Conv2D scale %.3f",
641+
vae_conv_2d_scale);
642+
first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
643+
}
652644
first_stage_model->alloc_params_buffer();
653645
first_stage_model->get_param_tensors(tensors, "first_stage_model");
654646
} else {
@@ -2150,6 +2142,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
21502142
"seed: %" PRId64
21512143
"batch_count: %d\n"
21522144
"ref_images_count: %d\n"
2145+
"auto_resize_ref_image: %s\n"
21532146
"increase_ref_index: %s\n"
21542147
"control_strength: %.2f\n"
21552148
"photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
@@ -2164,6 +2157,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
21642157
sd_img_gen_params->seed,
21652158
sd_img_gen_params->batch_count,
21662159
sd_img_gen_params->ref_images_count,
2160+
BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
21672161
BOOL_STR(sd_img_gen_params->increase_ref_index),
21682162
sd_img_gen_params->control_strength,
21692163
sd_img_gen_params->pm_params.style_strength,
@@ -2804,14 +2798,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
28042798
std::vector<ggml_tensor*> ref_latents;
28052799
for (int i = 0; i < ref_images.size(); i++) {
28062800
ggml_tensor* img;
2807-
if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
2801+
if (sd_img_gen_params->auto_resize_ref_image) {
2802+
LOG_DEBUG("auto resize ref images");
28082803
sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
28092804
int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
28102805
double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
28112806
double vae_height = vae_width * ref_image.height / ref_image.width;
28122807

2813-
vae_height = round(vae_height / 32) * 32;
2814-
vae_width = round(vae_width / 32) * 32;
2808+
int factor = 16;
2809+
if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
2810+
factor = 32;
2811+
}
2812+
2813+
vae_height = round(vae_height / factor) * factor;
2814+
vae_width = round(vae_width / factor) * factor;
28152815

28162816
sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
28172817
free(ref_image.data);

otherarch/sdcpp/stable-diffusion.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,7 @@ typedef struct {
164164
bool diffusion_flash_attn;
165165
bool diffusion_conv_direct;
166166
bool vae_conv_direct;
167+
bool force_sdxl_vae_conv_scale;
167168
bool chroma_use_dit_mask;
168169
bool chroma_use_t5_mask;
169170
int chroma_t5_mask_pad;
@@ -215,6 +216,7 @@ typedef struct {
215216
sd_image_t init_image;
216217
sd_image_t* ref_images;
217218
int ref_images_count;
219+
bool auto_resize_ref_image;
218220
bool increase_ref_index;
219221
sd_image_t mask_image;
220222
int width;

otherarch/sdcpp/vae.hpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -530,6 +530,7 @@ struct VAE : public GGMLRunner {
530530
struct ggml_context* output_ctx) = 0;
531531
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
532532
virtual void enable_conv2d_direct(){};
533+
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
533534
};
534535

535536
struct AutoEncoderKL : public VAE {
@@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE {
558559
}
559560
}
560561

562+
void set_conv2d_scale(float scale) {
563+
std::vector<GGMLBlock*> blocks;
564+
ae.get_all_blocks(blocks);
565+
for (auto block : blocks) {
566+
if (block->get_desc() == "Conv2d") {
567+
auto conv_block = (Conv2d*)block;
568+
conv_block->set_scale(scale);
569+
}
570+
}
571+
}
572+
561573
std::string get_desc() {
562574
return "vae";
563575
}

0 commit comments

Comments
 (0)