Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,8 @@ arguments:
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
Might lower quality, since it implies converting k and v to f16.
This might crash if it is not supported by the backend.
--diffusion-conv-direct use Conv2D direct in the diffusion model
This might crash if it is not supported by the backend.
--vae-conv-direct use Conv2D direct in the vae model (should improve the performance)
This might crash if it is not supported by the backend.
--control-net-cpu keep controlnet in cpu (for low vram)
Expand Down
27 changes: 17 additions & 10 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,21 @@ class DownSampleBlock : public GGMLBlock {
int channels;
int out_channels;
bool vae_downsample;
bool direct = false;

public:
DownSampleBlock(int channels,
int out_channels,
bool vae_downsample = false)
bool vae_downsample = false,
bool direct = false)
: channels(channels),
out_channels(out_channels),
vae_downsample(vae_downsample) {
vae_downsample(vae_downsample),
direct(direct) {
if (vae_downsample) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, direct));
} else {
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct));
}
}

Expand All @@ -43,13 +46,16 @@ class UpSampleBlock : public GGMLBlock {
protected:
int channels;
int out_channels;
bool direct = false;

public:
UpSampleBlock(int channels,
int out_channels)
int out_channels,
bool direct = false)
: channels(channels),
out_channels(out_channels) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
out_channels(out_channels),
direct(direct) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
}

struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
Expand Down Expand Up @@ -381,7 +387,8 @@ class SpatialTransformer : public GGMLBlock {
int64_t d_head,
int64_t depth,
int64_t context_dim,
bool flash_attn = false)
bool flash_attn = false,
bool direct = false)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
Expand All @@ -391,14 +398,14 @@ class SpatialTransformer : public GGMLBlock {
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));

for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
}

blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
}

virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
Expand Down
5 changes: 3 additions & 2 deletions diffusion_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ struct UNetModel : public DiffusionModel {
UNetModel(ggml_backend_t backend,
const String2GGMLType& tensor_types = {},
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
bool flash_attn = false,
bool direct = false)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) {
}

void alloc_params_buffer() {
Expand Down
12 changes: 12 additions & 0 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ struct SDParams {
bool clip_on_cpu = false;
bool vae_on_cpu = false;
bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false;
bool vae_conv_direct = false;
bool canny_preprocess = false;
bool color = false;
int upscale_repeats = 1;
Expand Down Expand Up @@ -142,6 +144,8 @@ void print_params(SDParams params) {
printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
printf(" diffusion Conv2D direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
printf(" vae Conv2D direct:%s\n", params.vae_conv_direct ? "true" : "false");
printf(" strength(control): %.2f\n", params.control_strength);
printf(" prompt: %s\n", params.prompt.c_str());
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
Expand Down Expand Up @@ -232,6 +236,10 @@ void print_usage(int argc, const char* argv[]) {
printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n");
printf(" Might lower quality, since it implies converting k and v to f16.\n");
printf(" This might crash if it is not supported by the backend.\n");
printf(" --diffusion-conv-direct use Conv2D direct in the diffusion model\n");
printf(" This might crash if it is not supported by the backend.\n");
printf(" --vae-conv-direct use Conv2D direct in the vae model (should improve the performance)\n");
printf(" This might crash if it is not supported by the backend.\n");
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
printf(" --canny apply canny preprocessor (edge detection)\n");
printf(" --color colors the logging tags according to level\n");
Expand Down Expand Up @@ -422,6 +430,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
{"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
{"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
{"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
{"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
{"", "--canny", "", true, &params.canny_preprocess},
{"-v", "--verbos", "", true, &params.verbose},
{"", "--color", "", true, &params.color},
Expand Down Expand Up @@ -901,6 +911,8 @@ int main(int argc, const char* argv[]) {
params.control_net_cpu,
params.vae_on_cpu,
params.diffusion_flash_attn,
params.diffusion_conv_direct,
params.vae_conv_direct,
params.chroma_use_dit_mask,
params.chroma_use_t5_mask,
params.chroma_t5_mask_pad,
Expand Down
32 changes: 29 additions & 3 deletions ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
return x;
}

// Same contract as ggml_nn_conv_2d, but lowers to ggml_conv_2d_direct,
// letting the backend run its dedicated conv kernel where supported.
// x: input tensor, w: conv weight, b: optional per-channel bias (NULL to skip).
// s0/s1: stride, p0/p1: padding, d0/d1: dilation.
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
                                                             struct ggml_tensor* x,
                                                             struct ggml_tensor* w,
                                                             struct ggml_tensor* b,
                                                             int s0 = 1,
                                                             int s1 = 1,
                                                             int p0 = 0,
                                                             int p1 = 0,
                                                             int d0 = 1,
                                                             int d1 = 1) {
    struct ggml_tensor* out = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
    if (b == NULL) {
        return out;
    }
    // Reshape the bias [OC] to [1, 1, OC, 1] so ggml_add broadcasts it
    // across the spatial dimensions and the batch.
    struct ggml_tensor* bias = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
    return ggml_add(ctx, out, bias);
}

// w: [OC,IC, KD, 1 * 1]
// x: [N, IC, IH, IW]
// b: [OC,]
Expand Down Expand Up @@ -1464,6 +1483,7 @@ class Conv2d : public UnaryBlock {
std::pair<int, int> padding;
std::pair<int, int> dilation;
bool bias;
bool direct;

void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
enum ggml_type wtype = GGML_TYPE_F16;
Expand All @@ -1481,22 +1501,28 @@ class Conv2d : public UnaryBlock {
std::pair<int, int> stride = {1, 1},
std::pair<int, int> padding = {0, 0},
std::pair<int, int> dilation = {1, 1},
bool bias = true)
bool bias = true,
bool direct = false)
: in_channels(in_channels),
out_channels(out_channels),
kernel_size(kernel_size),
stride(stride),
padding(padding),
dilation(dilation),
bias(bias) {}
bias(bias),
direct(direct) {}

struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
struct ggml_tensor* w = params["weight"];
struct ggml_tensor* b = NULL;
if (bias) {
b = params["bias"];
}
return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
if (direct) {
return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
} else {
return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
}
}
};

Expand Down
15 changes: 12 additions & 3 deletions stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,12 @@ class StableDiffusionGGML {
LOG_INFO("CLIP: Using CPU backend");
clip_backend = ggml_backend_cpu_init();
}
if (sd_ctx_params->diffusion_conv_direct) {
LOG_INFO("Using Conv2D direct in the diffusion model");
}
if (sd_ctx_params->vae_conv_direct){
LOG_INFO("Using Conv2D direct in the vae model");
}
if (sd_ctx_params->diffusion_flash_attn) {
LOG_INFO("Using flash attention in the diffusion model");
}
Expand Down Expand Up @@ -373,7 +379,8 @@ class StableDiffusionGGML {
diffusion_model = std::make_shared<UNetModel>(backend,
model_loader.tensor_storages_types,
version,
sd_ctx_params->diffusion_flash_attn);
sd_ctx_params->diffusion_flash_attn,
sd_ctx_params->diffusion_conv_direct);
}

cond_stage_model->alloc_params_buffer();
Expand All @@ -394,15 +401,17 @@ class StableDiffusionGGML {
"first_stage_model",
vae_decode_only,
false,
version);
version,
sd_ctx_params->vae_conv_direct);
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
} else {
tae_first_stage = std::make_shared<TinyAutoEncoder>(backend,
model_loader.tensor_storages_types,
"decoder.layers",
vae_decode_only,
version);
version,
sd_ctx_params->vae_conv_direct);
}
// first_stage_model->get_param_tensors(tensors, "first_stage_model.");

Expand Down
2 changes: 2 additions & 0 deletions stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ typedef struct {
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool diffusion_flash_attn;
bool diffusion_conv_direct;
bool vae_conv_direct;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
Expand Down
Loading
Loading