
Commit 7ce4f3b

conv direct as a flag
1 parent f5b5f5c commit 7ce4f3b

11 files changed: +161 −111 lines

CMakeLists.txt
Lines changed: 0 additions & 7 deletions

@@ -31,7 +31,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
 option(SD_OPENCL "sd: opencl backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
-option(SD_CONV2D_DIRECT "sd: enable conv2d direct support" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -58,7 +57,6 @@ if (SD_OPENCL)
     message("-- Use OpenCL as backend stable-diffusion")
     set(GGML_OPENCL ON)
     add_definitions(-DSD_USE_OPENCL)
-    add_definitions(-DSD_USE_CONV2D_DIRECT)
 endif ()

 if (SD_HIPBLAS)
@@ -79,11 +77,6 @@ if(SD_MUSA)
     endif()
 endif()

-if(SD_CONV2D_DIRECT)
-    message("-- Use CONV2D Direct for VAE")
-    add_definitions(-DSD_USE_CONV2D_DIRECT)
-endif ()
-
 set(SD_LIB stable-diffusion)

 file(GLOB SD_LIB_SOURCES
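The net effect is that direct convolution stops being a build-time decision: the SD_CONV2D_DIRECT option and its SD_USE_CONV2D_DIRECT compile definition are removed entirely. Where a build was previously configured with the now-removed option, for example

    cmake .. -DSD_CONV2D_DIRECT=ON

the behavior is instead requested per run through the new --diffusion-conv-direct and --vae-conv-direct CLI flags documented in the README change below.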

README.md
Lines changed: 2 additions & 0 deletions

@@ -339,6 +339,8 @@ arguments:
   --vae-on-cpu                       keep vae in cpu (for low vram)
   --clip-on-cpu                      keep clip in cpu (for low vram)
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)
+  --diffusion-conv-direct            use Conv2D direct in the diffusion model
+  --vae-conv-direct                  use Conv2D direct in the vae model (should improve the performance)
                                      Might lower quality, since it implies converting k and v to f16.
                                      This might crash if it is not supported by the backend.
   --control-net-cpu                  keep controlnet in cpu (for low vram)
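For reference, an invocation using the new flags might look like the following (a sketch: the model path and prompt are placeholders, and sd is the CLI binary built from examples/cli):

    ./sd -m sd-v1-4.safetensors -p "a lovely cat" --diffusion-conv-direct --vae-conv-direct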

common.hpp
Lines changed: 17 additions & 10 deletions

@@ -8,18 +8,21 @@ class DownSampleBlock : public GGMLBlock {
     int channels;
     int out_channels;
     bool vae_downsample;
+    bool direct = false;

 public:
     DownSampleBlock(int channels,
                     int out_channels,
-                    bool vae_downsample = false)
+                    bool vae_downsample = false,
+                    bool direct         = false)
         : channels(channels),
           out_channels(out_channels),
-          vae_downsample(vae_downsample) {
+          vae_downsample(vae_downsample),
+          direct(direct) {
         if (vae_downsample) {
-            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, true));
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, direct));
         } else {
-            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
+            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct));
         }
     }

@@ -43,13 +46,16 @@ class UpSampleBlock : public GGMLBlock {
 protected:
     int channels;
     int out_channels;
+    bool direct = false;

 public:
     UpSampleBlock(int channels,
-                  int out_channels)
+                  int out_channels,
+                  bool direct = false)
         : channels(channels),
-          out_channels(out_channels) {
-        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true));
+          out_channels(out_channels),
+          direct(direct) {
+        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -381,7 +387,8 @@ class SpatialTransformer : public GGMLBlock {
                        int64_t d_head,
                        int64_t depth,
                        int64_t context_dim,
-                       bool flash_attn = false)
+                       bool flash_attn = false,
+                       bool direct     = false)
         : in_channels(in_channels),
           n_head(n_head),
           d_head(d_head),
@@ -391,14 +398,14 @@ class SpatialTransformer : public GGMLBlock {
         // disable_self_attn is always False
         int64_t inner_dim = n_head * d_head;  // in_channels
         blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));

         for (int i = 0; i < depth; i++) {
             std::string name = "transformer_blocks." + std::to_string(i);
             blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
         }

-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
     }

     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
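From these call sites, the Conv2d constructor being targeted appears to have the following shape (a sketch inferred from the diff; parameter names and defaults are assumptions, not copied from the project):

    // Assumed declaration matching the call sites above: kernel/stride/
    // padding/dilation are {height, width} pairs, `bias` toggles the bias
    // term, and the trailing `direct` selects the direct conv2d path.
    Conv2d(int64_t in_channels,
           int64_t out_channels,
           std::pair<int, int> kernel_size,
           std::pair<int, int> stride   = {1, 1},
           std::pair<int, int> padding  = {0, 0},
           std::pair<int, int> dilation = {1, 1},
           bool bias   = true,
           bool direct = false);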

diffusion_model.hpp
Lines changed: 3 additions & 2 deletions

@@ -34,8 +34,9 @@ struct UNetModel : public DiffusionModel {
     UNetModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
               SDVersion version = VERSION_SD1,
-              bool flash_attn   = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+              bool flash_attn   = false,
+              bool direct       = false)
+        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) {
     }

     void alloc_params_buffer() {

examples/cli/main.cpp
Lines changed: 10 additions & 0 deletions

@@ -97,6 +97,8 @@ struct SDParams {
     bool clip_on_cpu          = false;
     bool vae_on_cpu           = false;
     bool diffusion_flash_attn = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct       = false;
     bool canny_preprocess     = false;
     bool color                = false;
     int upscale_repeats       = 1;
@@ -142,6 +144,8 @@ void print_params(SDParams params) {
     printf("    controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf("    diffusion Conv2D direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
+    printf("    vae Conv2D direct:%s\n", params.vae_conv_direct ? "true" : "false");
     printf("    strength(control): %.2f\n", params.control_strength);
     printf("    prompt: %s\n", params.prompt.c_str());
     printf("    negative_prompt: %s\n", params.negative_prompt.c_str());
@@ -232,6 +236,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
     printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
     printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --diffusion-conv-direct            use Conv2D direct in the diffusion model");
+    printf("  --vae-conv-direct                  use Conv2D direct in the vae model (should improve the performance)");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            colors the logging tags according to level\n");
@@ -422,6 +428,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
         {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
         {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
+        {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
         {"", "--canny", "", true, &params.canny_preprocess},
         {"-v", "--verbos", "", true, &params.verbose},
         {"", "--color", "", true, &params.color},
@@ -901,6 +909,8 @@ int main(int argc, const char* argv[]) {
                                       params.control_net_cpu,
                                       params.vae_on_cpu,
                                       params.diffusion_flash_attn,
+                                      params.diffusion_conv_direct,
+                                      params.vae_conv_direct,
                                       params.chroma_use_dit_mask,
                                       params.chroma_use_t5_mask,
                                       params.chroma_t5_mask_pad,

ggml_extend.hpp
Lines changed: 3 additions & 7 deletions

@@ -1514,14 +1514,10 @@ class Conv2d : public UnaryBlock {
         direct = true
 #endif
         if (direct) {
-#if defined(SD_USE_CONV2D_DIRECT)
-#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
-            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-#else
-            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-#endif
-#else
+#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
             return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+#else
+            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
 #endif
         } else {
             return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
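With the nested SD_USE_CONV2D_DIRECT guard gone, `direct` is a pure runtime flag: CUDA, SYCL, and Metal still route to the im2col-based helper, and only the remaining backends take the direct path. As a rough sketch of what the direct-path helper is assumed to wrap (an illustration around ggml's ggml_conv_2d_direct, not the project's exact code):

    // Assumed shape of the direct-path helper: ggml_conv_2d_direct computes
    // the convolution without an im2col intermediate, then the bias (one
    // value per output channel) is reshaped so ggml_add can broadcast it.
    struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
                                               struct ggml_tensor* x,
                                               struct ggml_tensor* w,
                                               struct ggml_tensor* b,
                                               int s0, int s1, int p0, int p1,
                                               int d0, int d1) {
        struct ggml_tensor* out = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
        if (b != NULL) {
            b   = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
            out = ggml_add(ctx, out, b);
        }
        return out;
    }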

stable-diffusion.cpp
Lines changed: 12 additions & 3 deletions

@@ -326,6 +326,12 @@ class StableDiffusionGGML {
             LOG_INFO("CLIP: Using CPU backend");
             clip_backend = ggml_backend_cpu_init();
         }
+        if (sd_ctx_params->diffusion_conv_direct) {
+            LOG_INFO("Using Conv2D direct in the diffusion model");
+        }
+        if (sd_ctx_params->vae_conv_direct) {
+            LOG_INFO("Using Conv2D direct in the vae model");
+        }
         if (sd_ctx_params->diffusion_flash_attn) {
             LOG_INFO("Using flash attention in the diffusion model");
         }
@@ -373,7 +379,8 @@ class StableDiffusionGGML {
             diffusion_model = std::make_shared<UNetModel>(backend,
                                                           model_loader.tensor_storages_types,
                                                           version,
-                                                          sd_ctx_params->diffusion_flash_attn);
+                                                          sd_ctx_params->diffusion_flash_attn,
+                                                          sd_ctx_params->diffusion_conv_direct);
         }

         cond_stage_model->alloc_params_buffer();
@@ -394,15 +401,17 @@ class StableDiffusionGGML {
                                                                "first_stage_model",
                                                                vae_decode_only,
                                                                false,
-                                                               version);
+                                                               version,
+                                                               sd_ctx_params->vae_conv_direct);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
             tae_first_stage = std::make_shared<TinyAutoEncoder>(backend,
                                                                 model_loader.tensor_storages_types,
                                                                 "decoder.layers",
                                                                 vae_decode_only,
-                                                                version);
+                                                                version,
+                                                                sd_ctx_params->vae_conv_direct);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");

stable-diffusion.h
Lines changed: 2 additions & 0 deletions

@@ -134,6 +134,8 @@ typedef struct {
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
     bool diffusion_flash_attn;
+    bool diffusion_conv_direct;
+    bool vae_conv_direct;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
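A minimal usage sketch against the public API (hypothetical: the struct's typedef name sd_ctx_params_t, the sd_ctx_params_init() initializer, and new_sd_ctx() taking a params pointer are assumptions about the surrounding header, not shown in this diff):

    // Hypothetical usage; only the two bool fields are confirmed by this diff.
    sd_ctx_params_t ctx_params;
    sd_ctx_params_init(&ctx_params);          // assumed default-initializer
    ctx_params.diffusion_conv_direct = true;  // direct conv2d in the UNet
    ctx_params.vae_conv_direct       = true;  // direct conv2d in the VAE / TAE
    sd_ctx_t* ctx = new_sd_ctx(&ctx_params);  // assumed constructor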
