Commit a160cc9

Merge branch 'add-winograd-conv2d-v1' of https://github.com/bssrdf/stable-diffusion.cpp into server_flash_winograd1
2 parents 986b630 + 4e9f036 commit a160cc9

File tree

7 files changed: +215 -23 lines changed

common.hpp

Lines changed: 13 additions & 4 deletions
@@ -49,12 +49,15 @@ class UpSampleBlock : public GGMLBlock {
                   int out_channels)
         : channels(channels),
           out_channels(out_channels) {
-        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+        if(channels % 8 == 0 && out_channels % 64 == 0)
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(channels, out_channels));
+        else
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         // x: [N, channels, h, w]
-        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
+        auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks["conv"]);
 
         x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
         x = conv->forward(ctx, x);    // [N, out_channels, h*2, w*2]
@@ -82,7 +85,12 @@ class ResBlock : public GGMLBlock {
         if (dims == 3) {
             return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
         } else {
-            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
+            if (kernel_size.first == 3 && kernel_size.second == 3 &&
+                in_channels % 8 == 0 && out_channels % 64 == 0 &&
+                padding.first == 1 && padding.second == 1)
+                return std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(in_channels, out_channels));
+            else
+                return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
         }
     }
@@ -138,8 +146,9 @@ class ResBlock : public GGMLBlock {
         // in_layers
         auto h = in_layers_0->forward(ctx, x);
         h = ggml_silu_inplace(ctx, h);
+        // print_ggml_tensor(h, true, "bef in_layer");
         h = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
-
+        // print_ggml_tensor(h, true, "aft in_layer");
         // emb_layers
         if (!skip_t_emb) {
             auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
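
Note: the `channels % 8 == 0 && out_channels % 64 == 0` test above is the gate for the new Winograd path. A minimal sketch of the predicate, pulled out for readability (the helper name is hypothetical; the commit inlines the test at each construction site, and ResBlock additionally requires a 3x3 kernel with stride 1 and padding 1):

```cpp
#include <cstdint>

// Hypothetical helper equivalent to the checks inlined above. The %8 / %64
// requirements presumably match the channel tiling of the Winograd CUDA
// kernel in the bssrdf ggml fork that the submodule bump pulls in.
static bool winograd_eligible(int64_t in_channels, int64_t out_channels) {
    return in_channels % 8 == 0 && out_channels % 64 == 0;
}
```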

diffusion_model.hpp

Lines changed: 14 additions & 0 deletions
@@ -24,6 +24,8 @@ struct DiffusionModel {
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size()                                             = 0;
     virtual int64_t get_adm_in_channels()                                               = 0;
+    virtual void transform(int n) = 0;
+
 };
 
 struct UNetModel : public DiffusionModel {
@@ -40,6 +42,10 @@ struct UNetModel : public DiffusionModel {
         unet.alloc_params_buffer();
     }
 
+    void transform(int n){
+        unet.transform(n);
+    }
+
     void free_params_buffer() {
         unet.free_params_buffer();
     }
@@ -109,6 +115,10 @@ struct MMDiTModel : public DiffusionModel {
         return 768 + 1280;
     }
 
+    void transform(int n){
+
+    }
+
     void compute(int n_threads,
                  struct ggml_tensor* x,
                  struct ggml_tensor* timesteps,
@@ -159,6 +169,10 @@ struct FluxModel : public DiffusionModel {
         return 768;
     }
 
+    void transform(int n){
+
+    }
+
     void compute(int n_threads,
                  struct ggml_tensor* x,
                  struct ggml_tensor* timesteps,
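
Note: `transform(int n)` is pure virtual on DiffusionModel, so every backend must implement it; only the UNet path does real work, while MMDiT and Flux keep empty bodies because the Winograd path is only wired up for 3x3 convolutions in this branch. A small usage sketch (the wrapper function is hypothetical):

```cpp
// Hypothetical caller; mirrors what stable-diffusion.cpp does after loading.
void precompute_weight_transforms(DiffusionModel* model, int n_threads) {
    model->transform(n_threads);  // no-op for MMDiTModel / FluxModel
}
```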

ggml

Submodule ggml updated from 21d3a30 to 9a389a2

ggml_extend.hpp

Lines changed: 119 additions & 1 deletion
@@ -591,7 +591,47 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
                                                       int p1 = 0,
                                                       int d0 = 1,
                                                       int d1 = 1) {
+    // if(w->ne[0]==3 && w->ne[1]==3 && p0==1 && p1==1 && s0==1 && s1==1 &&
+    //    d0==1 && d1==1 && w->ne[3]%64 == 0 && w->ne[2]%8 == 0 && x->ne[3] == 1){
+
+    //     printf("x-shape 0: (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    //     printf(" (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    //     print_ggml_tensor(x, false, "bef wino");
+    //     x = ggml_conv_2d_3x3(ctx, w, x);
+    //     print_ggml_tensor(x, false, "aft wino");
+    //     printf("x-shape 2: (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    // }
+    // else{
     x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    // if(w->ne[0]==3 && w->ne[1]==3 && p0==1 && p1==1 && s0==1 && s1==1 &&
+    //    d0==1 && d1==1 && w->ne[3]%64 == 0 && w->ne[2]%8 == 0 && x->ne[3] == 1){
+    //     printf("x-shape1: (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    // }
+    // }
+    if (b != NULL) {
+        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
+        // b = ggml_repeat(ctx, b, x);
+        x = ggml_add(ctx, x, b);
+    }
+    return x;
+}
+
+// w: [IC, 4, 4, OC]
+// x: [1, IC, IH, IW]
+// b: [OC,]
+// result: [N, OC, OH, OW]
+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d1x3x3(struct ggml_context* ctx,
+                                                           struct ggml_tensor* x,
+                                                           struct ggml_tensor* w,
+                                                           struct ggml_tensor* b) {
+    // int64_t *ne = x->ne;
+    // if(!w) printf("w is null\n");
+    // int64_t *ne1 = w->ne;
+    // printf("before: (%ld, %ld, %ld, %ld), (%ld, %ld, %ld, %ld)\n", ne[0], ne[1], ne[2], ne[3], ne1[0], ne1[1], ne1[2], ne1[3]);
+    x = ggml_winograd_stage1(ctx, w, x);
+    // ne = x->ne;
+    // printf("after: (%ld, %ld, %ld, %ld)\n", ne[0], ne[1], ne[2], ne[3]);
     if (b != NULL) {
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
         // b = ggml_repeat(ctx, b, x);
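
Note: the stage0/stage1 split is the classic Winograd minimal-filtering decomposition: a one-time kernel transform, then a per-forward input-tile transform, elementwise multiply, and inverse transform. The `w: [IC, 4, 4, OC]` layout documented above is consistent with F(2x2, 3x3) (Lavin & Gray); whether the fork's kernels use exactly the textbook matrices is an assumption on our part:

```text
Y = Aᵀ [ (G g Gᵀ) ⊙ (Bᵀ d B) ] A    g: 3x3 kernel, d: 4x4 input tile, Y: 2x2 output tile

     [ 1    0    0  ]        [ 1  0 -1  0 ]        [ 1  1  1  0 ]
G  = [ 1/2  1/2  1/2 ]  Bᵀ = [ 0  1  1  0 ]   Aᵀ = [ 0  1 -1 -1 ]
     [ 1/2 -1/2  1/2 ]       [ 0 -1  1  0 ]
     [ 0    0    1  ]        [ 0  1  0 -1 ]
```

Stage 0 computes U = G g Gᵀ once per (IC, OC) pair, which is exactly what the 4x4 dimensions in the weight layout suggest; stage 1 then only pays for the input and output transforms at inference time.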
@@ -1001,7 +1041,7 @@ struct GGMLRunner {
 
         // compute the required memory
         size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
+        LOG_INFO("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
                   ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
@@ -1019,6 +1059,8 @@ struct GGMLRunner {
         backend_tensor_data_map.clear();
     }
 
+    virtual void transform(int n){};
+
 public:
     virtual std::string get_desc() = 0;
 
@@ -1155,14 +1197,29 @@ class GGMLBlock {
         }
     }
 
+    void transform_blocks(struct ggml_context* ctx, int n, ggml_backend_t backend) {
+        for (auto& pair : blocks) {
+            auto& block = pair.second;
+
+            block->transform(ctx, n, backend);
+        }
+    }
+
     virtual void init_params(struct ggml_context* ctx, ggml_type wtype) {}
 
+    virtual void transform_params(struct ggml_context* ctx, int n, ggml_backend_t backend){}
+
 public:
     void init(struct ggml_context* ctx, ggml_type wtype) {
         init_blocks(ctx, wtype);
         init_params(ctx, wtype);
     }
 
+    void transform(struct ggml_context* ctx, int n, ggml_backend_t backend) {
+        transform_blocks(ctx, n, backend);
+        transform_params(ctx, n, backend);
+    }
+
     size_t get_params_num() {
         size_t num_tensors = params.size();
         for (auto& pair : blocks) {
@@ -1313,16 +1370,77 @@ class Conv2d : public UnaryBlock {
           dilation(dilation),
           bias(bias) {}
 
+    // Conv2d(){}
+
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
         struct ggml_tensor* b = NULL;
         if (bias) {
             b = params["bias"];
         }
+        // if(kernel_size.first == 3){
+        //     printf(" (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], in_channels, out_channels);
+        //     // printf(" (%d - %d - %d) \n", stride.first, padding.first, dilation.first);
+        // }
         return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
     }
 };
 
+class Conv2d1x3x3 : public UnaryBlock {
+protected:
+    int64_t in_channels;
+    int64_t out_channels;
+    bool bias;
+
+    struct ggml_tensor* trans = NULL;
+
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["weight"] = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, out_channels);
+        // params["transform"] = ggml_winograd_stage0(ctx, params["weight"]);
+        trans = ggml_winograd_stage0(ctx, params["weight"]);
+        if (bias) {
+            params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
+        }
+    }
+
+    void transform_params(struct ggml_context* ctx, int n_threads, ggml_backend_t backend) {
+        // struct ggml_tensor* w = params["weight"];
+        // struct ggml_tensor* t = ggml_winograd_stage0(ctx, w);
+        struct ggml_cgraph* gf = ggml_new_graph(ctx);
+        ggml_build_forward_expand(gf, trans);
+        if (ggml_backend_is_cpu(backend)) {
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        }
+        ggml_backend_graph_compute(backend, gf);
+        params["transform"] = trans;
+        ggml_graph_clear(gf);
+        trans->src[0] = NULL;  // not elegant!! skip FX during wino_stage1
+    }
+
+public:
+    Conv2d1x3x3(int64_t in_channels,
+                int64_t out_channels,
+                bool bias = true)
+        : in_channels(in_channels),
+          out_channels(out_channels),
+          bias(bias) {}
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // struct ggml_tensor* w = params["weight"];
+        struct ggml_tensor* w = params["transform"];
+        struct ggml_tensor* b = NULL;
+        if (bias) {
+            b = params["bias"];
+        }
+        // return ggml_nn_conv_2d1x3x3(ctx, x, w, b);
+        return ggml_nn_conv_2d1x3x3(ctx, x, trans, b);
+    }
+};
+
 class Conv3dnx1x1 : public UnaryBlock {
 protected:
     int64_t in_channels;
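
Note: read together, a Conv2d1x3x3 goes through three phases. A condensed, hedged sketch of that lifecycle (assumes a live ggml_context/backend and winograd-eligible channel counts; the stage-1 internals are inferred from the standard algorithm, not from the fork's source):

```cpp
// 1) construction + init: allocate the raw F16 3x3 weight and record the
//    lazy stage-0 transform node.
Conv2d1x3x3 conv(/*in_channels=*/320, /*out_channels=*/320);
// conv.init(ctx, GGML_TYPE_F16);
//
// 2) one-time transform after weight load: run the stage-0 graph on the
//    backend and cache the result as params["transform"].
// conv.transform(ctx, /*n=*/1, backend);
//
// 3) every forward pass: only stage 1 runs (input transform, multiply,
//    inverse transform), plus the bias add.
// struct ggml_tensor* y = conv.forward(ctx, x);
```

The `trans->src[0] = NULL;` hack appears to cut the graph edge from the cached transform back to the raw weight, so that rebuilding the forward graph does not re-trigger stage 0.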

stable-diffusion.cpp

Lines changed: 2 additions & 0 deletions
@@ -424,6 +424,8 @@ class StableDiffusionGGML {
         }
 
         // LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
+        diffusion_model->transform(1);
+        first_stage_model->transform(1);
 
         if (version == VERSION_SVD) {
             // diffusion_model->test();
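
Note: placement matters here. Both calls run once, right after the parameter buffers are filled, so the Winograd kernel transforms are paid at load time rather than on every denoising step; the argument is forwarded as a thread count to CPU backends. A condensed view of the assumed flow (not verbatim):

```cpp
// ... read checkpoint, upload weight tensors to the backend ...
diffusion_model->transform(1);    // UNet: Winograd stage 0 over eligible 3x3 convs
first_stage_model->transform(1);  // same hook on the first-stage (VAE) runner
// ... continue with version-specific setup (e.g. VERSION_SVD) ...
```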

unet.hpp

Lines changed: 31 additions & 8 deletions
@@ -217,8 +217,11 @@ class UnetModelBlock : public GGMLBlock {
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
         }
 
-        // input_blocks
-        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
+        // input_blocks
+        if(in_channels % 8 == 0 && model_channels % 64 == 0)
+            blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(in_channels, model_channels));
+        else
+            blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
 
         std::vector<int> input_block_chans;
         input_block_chans.push_back(model_channels);
@@ -336,7 +339,10 @@ class UnetModelBlock : public GGMLBlock {
         // out
         blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));  // ch == model_channels
         // out_1 is nn.SiLU()
-        blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+        if(model_channels % 8 == 0 && out_channels % 64 == 0)
+            blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(model_channels, out_channels));
+        else
+            blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
     }
 
     struct ggml_tensor* resblock_forward(std::string name,
@@ -407,10 +413,19 @@ class UnetModelBlock : public GGMLBlock {
 
         auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
         auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
-        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
+        // std::shared_ptr<UnaryBlock> input_blocks_0_0;
+        // if(in_channels % 8 == 0 && model_channels % 64 == 0)
+        auto input_blocks_0_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["input_blocks.0.0"]);
+        // else
+        //     input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
+
 
         auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
-        auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);
+        // std::shared_ptr<UnaryBlock> out_2;
+        // if(model_channels % 8 == 0 && out_channels % 64 == 0)
+        auto out_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out.2"]);
+        // else
+        //     out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);
 
         auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
 
@@ -432,10 +447,11 @@ class UnetModelBlock : public GGMLBlock {
 
         // input_blocks
         std::vector<struct ggml_tensor*> hs;
-
+        // print_ggml_tensor(x, true, "input to unet");
         // input block 0
         auto h = input_blocks_0_0->forward(ctx, x);
-
+        // print_ggml_tensor(h, true, "after input block 0 0");
+
         ggml_set_name(h, "bench-start");
         hs.push_back(h);
         // input block 1-11
@@ -447,7 +463,9 @@ class UnetModelBlock : public GGMLBlock {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_block_idx += 1;
                 std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                // print_ggml_tensor(h, true, "bef res block");
                 h = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
+                // print_ggml_tensor(h, true, "after res block");
                 if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                     std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                     h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
@@ -466,7 +484,7 @@ class UnetModelBlock : public GGMLBlock {
             }
         }
         // [N, 4*model_channels, h/8, w/8]
-
+        // print_ggml_tensor(h, true, "bef mid block");
         // middle_block
         h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);              // [N, 4*model_channels, h/8, w/8]
         h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);   // [N, 4*model_channels, h/8, w/8]
@@ -478,6 +496,7 @@ class UnetModelBlock : public GGMLBlock {
         }
         int control_offset = controls.size() - 2;
 
+        // print_ggml_tensor(h, true, "bef out block");
         // output_blocks
         int output_block_idx = 0;
         for (int i = (int)len_mults - 1; i >= 0; i--) {
@@ -543,6 +562,10 @@ struct UNetModelRunner : public GGMLRunner {
         return "unet";
     }
 
+    void transform(int n){
+        unet.transform(params_ctx, n, backend);
+    }
+
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         unet.get_param_tensors(tensors, prefix);
    }
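
Note: the cast changes are the key to the dispatch: `blocks["input_blocks.0.0"]` and `blocks["out.2"]` may now hold either a Conv2d or a Conv2d1x3x3, so the forward pass downcasts to their common UnaryBlock base instead of the concrete class. A minimal sketch of the pattern:

```cpp
// Virtual dispatch through the shared base class picks the Winograd or the
// im2col implementation at runtime (names as in this commit).
std::shared_ptr<UnaryBlock> conv =
    std::dynamic_pointer_cast<UnaryBlock>(blocks["input_blocks.0.0"]);
h = conv->forward(ctx, x);
```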
