Commit a160cc9

Merge branch 'add-winograd-conv2d-v1' of https://github.com/bssrdf/stable-diffusion.cpp into server_flash_winograd1
2 parents 986b630 + 4e9f036 commit a160cc9

File tree

7 files changed: +215 -23 lines changed

common.hpp

Lines changed: 13 additions & 4 deletions
@@ -49,12 +49,15 @@ class UpSampleBlock : public GGMLBlock {
                   int out_channels)
         : channels(channels),
           out_channels(out_channels) {
-        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+        if(channels % 8 == 0 && out_channels % 64 == 0)
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(channels, out_channels));
+        else
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         // x: [N, channels, h, w]
-        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
+        auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks["conv"]);
 
         x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
         x = conv->forward(ctx, x);    // [N, out_channels, h*2, w*2]
@@ -82,7 +85,12 @@ class ResBlock : public GGMLBlock {
         if (dims == 3) {
             return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
         } else {
-            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
+            if (kernel_size.first == 3 && kernel_size.second == 3 &&
+                in_channels % 8 == 0 && out_channels % 64 == 0 &&
+                padding.first == 1 && padding.second == 1)
+                return std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(in_channels, out_channels));
+            else
+                return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
         }
     }
@@ -138,8 +146,9 @@ class ResBlock : public GGMLBlock {
         // in_layers
         auto h = in_layers_0->forward(ctx, x);
         h = ggml_silu_inplace(ctx, h);
+        // print_ggml_tensor(h, true, "bef in_layer");
         h = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
-
+        // print_ggml_tensor(h, true, "aft in_layer");
         // emb_layers
         if (!skip_t_emb) {
             auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
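
Note: the `channels % 8 == 0 && out_channels % 64 == 0` test above is the gate for the new Winograd path. A minimal sketch of the predicate, pulled out for readability (the helper name is hypothetical; the commit inlines the test at each construction site, and ResBlock additionally requires a 3x3 kernel with stride 1 and padding 1):

```cpp
#include <cstdint>

// Hypothetical helper equivalent to the checks inlined above. The %8 / %64
// requirements presumably match the channel tiling of the Winograd CUDA
// kernel in the bssrdf ggml fork that the submodule bump pulls in.
static bool winograd_eligible(int64_t in_channels, int64_t out_channels) {
    return in_channels % 8 == 0 && out_channels % 64 == 0;
}
```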

diffusion_model.hpp

Lines changed: 14 additions & 0 deletions
@@ -24,6 +24,8 @@ struct DiffusionModel {
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size()                                             = 0;
     virtual int64_t get_adm_in_channels()                                               = 0;
+    virtual void transform(int n) = 0;
+
 };
 
 struct UNetModel : public DiffusionModel {
@@ -40,6 +42,10 @@ struct UNetModel : public DiffusionModel {
         unet.alloc_params_buffer();
     }
 
+    void transform(int n){
+        unet.transform(n);
+    }
+
     void free_params_buffer() {
         unet.free_params_buffer();
     }
@@ -109,6 +115,10 @@ struct MMDiTModel : public DiffusionModel {
         return 768 + 1280;
     }
 
+    void transform(int n){
+
+    }
+
     void compute(int n_threads,
                  struct ggml_tensor* x,
                  struct ggml_tensor* timesteps,
@@ -159,6 +169,10 @@ struct FluxModel : public DiffusionModel {
         return 768;
     }
 
+    void transform(int n){
+
+    }
+
     void compute(int n_threads,
                  struct ggml_tensor* x,
                  struct ggml_tensor* timesteps,
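
Note: `transform(int n)` is pure virtual on DiffusionModel, so every backend must implement it; only the UNet path does real work, while MMDiT and Flux keep empty bodies because the Winograd path is only wired up for 3x3 convolutions in this branch. A small usage sketch (the wrapper function is hypothetical):

```cpp
// Hypothetical caller; mirrors what stable-diffusion.cpp does after loading.
void precompute_weight_transforms(DiffusionModel* model, int n_threads) {
    model->transform(n_threads);  // no-op for MMDiTModel / FluxModel
}
```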

ggml

Submodule ggml updated from 21d3a30 to 9a389a2

ggml_extend.hpp

Lines changed: 119 additions & 1 deletion
@@ -591,7 +591,47 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
                                                       int p1 = 0,
                                                       int d0 = 1,
                                                       int d1 = 1) {
+    // if(w->ne[0]==3 && w->ne[1]==3 && p0==1 && p1==1 && s0==1 && s1==1 &&
+    //    d0==1 && d1==1 && w->ne[3]%64 == 0 && w->ne[2]%8 == 0 && x->ne[3] == 1){
+
+    //     printf("x-shape 0: (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    //     printf(" (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    //     print_ggml_tensor(x, false, "bef wino");
+    //     x = ggml_conv_2d_3x3(ctx, w, x);
+    //     print_ggml_tensor(x, false, "aft wino");
+    //     printf("x-shape 2: (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    // }
+    // else{
     x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    // if(w->ne[0]==3 && w->ne[1]==3 && p0==1 && p1==1 && s0==1 && s1==1 &&
+    //    d0==1 && d1==1 && w->ne[3]%64 == 0 && w->ne[2]%8 == 0 && x->ne[3] == 1){
+    //     printf("x-shape1: (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], w->ne[2], w->ne[3]);
+    // }
+    // }
+    if (b != NULL) {
+        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
+        // b = ggml_repeat(ctx, b, x);
+        x = ggml_add(ctx, x, b);
+    }
+    return x;
+}
+
+// w: [IC, 4, 4, OC]
+// x: [1, IC, IH, IW]
+// b: [OC,]
+// result: [N, OC, OH, OW]
+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d1x3x3(struct ggml_context* ctx,
+                                                           struct ggml_tensor* x,
+                                                           struct ggml_tensor* w,
+                                                           struct ggml_tensor* b) {
+    // int64_t *ne = x->ne;
+    // if(!w) printf("w is null\n");
+    // int64_t *ne1 = w->ne;
+    // printf("before: (%ld, %ld, %ld, %ld), (%ld, %ld, %ld, %ld)\n", ne[0], ne[1], ne[2], ne[3], ne1[0], ne1[1], ne1[2], ne1[3]);
+    x = ggml_winograd_stage1(ctx, w, x);
+    // ne = x->ne;
+    // printf("after: (%ld, %ld, %ld, %ld)\n", ne[0], ne[1], ne[2], ne[3]);
     if (b != NULL) {
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
         // b = ggml_repeat(ctx, b, x);
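
Note: the stage0/stage1 split is the classic Winograd minimal-filtering decomposition: a one-time kernel transform, then a per-forward input-tile transform, elementwise multiply, and inverse transform. The `w: [IC, 4, 4, OC]` layout documented above is consistent with F(2x2, 3x3) (Lavin & Gray); whether the fork's kernels use exactly the textbook matrices is an assumption on our part:

```text
Y = Aᵀ [ (G g Gᵀ) ⊙ (Bᵀ d B) ] A    g: 3x3 kernel, d: 4x4 input tile, Y: 2x2 output tile

     [ 1    0    0  ]        [ 1  0 -1  0 ]        [ 1  1  1  0 ]
G  = [ 1/2  1/2  1/2 ]  Bᵀ = [ 0  1  1  0 ]   Aᵀ = [ 0  1 -1 -1 ]
     [ 1/2 -1/2  1/2 ]       [ 0 -1  1  0 ]
     [ 0    0    1  ]        [ 0  1  0 -1 ]
```

Stage 0 computes U = G g Gᵀ once per (IC, OC) pair, which is exactly what the 4x4 dimensions in the weight layout suggest; stage 1 then only pays for the input and output transforms at inference time.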
@@ -1001,7 +1041,7 @@ struct GGMLRunner {
 
         // compute the required memory
         size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
+        LOG_INFO("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
                   ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
@@ -1019,6 +1059,8 @@ struct GGMLRunner {
         backend_tensor_data_map.clear();
     }
 
+    virtual void transform(int n){};
+
 public:
     virtual std::string get_desc() = 0;
 
@@ -1155,14 +1197,29 @@ class GGMLBlock {
         }
     }
 
+    void transform_blocks(struct ggml_context* ctx, int n, ggml_backend_t backend) {
+        for (auto& pair : blocks) {
+            auto& block = pair.second;
+
+            block->transform(ctx, n, backend);
+        }
+    }
+
     virtual void init_params(struct ggml_context* ctx, ggml_type wtype) {}
 
+    virtual void transform_params(struct ggml_context* ctx, int n, ggml_backend_t backend){}
+
 public:
     void init(struct ggml_context* ctx, ggml_type wtype) {
         init_blocks(ctx, wtype);
         init_params(ctx, wtype);
     }
 
+    void transform(struct ggml_context* ctx, int n, ggml_backend_t backend) {
+        transform_blocks(ctx, n, backend);
+        transform_params(ctx, n, backend);
+    }
+
     size_t get_params_num() {
         size_t num_tensors = params.size();
         for (auto& pair : blocks) {
@@ -1313,16 +1370,77 @@ class Conv2d : public UnaryBlock {
           dilation(dilation),
           bias(bias) {}
 
+    // Conv2d(){}
+
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
         struct ggml_tensor* b = NULL;
         if (bias) {
             b = params["bias"];
         }
+        // if(kernel_size.first == 3){
+        //     printf(" (%zu, %zu, %zu, %zu) %zu, %zu \n", x->ne[0], x->ne[1], x->ne[2], x->ne[3], in_channels, out_channels);
+        //     // printf(" (%d - %d - %d) \n", stride.first, padding.first, dilation.first);
+        // }
         return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
     }
 };
 
+class Conv2d1x3x3 : public UnaryBlock {
+protected:
+    int64_t in_channels;
+    int64_t out_channels;
+    bool bias;
+
+    struct ggml_tensor* trans = NULL;
+
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["weight"] = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, out_channels);
+        // params["transform"] = ggml_winograd_stage0(ctx, params["weight"]);
+        trans = ggml_winograd_stage0(ctx, params["weight"]);
+        if (bias) {
+            params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
+        }
+    }
+
+    void transform_params(struct ggml_context* ctx, int n_threads, ggml_backend_t backend) {
+        // struct ggml_tensor* w = params["weight"];
+        // struct ggml_tensor* t = ggml_winograd_stage0(ctx, w);
+        struct ggml_cgraph* gf = ggml_new_graph(ctx);
+        ggml_build_forward_expand(gf, trans);
+        if (ggml_backend_is_cpu(backend)) {
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        }
+        ggml_backend_graph_compute(backend, gf);
+        params["transform"] = trans;
+        ggml_graph_clear(gf);
+        trans->src[0] = NULL;  // not elegant!! skip FX during wino_stage1
+    }
+
+public:
+    Conv2d1x3x3(int64_t in_channels,
+                int64_t out_channels,
+                bool bias = true)
+        : in_channels(in_channels),
+          out_channels(out_channels),
+          bias(bias) {}
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // struct ggml_tensor* w = params["weight"];
+        struct ggml_tensor* w = params["transform"];
+        struct ggml_tensor* b = NULL;
+        if (bias) {
+            b = params["bias"];
+        }
+        // return ggml_nn_conv_2d1x3x3(ctx, x, w, b);
+        return ggml_nn_conv_2d1x3x3(ctx, x, trans, b);
+    }
+};
+
 class Conv3dnx1x1 : public UnaryBlock {
 protected:
     int64_t in_channels;
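
Note: read together, a Conv2d1x3x3 goes through three phases. A condensed, hedged sketch of that lifecycle (assumes a live ggml_context/backend and winograd-eligible channel counts; the stage-1 internals are inferred from the standard algorithm, not from the fork's source):

```cpp
// 1) construction + init: allocate the raw F16 3x3 weight and record the
//    lazy stage-0 transform node.
Conv2d1x3x3 conv(/*in_channels=*/320, /*out_channels=*/320);
// conv.init(ctx, GGML_TYPE_F16);
//
// 2) one-time transform after weight load: run the stage-0 graph on the
//    backend and cache the result as params["transform"].
// conv.transform(ctx, /*n=*/1, backend);
//
// 3) every forward pass: only stage 1 runs (input transform, multiply,
//    inverse transform), plus the bias add.
// struct ggml_tensor* y = conv.forward(ctx, x);
```

The `trans->src[0] = NULL;` hack appears to cut the graph edge from the cached transform back to the raw weight, so that rebuilding the forward graph does not re-trigger stage 0.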

stable-diffusion.cpp

Lines changed: 2 additions & 0 deletions
@@ -424,6 +424,8 @@ class StableDiffusionGGML {
         }
 
         // LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
+        diffusion_model->transform(1);
+        first_stage_model->transform(1);
 
         if (version == VERSION_SVD) {
             // diffusion_model->test();
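
Note: placement matters here. Both calls run once, right after the parameter buffers are filled, so the Winograd kernel transforms are paid at load time rather than on every denoising step; the argument is forwarded as a thread count to CPU backends. A condensed view of the assumed flow (not verbatim):

```cpp
// ... read checkpoint, upload weight tensors to the backend ...
diffusion_model->transform(1);    // UNet: Winograd stage 0 over eligible 3x3 convs
first_stage_model->transform(1);  // same hook on the first-stage (VAE) runner
// ... continue with version-specific setup (e.g. VERSION_SVD) ...
```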

unet.hpp

Lines changed: 31 additions & 8 deletions
@@ -217,8 +217,11 @@ class UnetModelBlock : public GGMLBlock {
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
         }
 
-        // input_blocks
-        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
+        // input_blocks
+        if(in_channels % 8 == 0 && model_channels % 64 == 0)
+            blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(in_channels, model_channels));
+        else
+            blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
 
         std::vector<int> input_block_chans;
         input_block_chans.push_back(model_channels);
@@ -336,7 +339,10 @@ class UnetModelBlock : public GGMLBlock {
         // out
         blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));  // ch == model_channels
         // out_1 is nn.SiLU()
-        blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+        if(model_channels % 8 == 0 && out_channels % 64 == 0)
+            blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d1x3x3(model_channels, out_channels));
+        else
+            blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
     }
 
     struct ggml_tensor* resblock_forward(std::string name,
@@ -407,10 +413,19 @@ class UnetModelBlock : public GGMLBlock {
 
         auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
         auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
-        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
+        // std::shared_ptr<UnaryBlock> input_blocks_0_0;
+        // if(in_channels % 8 == 0 && model_channels % 64 == 0)
+        auto input_blocks_0_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["input_blocks.0.0"]);
+        // else
+        //     input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
+
 
         auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
-        auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);
+        // std::shared_ptr<UnaryBlock> out_2;
+        // if(model_channels % 8 == 0 && out_channels % 64 == 0)
+        auto out_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out.2"]);
+        // else
+        //     out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);
 
         auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
 
@@ -432,10 +447,11 @@ class UnetModelBlock : public GGMLBlock {
 
         // input_blocks
         std::vector<struct ggml_tensor*> hs;
-
+        // print_ggml_tensor(x, true, "input to unet");
         // input block 0
         auto h = input_blocks_0_0->forward(ctx, x);
-
+        // print_ggml_tensor(h, true, "after input block 0 0");
+
         ggml_set_name(h, "bench-start");
         hs.push_back(h);
         // input block 1-11
@@ -447,7 +463,9 @@ class UnetModelBlock : public GGMLBlock {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_block_idx += 1;
                 std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                // print_ggml_tensor(h, true, "bef res block");
                 h = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
+                // print_ggml_tensor(h, true, "after res block");
                 if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                     std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                     h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
@@ -466,7 +484,7 @@ class UnetModelBlock : public GGMLBlock {
             }
         }
         // [N, 4*model_channels, h/8, w/8]
-
+        // print_ggml_tensor(h, true, "bef mid block");
         // middle_block
         h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);              // [N, 4*model_channels, h/8, w/8]
         h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);   // [N, 4*model_channels, h/8, w/8]
@@ -478,6 +496,7 @@ class UnetModelBlock : public GGMLBlock {
         }
         int control_offset = controls.size() - 2;
 
+        // print_ggml_tensor(h, true, "bef out block");
         // output_blocks
         int output_block_idx = 0;
         for (int i = (int)len_mults - 1; i >= 0; i--) {
@@ -543,6 +562,10 @@ struct UNetModelRunner : public GGMLRunner {
         return "unet";
     }
 
+    void transform(int n){
+        unet.transform(params_ctx, n, backend);
+    }
+
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         unet.get_param_tensors(tensors, prefix);
    }
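
Note: the cast changes are the key to the dispatch: `blocks["input_blocks.0.0"]` and `blocks["out.2"]` may now hold either a Conv2d or a Conv2d1x3x3, so the forward pass downcasts to their common UnaryBlock base instead of the concrete class. A minimal sketch of the pattern:

```cpp
// Virtual dispatch through the shared base class picks the Winograd or the
// im2col implementation at runtime (names as in this commit).
std::shared_ptr<UnaryBlock> conv =
    std::dynamic_pointer_cast<UnaryBlock>(blocks["input_blocks.0.0"]);
h = conv->forward(ctx, x);
```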
