@@ -810,8 +810,7 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        ggml_tensor* noise_mask = nullptr) {
-
+                        ggml_tensor* denoise_mask = NULL) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
@@ -1026,10 +1025,10 @@ class StableDiffusionGGML {
             pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
             // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
         }
-        if (noise_mask != nullptr) {
+        if (denoise_mask != nullptr) {
             for (int64_t x = 0; x < denoised->ne[0]; x++) {
                 for (int64_t y = 0; y < denoised->ne[1]; y++) {
-                    float mask = ggml_tensor_get_f32(noise_mask, x, y);
+                    float mask = ggml_tensor_get_f32(denoise_mask, x, y);
                     for (int64_t k = 0; k < denoised->ne[2]; k++) {
                         float init = ggml_tensor_get_f32(init_latent, x, y, k);
                         float den  = ggml_tensor_get_f32(denoised, x, y, k);
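The loop above samples one mask value per latent cell and fetches, for every channel, both the original `init_latent` value and the freshly denoised one; the hunk is truncated before the actual combination, which by convention would be the convex blend `init * (1 - mask) + den * mask`. A minimal standalone sketch of that blend, assuming plain row-major float buffers in place of ggml tensors (hypothetical helper, not the library API):

```cpp
#include <cstddef>

// Per-step latent blend, presumably denoised = init * (1 - m) + den * m:
// m = 1 keeps the denoised sample (the inpainted region), m = 0 restores
// the original init latent, so unmasked pixels survive sampling untouched.
static void blend_latent(const float* init, float* denoised, const float* mask,
                         int w, int h, int c) {
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            float m = mask[(size_t)y * w + x];  // one mask value per cell, shared by all channels
            for (int k = 0; k < c; k++) {
                size_t i = ((size_t)k * h + y) * w + x;
                denoised[i] = init[i] * (1.0f - m) + denoised[i] * m;
            }
        }
    }
}
```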
@@ -1283,7 +1282,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           ggml_tensor* masked_latent = NULL) {
+                           ggml_tensor* concat_latent = NULL,
+                           ggml_tensor* denoise_mask = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1470,7 +1470,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-    ggml_tensor* noise_mask = nullptr;
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         int64_t mask_channels = 1;
         if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
@@ -1496,21 +1495,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                 }
             }
         }
-        if (masked_latent == NULL) {
-            masked_latent = empty_latent;
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
         }
-        cond.c_concat = masked_latent;
+        cond.c_concat = concat_latent;
         uncond.c_concat = empty_latent;
-        // noise_mask = masked_latent;
+        denoise_mask = NULL;
     } else if (sd_version_is_edit(sd_ctx->sd->version)) {
-        cond.c_concat = masked_latent;
-        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]);
+        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]);
         ggml_set_f32(empty_latent, 0);
         uncond.c_concat = empty_latent;
-    } else {
-        noise_mask = masked_latent;
-    }
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
+        }
+        cond.c_concat = concat_latent;
 
+    }
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
         int64_t cur_seed = seed + b;
@@ -1545,7 +1545,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                      sigmas,
                                      start_merge_step,
                                      id_cond,
-                                     noise_mask);
+                                     denoise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
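Taken together, the hunks above encode a routing rule for the two tensors: inpaint-specialized models receive the mask through `cond.c_concat` and have latent blending disabled (`denoise_mask = NULL`), edit models receive a reference-image latent through the same concat channel, and plain base models fall back to blending only. A condensed sketch of that rule, with hypothetical names standing in for the inline logic:

```cpp
// Hypothetical summary of the conditioning routing after this change;
// in the real code this lives inline in generate_image() and keys off
// sd_version_is_inpaint() / sd_version_is_edit().
struct ConditioningRoute {
    bool feed_concat;     // put concat_latent into cond.c_concat
    bool allow_blending;  // keep denoise_mask for per-step latent blending
};

static ConditioningRoute route_conditioning(bool inpaint_model, bool edit_model) {
    if (inpaint_model)
        return {true, false};  // model consumes the mask itself via c_concat
    if (edit_model)
        return {true, true};   // reference image via c_concat; mask blend still possible
    return {false, true};      // base model: masking only via latent blending
}
```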
@@ -1756,7 +1756,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
     sd_image_to_tensor(init_image.data, init_img);
 
-    ggml_tensor* masked_latent;
+    ggml_tensor* concat_latent;
+    ggml_tensor* denoise_mask = NULL;
 
     ggml_tensor* init_latent  = NULL;
     ggml_tensor* init_moments = NULL;
@@ -1776,63 +1777,65 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects?
         sd_image_to_tensor(init_image.data, init_img);
         sd_apply_mask(init_img, mask_img, masked_img);
-        ggml_tensor* masked_latent_0 = NULL;
+        ggml_tensor* masked_latent = NULL;
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-            masked_latent_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+            masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
         } else {
-            masked_latent_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+            masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
         }
-        masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent_0->ne[0], masked_latent_0->ne[1], mask_channels + masked_latent_0->ne[2], 1);
-        for (int ix = 0; ix < masked_latent_0->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent_0->ne[1]; iy++) {
+        concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
+        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
+            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
                 int mx = ix * 8;
                 int my = iy * 8;
                 if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                    for (int k = 0; k < masked_latent_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_latent, v, ix, iy, k);
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
                     }
                     // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
                     for (int x = 0; x < 8; x++) {
                         for (int y = 0; y < 8; y++) {
                             float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
                             // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
                             // python code was using "b (h 8) (w 8) -> b (8 8) h w"
-                            ggml_tensor_set_f32(masked_latent, m, ix, iy, masked_latent_0->ne[2] + x * 8 + y);
+                            ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
                         }
                     }
                 } else {
                     float m = ggml_tensor_get_f32(mask_img, mx, my);
-                    ggml_tensor_set_f32(masked_latent, m, ix, iy, 0);
-                    for (int k = 0; k < masked_latent_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_latent, v, ix, iy, k + mask_channels);
+                    ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                     }
                 }
             }
         }
     } else if (sd_version_is_edit(sd_ctx->sd->version)) {
-        // Not actually masked, we're just hijacking the masked_latent variable since it will be used the same way
+        // Not actually masked, we're just hijacking the concat_latent variable since it will be used the same way
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             if (sd_ctx->sd->is_using_edm_v_parameterization) {
                 // for CosXL edit
-                masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
+                concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
             } else {
-                masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
+                concat_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
             }
         } else {
-            masked_latent = init_latent;
+            concat_latent = init_latent;
         }
-    } else {
+    }
+
+    {
         // LOG_WARN("Inpainting with a base model is not great");
-        masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
-        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
+        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+        for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
+            for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
                 int mx = ix * 8;
                 int my = iy * 8;
                 float m = ggml_tensor_get_f32(mask_img, mx, my);
-                ggml_tensor_set_f32(masked_latent, m, ix, iy);
+                ggml_tensor_set_f32(denoise_mask, m, ix, iy);
             }
         }
     }
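For `VERSION_FLUX_FILL` the mask is not downsampled: each latent cell carries all 64 pixel-resolution mask values of its 8x8 block as extra channels, mirroring the einops rearrange `"b (h 8) (w 8) -> b (8 8) h w"` quoted in the comment. A standalone sketch of that flattening for a single cell, assuming a row-major full-resolution mask buffer (hypothetical helper; the `x * 8 + y` channel order copies the diff, and the TODO above notes it is still unverified against the reference Python):

```cpp
// Flatten the 8x8 mask patch covered by latent cell (ix, iy) into 64
// channel values, to be appended after the masked-image latent channels.
static void flatten_mask_block(const float* mask, int mask_w,
                               int ix, int iy, float out[64]) {
    for (int x = 0; x < 8; x++) {
        for (int y = 0; y < 8; y++) {
            // pixel (ix*8 + x, iy*8 + y) -> channel x*8 + y
            out[x * 8 + y] = mask[(iy * 8 + y) * mask_w + (ix * 8 + x)];
        }
    }
}
```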
@@ -1868,7 +1871,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                       style_ratio,
                                       normalize_input,
                                       input_id_images_path_c_str,
-                                      masked_latent);
+                                      concat_latent,
+                                      denoise_mask);
 
     size_t t2 = ggml_time_ms();
 