experimental: preview during tiled vae decode

stduhpf · stduhpf · commit eff2bda7ef00 · 2025-02-28T13:44:55.000+01:00
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
@@ -310,7 +310,11 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
     for (int iy = 0; iy < height; iy++) {
         for (int ix = 0; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float value                                               = ggml_tensor_get_f32(input, ix, iy, k);
+                float value = ggml_tensor_get_f32(input, ix, iy, k);
+
+                value = value > 1.0f ? 1.0f : value < 0.0f ? 0.0f
+                                                           : value;
+
                 *(image_data + iy * width * channels + ix * channels + k) = (uint8_t)(value * 255.0f);
             }
         }
@@ -466,7 +470,8 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                                             int overlap_x,
                                             int overlap_y,
                                             int x_skip = 0,
-                                            int y_skip = 0) {
+                                            int y_skip = 0,
+                                            bool clear = false) {
     int64_t width    = input->ne[0];
     int64_t height   = input->ne[1];
     int64_t channels = input->ne[2];
@@ -486,6 +491,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                     const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
                     const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
                     const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;
+                    // clear old value for first pass
+                    if (clear && (x_f_0 >= 1.0f || x == 0) && (y_f_0 >= 1.0f || y == 0)) {
+                        old_value = 0.0f;
+                    }
 
                     const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
                     const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
@@ -597,9 +606,10 @@ __STATIC_INLINE__ void ggml_tensor_scale_output(struct ggml_tensor* src) {
 }
 
 typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
+typedef std::function<void(ggml_tensor*)> on_tile_merge;
 
 // Tiling
-__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing, bool scaled_out = true) {
+__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing, bool scaled_out = true, on_tile_merge on_merge = NULL) {
     int input_width   = (int)input->ne[0];
     int input_height  = (int)input->ne[1];
     int output_width  = (int)output->ne[0];
@@ -713,13 +723,16 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
             ggml_split_tensor_2d(input, input_tile, x, y);
             on_processing(input_tile, output_tile, false);
             if (scaled_out) {
-                ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap_x * scale, tile_overlap_y * scale, dx * scale, dy * scale);
+                ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap_x * scale, tile_overlap_y * scale, dx * scale, dy * scale, true);
             } else {
                 ggml_merge_tensor_2d(output_tile, output, x / scale, y / scale, tile_overlap_x / scale, tile_overlap_y / scale, dx / scale, dy / scale);
             }
             int64_t t2 = ggml_time_ms();
             last_time  = (t2 - t1) / 1000.0f;
             pretty_progress(tile_count, num_tiles, last_time);
+            if (on_merge != NULL) {
+                on_merge(output);
+            }
             tile_count++;
         }
         last_x = false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -806,6 +806,21 @@ class StableDiffusionGGML {
         sd_set_progress_callback(cb, cbd);
     }
 
+    // Pick the latent-to-RGB projection table (rows of 3 floats) for the given model version.
+    const float (*get_latent_rgb_proj(enum SDVersion version))[3] {
+        if (sd_version_is_sd3(version))
+            return sd3_latent_rgb_proj;
+        if (sd_version_is_flux(version))
+            return flux_latent_rgb_proj;
+        if (sd_version_is_sdxl(version))
+            return sdxl_latent_rgb_proj;
+        if (sd_version_is_sd1(version) || sd_version_is_sd2(version))
+            return sd_latent_rgb_proj;
+        // No table for this model: warn and hand back NULL so the caller can skip the preview.
+        LOG_WARN("No latent to RGB projection known for this model");
+        return NULL;
+    }
+
     void preview_image(ggml_context* work_ctx,
                        int step,
                        struct ggml_tensor* latents,
@@ -820,33 +835,8 @@ class StableDiffusionGGML {
         if (preview_mode == SD_PREVIEW_PROJ) {
             const float(*latent_rgb_proj)[channel];
 
-            if (dim == 16) {
-                // 16 channels VAE -> Flux or SD3
-
-                if (sd_version_is_sd3(version)) {
-                    latent_rgb_proj = sd3_latent_rgb_proj;
-                } else if (sd_version_is_flux(version)) {
-                    latent_rgb_proj = flux_latent_rgb_proj;
-                } else {
-                    LOG_WARN("No latent to RGB projection known for this model");
-                    // unknown model
-                    return;
-                }
-
-            } else if (dim == 4) {
-                // 4 channels VAE
-                if (sd_version_is_sdxl(version)) {
-                    latent_rgb_proj = sdxl_latent_rgb_proj;
-                } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
-                    latent_rgb_proj = sd_latent_rgb_proj;
-                } else {
-                    // unknown model
-                    LOG_WARN("No latent to RGB projection known for this model");
-                    return;
-                }
-            } else {
-                LOG_WARN("No latent to RGB projection known for this model");
-                // unknown latent space
+            latent_rgb_proj = get_latent_rgb_proj(version);
+            if (latent_rgb_proj == NULL) {
                 return;
             }
             uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
@@ -1237,7 +1227,56 @@ class StableDiffusionGGML {
                                                  decode ? (H * 8) : (H / 8),  // height
                                                  decode ? 3 : C,
                                                  x->ne[3]);  // channels
-        int64_t t0          = ggml_time_ms();
+
+        if (decode && vae_tiling) {
+            const float(*latent_rgb_proj)[3];
+            latent_rgb_proj = get_latent_rgb_proj(version);
+            if (latent_rgb_proj != NULL) {
+                uint8_t* data = (uint8_t*)malloc(W * H * 3 * sizeof(uint8_t));
+
+                preview_latent_image(data, x, latent_rgb_proj, W, H, C / 2);
+
+                // fill result with upscaled data
+                for (int w = 0; w < W; w++) {
+                    for (int h = 0; h < H; h++) {
+                        for (int c = 0; c < 3; c++) {
+                            // data is indexed row-major, 3 bytes per pixel (the transposed (w * H + h) * 3 + c form was wrong)
+                            int i       = (h * W + w) * 3 + c;
+                            float value = data[i] / 255.0f;
+                            if (!use_tiny_autoencoder) {
+                                value = value * 2.0f - 1.0f;
+                            }
+                            for (int x = 0; x < 8; x++) {
+                                for (int y = 0; y < 8; y++) {
+                                    ggml_tensor_set_f32(result, value, w * 8 + x, h * 8 + y, c);
+                                }
+                            }
+                        }
+                    }
+                }
+                free(data);
+                // result now holds the projected preview, each latent pixel repeated over an 8x8 block
+            }
+        }
+        auto preview_cb     = sd_get_preview_callback();
+        auto on_tile_merged = [&](ggml_tensor* output) {
+            if (preview_cb && output->ne[2] == 3) {
+                if (!use_tiny_autoencoder) {
+                    ggml_tensor_scale_output(output);
+                }
+                sd_image_t image = {
+                    output->ne[0],
+                    output->ne[1],
+                    3,
+                    sd_tensor_to_image(output)};
+                preview_cb(-1, image);
+                free(image.data);
+                if (!use_tiny_autoencoder) {
+                    ggml_tensor_scale_input(output);
+                }
+            }
+        };
+        int64_t t0 = ggml_time_ms();
         if (!use_tiny_autoencoder) {
             if (decode) {
                 ggml_tensor_scale(x, 1.0f / scale_factor);
@@ -1249,7 +1288,7 @@ class StableDiffusionGGML {
                 auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                     first_stage_model->compute(n_threads, in, decode, &out);
                 };
-                sd_tiling(x, result, 8, 32, 0.5f, on_tiling, decode);
+                sd_tiling(x, result, 8, 32, 0.5f, on_tiling, decode, on_tile_merged);
             } else {
                 first_stage_model->compute(n_threads, x, decode, &result);
             }
@@ -1263,7 +1302,7 @@ class StableDiffusionGGML {
                 auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                     tae_first_stage->compute(n_threads, in, decode, &out);
                 };
-                sd_tiling(x, result, 8, 64, 0.5f, on_tiling, decode);
+                sd_tiling(x, result, 8, 64, 0.5f, on_tiling, decode, on_tile_merged);
             } else {
                 tae_first_stage->compute(n_threads, x, decode, &result);
             }