diff --git a/.gitignore b/.gitignore
index dd4f6435..b0e3af83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ test/
 output*.png
 models*
 *.log
+preview.png
diff --git a/examples/cli/README.md b/examples/cli/README.md
index ee17d17d..abbb0b6f 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -32,6 +32,7 @@ Options:
   -o, --output <string>                    path to write result image to (default: ./output.png)
   -p, --prompt <string>                    the prompt to render
   -n, --negative-prompt <string>           the negative prompt (default: "")
+  --preview-path <string>                  path to write preview image to (default: ./preview.png)
   --upscale-model <string>                 path to esrgan model.
   -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
                                            CPU physical cores
@@ -48,6 +49,8 @@ Options:
   --fps <int>                              fps (default: 24)
   --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                            NitroSD-Vibrant
+  --preview-interval <int>                 interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                                           every step)
   --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
   --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
   --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
@@ -86,6 +89,7 @@ Options:
   --chroma-enable-t5-mask                  enable t5 mask for chroma
   --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
   --disable-auto-resize-ref-image          disable auto resize of ref images
+  --taesd-preview-only                     prevents usage of taesd for decoding the final image. (for use with --preview tae)
   -M, --mode                               run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
   --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                            type of the weight file
@@ -107,4 +111,5 @@ Options:
   --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
   --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                            (overrides --vae-tile-size)
+  --preview                                preview method. must be one of the following [none, proj, tae, vae] (default is none)
 ```
\ No newline at end of file
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 8f938c9b..2aeed946 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -46,6 +46,13 @@ const char* modes_str[] = {
 };
 #define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"
 
+const char* previews_str[] = {  // CLI names of the preview methods; indexed with preview_t values (e.g. previews_str[PREVIEW_TAE])
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};  // NOTE(review): the order presumably mirrors the preview_t enum declared elsewhere - confirm
+
 enum SDMode {
     IMG_GEN,
     VID_GEN,
@@ -135,11 +142,17 @@ struct SDParams {
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
     bool force_sdxl_vae_conv_scale       = false;
 
+    preview_t preview_method = PREVIEW_NONE;
+    int preview_interval     = 1;
+    std::string preview_path = "preview.png";
+    bool taesd_preview       = false;
+
     SDParams() {
         sd_sample_params_init(&sample_params);
         sd_sample_params_init(&high_noise_sample_params);
         high_noise_sample_params.sample_steps = -1;
     }
+
 };
 
 void print_params(SDParams params) {
@@ -210,6 +223,8 @@ void print_params(SDParams params) {
     printf("    video_frames:                      %d\n", params.video_frames);
     printf("    vace_strength:                     %.2f\n", params.vace_strength);
     printf("    fps:                               %d\n", params.fps);
+    printf("    preview_mode:                      %s\n", previews_str[params.preview_method]);
+    printf("    preview_interval:                  %d\n", params.preview_interval);
     free(sample_params_str);
     free(high_noise_sample_params_str);
 }
@@ -589,6 +604,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--negative-prompt",
          "the negative prompt (default: \"\")",
          &params.negative_prompt},
+        {"", 
+         "--preview-path",
+         "path to write preview image to (default: ./preview.png)",
+         &params.preview_path},
         {"",
          "--upscale-model",
          "path to esrgan model.",
@@ -647,6 +666,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "shift timestep for NitroFusion models (default: 0). "
          "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
          &params.sample_params.shifted_timestep},
+        {"",
+         "--preview-interval",
+         "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
+          &params.preview_interval},
     };
 
     options.float_options = {
@@ -801,6 +824,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--disable-auto-resize-ref-image",
          "disable auto resize of ref images",
          false, &params.auto_resize_ref_image},
+        {"",
+         "--taesd-preview-only",
+         std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")", 
+         false, &params.taesd_preview},
     };
 
     auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1046,6 +1073,26 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    // Handles "--preview <method>": looks the value up in previews_str and stores it as a preview_t.
+    auto on_preview_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;  // no value follows the option
+        }
+        const char* preview = argv[index];
+        int preview_method  = -1;
+        for (int m = 0; m < PREVIEW_COUNT && preview_method == -1; m++) {  // stop at the first match
+            if (!strcmp(preview, previews_str[m])) {
+                preview_method = m;
+            }
+        }
+        if (preview_method == -1) {
+            fprintf(stderr, "error: invalid preview method %s\n", preview);
+            return -1;
+        }
+        params.preview_method = (preview_t)preview_method;
+        return 1;
+    };
+
     options.manual_options = {
         {"-M",
          "--mode",
@@ -1110,6 +1157,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--vae-relative-tile-size",
          "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
          on_relative_tile_size_arg},
+        {"",
+         "--preview",
+         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+         on_preview_arg},
     };
 
     if (!parse_options(argc, argv, options)) {
@@ -1452,15 +1503,45 @@ bool load_images_from_dir(const std::string dir,
     return true;
 }
 
+const char* preview_path;  // file the preview is written to; assigned in main() before the callback is registered
+float preview_fps;         // frame rate handed to the AVI writer for multi-frame previews
+// Preview callback: a single frame is saved as PNG, several frames as an MJPG AVI. `step` is unused.
+void step_callback(int step, int frame_count, sd_image_t* image) {
+    if (frame_count != 1) {
+        create_mjpg_avi_from_sd_images(preview_path, image, frame_count, preview_fps);
+        return;
+    }
+    stbi_write_png(preview_path, image->width, image->height, image->channel, image->data, 0);
+}
+
 int main(int argc, const char* argv[]) {
     SDParams params;
     parse_args(argc, argv, params);
+    if (params.video_frames > 4) {
+        // Multi-frame previews are written as AVI, so swap a ".png" extension (any letter case) for ".avi".
+        const size_t last_dot_pos = params.preview_path.find_last_of(".");
+        std::string file_ext;
+        if (last_dot_pos != std::string::npos) {  // filename has extension
+            file_ext = params.preview_path.substr(last_dot_pos);
+            std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
+        }
+        if (file_ext == ".png") {
+            // Keep the new name in params: a pointer taken from a temporary string would dangle immediately.
+            params.preview_path = params.preview_path.substr(0, last_dot_pos) + ".avi";
+        }
+    }
+    preview_path = params.preview_path.c_str();  // points into params.preview_path, which lives until main() returns
+    preview_fps = params.fps;
+    if (params.preview_method == PREVIEW_PROJ)
+        preview_fps /= 4.0f;
+
     params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
     params.sample_params.guidance.slg.layer_count            = params.skip_layers.size();
     params.high_noise_sample_params.guidance.slg.layers      = params.high_noise_skip_layers.data();
     params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);
 
     if (params.verbose) {
         print_params(params);
@@ -1654,6 +1735,7 @@ int main(int argc, const char* argv[]) {
         params.control_net_cpu,
         params.vae_on_cpu,
         params.diffusion_flash_attn,
+        params.taesd_preview,
         params.diffusion_conv_direct,
         params.vae_conv_direct,
         params.force_sdxl_vae_conv_scale,
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 02d82bc0..a8916ec4 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -876,7 +876,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
     ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
     int num_tiles            = num_tiles_x * num_tiles_y;
-    LOG_INFO("processing %i tiles", num_tiles);
+    LOG_DEBUG("processing %i tiles", num_tiles);
     pretty_progress(0, num_tiles, 0.0f);
     int tile_count = 1;
     bool last_y = false, last_x = false;
diff --git a/latent-preview.h b/latent-preview.h
new file mode 100644
index 00000000..5c160605
--- /dev/null
+++ b/latent-preview.h
@@ -0,0 +1,162 @@
+const float wan_21_latent_rgb_proj[16][3] = {  // one {R, G, B} weight row per latent channel; preview_image selects it for 16-channel WAN / Qwen Image latents
+    {-0.1299, -0.1692, 0.2932},
+    {0.0671, 0.0406, 0.0442},
+    {0.3568, 0.2548, 0.1747},
+    {0.0372, 0.2344, 0.1420},
+    {0.0313, 0.0189, -0.0328},
+    {0.0296, -0.0956, -0.0665},
+    {-0.3477, -0.4059, -0.2925},
+    {0.0166, 0.1902, 0.1975},
+    {-0.0412, 0.0267, -0.1364},
+    {-0.1293, 0.0740, 0.1636},
+    {0.0680, 0.3019, 0.1128},
+    {0.0032, 0.0581, 0.0639},
+    {-0.1251, 0.0927, 0.1699},
+    {0.0060, -0.0633, 0.0005},
+    {0.3477, 0.2275, 0.2950},
+    {0.1984, 0.0913, 0.1861}};
+float wan_21_latent_rgb_bias[3] = {-0.1223, -0.1889, -0.1976};  // {R, G, B} offset added after the projection
+
+const float wan_22_latent_rgb_proj[48][3] = {  // one {R, G, B} weight row per latent channel; preview_image selects it for 48-channel WAN latents
+    {0.0119, 0.0103, 0.0046},
+    {-0.1062, -0.0504, 0.0165},
+    {0.0140, 0.0409, 0.0491},
+    {-0.0813, -0.0677, 0.0607},
+    {0.0656, 0.0851, 0.0808},
+    {0.0264, 0.0463, 0.0912},
+    {0.0295, 0.0326, 0.0590},
+    {-0.0244, -0.0270, 0.0025},
+    {0.0443, -0.0102, 0.0288},
+    {-0.0465, -0.0090, -0.0205},
+    {0.0359, 0.0236, 0.0082},
+    {-0.0776, 0.0854, 0.1048},
+    {0.0564, 0.0264, 0.0561},
+    {0.0006, 0.0594, 0.0418},
+    {-0.0319, -0.0542, -0.0637},
+    {-0.0268, 0.0024, 0.0260},
+    {0.0539, 0.0265, 0.0358},
+    {-0.0359, -0.0312, -0.0287},
+    {-0.0285, -0.1032, -0.1237},
+    {0.1041, 0.0537, 0.0622},
+    {-0.0086, -0.0374, -0.0051},
+    {0.0390, 0.0670, 0.2863},
+    {0.0069, 0.0144, 0.0082},
+    {0.0006, -0.0167, 0.0079},
+    {0.0313, -0.0574, -0.0232},
+    {-0.1454, -0.0902, -0.0481},
+    {0.0714, 0.0827, 0.0447},
+    {-0.0304, -0.0574, -0.0196},
+    {0.0401, 0.0384, 0.0204},
+    {-0.0758, -0.0297, -0.0014},
+    {0.0568, 0.1307, 0.1372},
+    {-0.0055, -0.0310, -0.0380},
+    {0.0239, -0.0305, 0.0325},
+    {-0.0663, -0.0673, -0.0140},
+    {-0.0416, -0.0047, -0.0023},
+    {0.0166, 0.0112, -0.0093},
+    {-0.0211, 0.0011, 0.0331},
+    {0.1833, 0.1466, 0.2250},
+    {-0.0368, 0.0370, 0.0295},
+    {-0.3441, -0.3543, -0.2008},
+    {-0.0479, -0.0489, -0.0420},
+    {-0.0660, -0.0153, 0.0800},
+    {-0.0101, 0.0068, 0.0156},
+    {-0.0690, -0.0452, -0.0927},
+    {-0.0145, 0.0041, 0.0015},
+    {0.0421, 0.0451, 0.0373},
+    {0.0504, -0.0483, -0.0356},
+    {-0.0837, 0.0168, 0.0055}};
+float wan_22_latent_rgb_bias[3] = {0.0317, -0.0878, -0.1388};  // {R, G, B} offset added after the projection
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
+const float flux_latent_rgb_proj[16][3] = {  // one {R, G, B} weight row per latent channel; preview_image selects it for 16-channel Flux latents
+    {-0.0346f, 0.0244f, 0.0681f},
+    {0.0034f, 0.0210f, 0.0687f},
+    {0.0275f, -0.0668f, -0.0433f},
+    {-0.0174f, 0.0160f, 0.0617f},
+    {0.0859f, 0.0721f, 0.0329f},
+    {0.0004f, 0.0383f, 0.0115f},
+    {0.0405f, 0.0861f, 0.0915f},
+    {-0.0236f, -0.0185f, -0.0259f},
+    {-0.0245f, 0.0250f, 0.1180f},
+    {0.1008f, 0.0755f, -0.0421f},
+    {-0.0515f, 0.0201f, 0.0011f},
+    {0.0428f, -0.0012f, -0.0036f},
+    {0.0817f, 0.0765f, 0.0749f},
+    {-0.1264f, -0.0522f, -0.1103f},
+    {-0.0280f, -0.0881f, -0.0499f},
+    {-0.1262f, -0.0982f, -0.0778f}};
+float flux_latent_rgb_bias[3] = {-0.0329, -0.0718, -0.0851};  // {R, G, B} offset added after the projection
+
+// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
+const float sd3_latent_rgb_proj[16][3] = {  // one {R, G, B} weight row per latent channel; preview_image selects it for 16-channel SD3 latents
+    {-0.0645f, 0.0177f, 0.1052f},
+    {0.0028f, 0.0312f, 0.0650f},
+    {0.1848f, 0.0762f, 0.0360f},
+    {0.0944f, 0.0360f, 0.0889f},
+    {0.0897f, 0.0506f, -0.0364f},
+    {-0.0020f, 0.1203f, 0.0284f},
+    {0.0855f, 0.0118f, 0.0283f},
+    {-0.0539f, 0.0658f, 0.1047f},
+    {-0.0057f, 0.0116f, 0.0700f},
+    {-0.0412f, 0.0281f, -0.0039f},
+    {0.1106f, 0.1171f, 0.1220f},
+    {-0.0248f, 0.0682f, -0.0481f},
+    {0.0815f, 0.0846f, 0.1207f},
+    {-0.0120f, -0.0055f, -0.0867f},
+    {-0.0749f, -0.0634f, -0.0456f},
+    {-0.1418f, -0.1457f, -0.1259f},
+};
+float sd3_latent_rgb_bias[3] = {0, 0, 0};  // {R, G, B} offset added after the projection (none for SD3)
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sdxl_latent_rgb_proj[4][3] = {  // one {R, G, B} weight row per latent channel; preview_image selects it for 4-channel SDXL latents
+    {0.3651f, 0.4232f, 0.4341f},
+    {-0.2533f, -0.0042f, 0.1068f},
+    {0.1076f, 0.1111f, -0.0362f},
+    {-0.3165f, -0.2492f, -0.2188f}};
+float sdxl_latent_rgb_bias[3] = {0.1084, -0.0175, -0.0011};  // {R, G, B} offset added after the projection
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sd_latent_rgb_proj[4][3]{  // one {R, G, B} weight row per latent channel; preview_image selects it for 4-channel SD1 / SD2 latents
+    {0.3512f, 0.2297f, 0.3227f},
+    {0.3250f, 0.4974f, 0.2350f},
+    {-0.2829f, 0.1762f, 0.2721f},
+    {-0.2120f, -0.2616f, -0.7177f}};
+float sd_latent_rgb_bias[3] = {0,0,0};  // {R, G, B} offset added after the projection (none for SD1 / SD2)
+
+// Projects latents to 8-bit RGB: for every frame/row/column, the `dim` channel values are multiplied with
+// latent_rgb_proj (one {R, G, B} row per channel), latent_rgb_bias is added, the result is mapped with
+// x * 0.5 + 0.5, clamped to [0, 1] and scaled to 0..255. `buffer` receives frames * height * width * 3 bytes
+// in frame-major, row-major, interleaved RGB order. `inline` because this definition lives in a header.
+inline void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
+    // Converts one accumulated component to a byte; NaN fails the `>= 0` test and becomes 0.
+    auto to_byte = [](float v) -> uint8_t {
+        v = v * .5f + .5f;                // change range
+        v = v >= 0 ? v <= 1 ? v : 1 : 0;  // clamp to [0,1]
+        return (uint8_t)(v * 255);
+    };
+
+    // The channel axis is the last used dimension of the tensor; its byte stride is loop-invariant.
+    const size_t channel_stride = latents->nb[ggml_n_dims(latents) - 1];
+    const char* base            = (const char*)latents->data;
+    size_t buffer_head          = 0;
+    for (int k = 0; k < frames; k++) {
+        for (int j = 0; j < height; j++) {
+            for (int i = 0; i < width; i++) {
+                // byte offset of channel 0 at column i, row j, frame k
+                const size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
+                float r = 0, g = 0, b = 0;
+                for (int d = 0; d < dim; d++) {
+                    const float value = *(const float*)(base + latent_id + d * channel_stride);
+                    r += value * latent_rgb_proj[d][0];
+                    g += value * latent_rgb_proj[d][1];
+                    b += value * latent_rgb_proj[d][2];
+                }
+                buffer[buffer_head++] = to_byte(r + latent_rgb_bias[0]);
+                buffer[buffer_head++] = to_byte(g + latent_rgb_bias[1]);
+                buffer[buffer_head++] = to_byte(b + latent_rgb_bias[2]);
+            }
+        }
+    }
+}
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 8fb88f48..ba2f0d7f 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -20,6 +20,8 @@
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
 
+#include "latent-preview.h"
+
 // #define STB_IMAGE_WRITE_IMPLEMENTATION
 // #define STB_IMAGE_WRITE_STATIC
 // #include "stb_image_write.h"
@@ -78,6 +80,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod,
     }
 }
 
+// No-op progress callback: every argument is ignored.
+// silent_tiling() installs it around its sd_tiling() call, presumably so that
+// tiled preview decoding does not report progress of its own.
+void suppress_pp(int /*step*/, int /*steps*/, float /*time*/, void* /*data*/) {
+    // Nothing to do; the parameters are left unnamed instead of being cast
+    // to void one by one.
+}
+
 /*=============================================== StableDiffusionGGML ================================================*/
 
 class StableDiffusionGGML {
@@ -352,8 +362,8 @@ class StableDiffusionGGML {
                                                                      offload_params_to_cpu,
                                                                      model_loader.tensor_storages_types);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
-                                                               offload_params_to_cpu,
-                                                               sd_ctx_params->diffusion_flash_attn,
+                                                                offload_params_to_cpu,
+                                                                sd_ctx_params->diffusion_flash_attn,
                                                                model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
@@ -397,11 +407,11 @@ class StableDiffusionGGML {
                                                                     1,
                                                                     true);
                 diffusion_model  = std::make_shared<WanModel>(backend,
-                                                             offload_params_to_cpu,
-                                                             model_loader.tensor_storages_types,
-                                                             "model.diffusion_model",
-                                                             version,
-                                                             sd_ctx_params->diffusion_flash_attn);
+                                                              offload_params_to_cpu,
+                                                              model_loader.tensor_storages_types,
+                                                              "model.diffusion_model",
+                                                              version,
+                                                              sd_ctx_params->diffusion_flash_attn);
                 if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                     high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                             offload_params_to_cpu,
@@ -469,7 +479,7 @@ class StableDiffusionGGML {
                 vae_decode_only = false;
             }
 
-            if (high_noise_diffusion_model) {
+            if (high_noise_diffusion_model) {  // tae_preview_only must not force this branch: the pointer is dereferenced below and may be null
                 high_noise_diffusion_model->alloc_params_buffer();
                 high_noise_diffusion_model->get_param_tensors(tensors);
             }
@@ -513,7 +523,8 @@ class StableDiffusionGGML {
                 }
                 first_stage_model->alloc_params_buffer();
                 first_stage_model->get_param_tensors(tensors, "first_stage_model");
-            } else {
+            }
+            if (use_tiny_autoencoder) {
                 tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
                                                                     offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
@@ -629,9 +640,10 @@ class StableDiffusionGGML {
                 unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
             }
             size_t vae_params_mem_size = 0;
-            if (!use_tiny_autoencoder) {
+            if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                 vae_params_mem_size = first_stage_model->get_params_buffer_size();
-            } else {
+            }
+            if (use_tiny_autoencoder) {
                 if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                     return false;
                 }
@@ -802,6 +814,7 @@ class StableDiffusionGGML {
 
         LOG_DEBUG("finished loaded file");
         ggml_free(ctx);
+        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
         return true;
     }
 
@@ -1110,6 +1123,153 @@ class StableDiffusionGGML {
         }
     }
 
+    void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
+        sd_progress_cb_t saved_cb = sd_get_progress_callback();  // callback that was active before this call
+        void* saved_cb_data       = sd_get_progress_callback_data();
+        sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL);  // swap in the no-op callback while tiling
+        sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
+        sd_set_progress_callback(saved_cb, saved_cb_data);  // put the previous callback and its data back
+    }
+
+    // Builds a preview of `latents` for sampling step `step` and hands it to `step_callback` as
+    // (step, frame count, array of RGB images). PREVIEW_PROJ uses the latent-to-RGB projection tables,
+    // PREVIEW_VAE / PREVIEW_TAE decode into `result` with first_stage_model / tae_first_stage.
+    // Any other mode, or a model without a usable decoder/projection, returns without calling the callback.
+    void preview_image(ggml_context* work_ctx,
+                       int step,
+                       struct ggml_tensor* latents,
+                       enum SDVersion version,
+                       preview_t preview_mode,
+                       ggml_tensor* result,
+                       std::function<void(int, int, sd_image_t*)> step_callback) {
+        const uint32_t channel = 3;
+        uint32_t width         = latents->ne[0];
+        uint32_t height        = latents->ne[1];
+        uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];  // size of the last used axis, treated as the latent channel count
+
+        if (preview_mode == PREVIEW_PROJ) {
+            // Pick the projection table from the channel count and the model family; null means "none known".
+            const float (*latent_rgb_proj)[channel] = nullptr;
+            const float* latent_rgb_bias            = nullptr;
+
+            if (dim == 48) {
+                if (sd_version_is_wan(version)) {
+                    latent_rgb_proj = wan_22_latent_rgb_proj;
+                    latent_rgb_bias = wan_22_latent_rgb_bias;
+                }
+            } else if (dim == 16) {
+                // 16 channels VAE -> SD3, Flux, WAN or Qwen Image
+                if (sd_version_is_sd3(version)) {
+                    latent_rgb_proj = sd3_latent_rgb_proj;
+                    latent_rgb_bias = sd3_latent_rgb_bias;
+                } else if (sd_version_is_flux(version)) {
+                    latent_rgb_proj = flux_latent_rgb_proj;
+                    latent_rgb_bias = flux_latent_rgb_bias;
+                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
+                    latent_rgb_proj = wan_21_latent_rgb_proj;
+                    latent_rgb_bias = wan_21_latent_rgb_bias;
+                }
+            } else if (dim == 4) {
+                // 4 channels VAE
+                if (sd_version_is_sdxl(version)) {
+                    latent_rgb_proj = sdxl_latent_rgb_proj;
+                    latent_rgb_bias = sdxl_latent_rgb_bias;
+                } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
+                    latent_rgb_proj = sd_latent_rgb_proj;
+                    latent_rgb_bias = sd_latent_rgb_bias;
+                }
+            }
+            if (latent_rgb_proj == nullptr) {
+                // unknown model or unknown latent space
+                LOG_WARN("No latent to RGB projection known for this model");
+                return;
+            }
+
+            uint32_t frames = 1;
+            if (ggml_n_dims(latents) == 4) {
+                frames = latents->ne[2];
+            }
+
+            const size_t frame_bytes = (size_t)width * height * channel;
+            uint8_t* data            = (uint8_t*)malloc(frames * frame_bytes);
+            sd_image_t* images       = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
+            if (data != nullptr && images != nullptr) {
+                preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
+                for (uint32_t i = 0; i < frames; i++) {
+                    images[i] = {width, height, channel, data + i * frame_bytes};
+                }
+                step_callback(step, frames, images);
+            } else {
+                LOG_WARN("failed to allocate memory for the latent preview");
+            }
+            free(data);  // free(NULL) is a no-op, so both paths share the cleanup
+            free(images);
+        } else {
+            if (preview_mode == PREVIEW_VAE) {
+                if (first_stage_model == nullptr) {
+                    LOG_WARN("VAE not found for preview");
+                    return;
+                }
+                process_latent_out(latents);
+                if (vae_tiling_params.enabled) {
+                    // split latent in 32x32 tiles and compute in several steps
+                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                        first_stage_model->compute(n_threads, in, true, &out, NULL);
+                    };
+                    silent_tiling(latents, result, 8, 32, 0.5f, on_tiling);
+                } else {
+                    first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
+                }
+
+                first_stage_model->free_compute_buffer();
+                process_vae_output_tensor(result);
+                process_latent_in(latents);  // presumably the inverse of process_latent_out above, restoring the caller's tensor
+            } else if (preview_mode == PREVIEW_TAE) {
+                if (tae_first_stage == nullptr) {
+                    LOG_WARN("TAE not found for preview");
+                    return;
+                }
+                if (vae_tiling_params.enabled) {
+                    // split latent in 64x64 tiles and compute in several steps
+                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                        tae_first_stage->compute(n_threads, in, true, &out, NULL);
+                    };
+                    silent_tiling(latents, result, 8, 64, 0.5f, on_tiling);
+                } else {
+                    tae_first_stage->compute(n_threads, latents, true, &result, work_ctx);
+                }
+                tae_first_stage->free_compute_buffer();
+            } else {
+                return;
+            }
+
+            ggml_tensor_clamp(result, 0.0f, 1.0f);
+            uint32_t frames = 1;
+            if (ggml_n_dims(latents) == 4) {
+                frames = result->ne[2];
+            }
+
+            sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
+            if (images != nullptr) {
+                for (uint32_t i = 0; i < frames; i++) {
+                    images[i].width   = result->ne[0];
+                    images[i].height  = result->ne[1];
+                    images[i].channel = 3;
+                    images[i].data    = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4);
+                }
+                step_callback(step, frames, images);
+                for (uint32_t i = 0; i < frames; i++) {
+                    free(images[i].data);
+                }
+                free(images);
+            } else {
+                LOG_WARN("failed to allocate memory for the preview images");
+            }
+            // Zero the preview buffer (scale by 0); the caller passes the same tensor again for the next preview.
+            ggml_tensor_scale(result, 0);
+        }
+    }
+
     ggml_tensor* sample(ggml_context* work_ctx,
                         std::shared_ptr<DiffusionModel> work_diffusion_model,
                         bool inverse_noise_scaling,
@@ -1185,6 +1345,35 @@ class StableDiffusionGGML {
 
         int64_t t0 = ggml_time_us();
 
+        struct ggml_tensor* preview_tensor = NULL;
+        auto sd_preview_mode               = sd_get_preview_mode();
+        if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
+            int64_t W = x->ne[0] * 8;
+            int64_t H = x->ne[1] * 8;
+            if (ggml_n_dims(x) == 4) {
+                // assuming video mode (if batch processing gets implemented this will break)
+                int T = x->ne[2];
+                if (sd_version_is_wan(version)) {
+                    T = ((T - 1) * 4) + 1;
+                    if (version == VERSION_WAN2_2_TI2V) {
+                        W = x->ne[0] * 16;
+                        H = x->ne[1] * 16;
+                    }
+                }
+                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                    W,
+                                                    H,
+                                                    T,
+                                                    3);
+            } else {
+                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                    W,
+                                                    H,
+                                                    3,
+                                                    x->ne[3]);
+            }
+        }
+
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
             if (step == 1 || step == -1) {
                 pretty_progress(0, (int)steps, 0);
@@ -1350,7 +1539,17 @@ class StableDiffusionGGML {
             if (denoise_mask != nullptr) {
                 apply_mask(denoised, init_latent, denoise_mask);
             }
-
+            if (step > 0) {
+                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
+                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
+            }
+            auto sd_preview_cb          = sd_get_preview_callback();
+            auto sd_preview_mode        = sd_get_preview_mode();
+            const auto preview_interval = sd_get_preview_interval();
+            // A non-positive interval disables previews; without this guard an interval of 0 would divide by zero in the modulo.
+            if (sd_preview_cb != NULL && sd_preview_mode != PREVIEW_NONE && preview_interval > 0 && step % preview_interval == 0) {
+                preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb);
+            }
             return denoised;
         };
 
@@ -1383,12 +1582,12 @@ class StableDiffusionGGML {
                                     -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                     0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
                 latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                    0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                    0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                    0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                    0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                    0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                    0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
             }
             for (int i = 0; i < latent->ne[3]; i++) {
                 float mean = latents_mean_vec[i];
@@ -1423,12 +1622,12 @@ class StableDiffusionGGML {
                                     -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                     0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
                 latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                    0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                    0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                    0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                    0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                    0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                    0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
             }
             for (int i = 0; i < latent->ne[3]; i++) {
                 float mean = latents_mean_vec[i];
@@ -1796,6 +1995,29 @@ enum prediction_t str_to_prediction(const char* str) {
     return PREDICTION_COUNT;
 }
 
+const char* preview_to_str[] = {  // names indexed by enum preview_t value; keep in the enum's order
+    "none",  // PREVIEW_NONE
+    "proj",  // PREVIEW_PROJ
+    "tae",   // PREVIEW_TAE
+    "vae",   // PREVIEW_VAE
+};
+
+const char* sd_preview_name(enum preview_t preview) {
+    if (preview >= PREVIEW_COUNT) {  // value has no entry in preview_to_str
+        return NONE_STR;
+    }
+    return preview_to_str[preview];  // the table is indexed by the enum value
+}
+
+enum preview_t str_to_preview(const char* str) {
+    // Linear scan of the name table; PREVIEW_COUNT doubles as the "no match" result.
+    int mode = 0;
+    while (mode < PREVIEW_COUNT && strcmp(str, preview_to_str[mode]) != 0) {
+        ++mode;
+    }
+    return (enum preview_t)mode;
+}
+
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     *sd_ctx_params                         = {};
     sd_ctx_params->vae_decode_only         = true;
diff --git a/stable-diffusion.h b/stable-diffusion.h
index f618d457..e82a7fd8 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -126,6 +126,14 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
+enum preview_t {  // preview mode handed to sd_set_preview_callback
+    PREVIEW_NONE,  // named "none" in preview_to_str
+    PREVIEW_PROJ,  // named "proj"
+    PREVIEW_TAE,   // named "tae"
+    PREVIEW_VAE,   // named "vae"
+    PREVIEW_COUNT  // number of modes; str_to_preview returns this for an unrecognised name
+};
+
 typedef struct {
     bool enabled;
     int tile_size_x;
@@ -162,6 +170,7 @@ typedef struct {
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
     bool diffusion_flash_attn;
+    bool tae_preview_only;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
     bool force_sdxl_vae_conv_scale;
@@ -254,9 +263,11 @@ typedef struct sd_ctx_t sd_ctx_t;
 
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
+typedef void (*sd_preview_cb_t)(int, int, sd_image_t*);  // NOTE(review): parameters are unnamed and the call site is not visible here - presumably (step, frame count, frames); confirm against preview_image
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval);  // cb == NULL turns previews off; interval = sampling steps between preview updates
 SD_API int32_t get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
@@ -270,6 +281,8 @@ SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
 SD_API enum scheduler_t str_to_schedule(const char* str);
 SD_API const char* sd_prediction_name(enum prediction_t prediction);
 SD_API enum prediction_t str_to_prediction(const char* str);
+SD_API const char* sd_preview_name(enum preview_t preview);  // mode -> name; a fallback string (NONE_STR) when the value has no name
+SD_API enum preview_t str_to_preview(const char* str);       // name -> mode; PREVIEW_COUNT when nothing matches
 
 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
diff --git a/util.cpp b/util.cpp
index d6d06752..23b6c3b2 100644
--- a/util.cpp
+++ b/util.cpp
@@ -188,6 +188,10 @@ int32_t get_num_physical_cores() {
 static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;
 
+static sd_preview_cb_t sd_preview_cb = NULL;  // preview callback stored by sd_set_preview_callback; NULL until one is set
+preview_t sd_preview_mode         = PREVIEW_NONE;  // mode stored together with the callback
+int sd_preview_interval              = 1;  // step interval stored together with the callback
+
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
     return converter.from_bytes(utf8_str);
@@ -331,6 +335,29 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
     sd_progress_cb      = cb;
     sd_progress_cb_data = data;
 }
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1) {  // defaults apply only to calls inside this file; the header declaration has none
+    sd_preview_cb       = cb;  // NULL disables previews: the sampler checks the callback against NULL before previewing
+    sd_preview_mode     = mode;
+    sd_preview_interval = interval > 0 ? interval : 1;  // the sampler computes `step % interval`; clamp so the divisor can never be zero
+}
+
+sd_preview_cb_t sd_get_preview_callback() {  // read accessor for the file-local preview callback (NULL when none is set)
+    return sd_preview_cb;
+}
+
+preview_t sd_get_preview_mode() {  // mode last stored by sd_set_preview_callback (PREVIEW_NONE initially)
+    return sd_preview_mode;
+}
+int sd_get_preview_interval() {  // interval last stored by sd_set_preview_callback (1 initially)
+    return sd_preview_interval;
+}
+
+sd_progress_cb_t sd_get_progress_callback() {  // read accessor for the file-local progress callback set via sd_set_progress_callback
+    return sd_progress_cb;
+}
+void* sd_get_progress_callback_data() {  // user data pointer stored together with the progress callback
+    return sd_progress_cb_data;
+}
 const char* sd_get_system_info() {
     static char buffer[1024];
     std::stringstream ss;
diff --git a/util.h b/util.h
index 17bcd1d3..3e34a2f7 100644
--- a/util.h
+++ b/util.h
@@ -54,6 +54,13 @@ std::string trim(const std::string& s);
 
 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
 
+sd_progress_cb_t sd_get_progress_callback();  // callback registered through sd_set_progress_callback
+void* sd_get_progress_callback_data();        // user data pointer registered with that callback
+
+sd_preview_cb_t sd_get_preview_callback();  // callback registered through sd_set_preview_callback
+preview_t sd_get_preview_mode();            // preview mode registered with that callback
+int sd_get_preview_interval();              // step interval registered with that callback
+
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)