Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e8ac336
fast latent image preview
stduhpf Jul 10, 2025
de9c492
fix posix compile
stduhpf Jul 10, 2025
ee4aef8
move latent preview code to a separate file
stduhpf Jul 10, 2025
75a9abd
Latent preview support for img2img and img2vid
stduhpf Jul 10, 2025
8dcb814
add latent-preview to .gitignore
stduhpf Jul 10, 2025
ef62078
Refactor latent preview + support tae/vae preview
stduhpf Jul 10, 2025
2cedeb5
update usage
stduhpf Jul 10, 2025
be0a442
Fix build + add warning
stduhpf Jul 10, 2025
31b0fdd
Disable preview by default in sdcpp too
stduhpf Jul 10, 2025
95fd31c
Done not preload preview tensor when preview is disabled.
stduhpf Jul 10, 2025
cbd8c99
Fix VAE preview darkening
stduhpf Jul 10, 2025
c3d72c0
Increase context memory when loading multiple auto encoders
stduhpf Jul 10, 2025
8059ac3
Increase context memory when previewing with auto encoder instead
stduhpf Jul 10, 2025
8e6024f
fix compile warnings
stduhpf Jul 10, 2025
19ac567
fix print-params
stduhpf Jul 10, 2025
430f7d8
fix preview with unet inpaint models
stduhpf Jul 10, 2025
2272068
do not spam pretty progress when using tiled vae/tae as preview
stduhpf Jul 10, 2025
eeca697
change log level of "processing %i tiles"
stduhpf Jul 10, 2025
beb0e91
Refactor preview to match the other callbacks
stduhpf Jul 10, 2025
d465a70
preview: new API
stduhpf Jul 23, 2025
55ef7be
latent proj bias
stduhpf Aug 31, 2025
86e5c49
Merge branch 'master' into image-preview
stduhpf Sep 9, 2025
a5278ce
fix merge issues
stduhpf Sep 9, 2025
030aa3d
add wan latent projs
stduhpf Aug 30, 2025
4c536b5
animated previews
stduhpf Aug 30, 2025
7a0ab28
latent proj bias
stduhpf Aug 31, 2025
3e0ef27
fix dup
stduhpf Sep 9, 2025
2ba5a43
Merge branch 'master' into image-preview
stduhpf Sep 12, 2025
a57c3f4
Merge branch 'master' into image-preview
stduhpf Oct 15, 2025
70a1611
Support latent2rgb preview for qwen image (via wan21)
stduhpf Oct 15, 2025
e2ce17d
Fix ctx memory pool size overwritten during merge
stduhpf Oct 15, 2025
05bf92a
Merge branch 'master' into image-preview
stduhpf Oct 16, 2025
1bae409
Merge remote-tracking branch 'origin' into image-preview
stduhpf Oct 19, 2025
f7b53e5
fix build and update help messages
stduhpf Oct 19, 2025
0a59f36
update help message in readme
stduhpf Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ test/
output*.png
models*
*.log
preview.png
5 changes: 5 additions & 0 deletions examples/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Options:
-o, --output <string> path to write result image to (default: ./output.png)
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
--preview-path <string> path to write preview image to (default: ./preview.png)
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
Expand All @@ -48,6 +49,8 @@ Options:
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
Expand Down Expand Up @@ -86,6 +89,7 @@ Options:
--chroma-enable-t5-mask enable t5 mask for chroma
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
Expand All @@ -107,4 +111,5 @@ Options:
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
```
82 changes: 82 additions & 0 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ const char* modes_str[] = {
};
#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"

// Command-line spellings of the preview methods. The table is indexed with
// preview_t values (e.g. previews_str[PREVIEW_TAE]), so the entry order has to
// follow the enum's numeric order.
const char* previews_str[] = {"none", "proj", "tae", "vae"};

enum SDMode {
IMG_GEN,
VID_GEN,
Expand Down Expand Up @@ -135,11 +142,17 @@ struct SDParams {
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
bool force_sdxl_vae_conv_scale = false;

preview_t preview_method = PREVIEW_NONE;
int preview_interval = 1;
std::string preview_path = "preview.png";
bool taesd_preview = false;

SDParams() {
sd_sample_params_init(&sample_params);
sd_sample_params_init(&high_noise_sample_params);
high_noise_sample_params.sample_steps = -1;
}

};

void print_params(SDParams params) {
Expand Down Expand Up @@ -210,6 +223,8 @@ void print_params(SDParams params) {
printf(" video_frames: %d\n", params.video_frames);
printf(" vace_strength: %.2f\n", params.vace_strength);
printf(" fps: %d\n", params.fps);
printf(" preview_mode: %s\n", previews_str[params.preview_method]);
printf(" preview_interval: %d\n", params.preview_interval);
free(sample_params_str);
free(high_noise_sample_params_str);
}
Expand Down Expand Up @@ -589,6 +604,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
"--negative-prompt",
"the negative prompt (default: \"\")",
&params.negative_prompt},
{"",
"--preview-path",
"path to write preview image to (default: ./preview.png)",
&params.preview_path},
{"",
"--upscale-model",
"path to esrgan model.",
Expand Down Expand Up @@ -647,6 +666,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
"shift timestep for NitroFusion models (default: 0). "
"recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
&params.sample_params.shifted_timestep},
{"",
"--preview-interval",
"interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
&params.preview_interval},
};

options.float_options = {
Expand Down Expand Up @@ -801,6 +824,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
"--disable-auto-resize-ref-image",
"disable auto resize of ref images",
false, &params.auto_resize_ref_image},
{"",
"--taesd-preview-only",
std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")",
false, &params.taesd_preview},
};

auto on_mode_arg = [&](int argc, const char** argv, int index) {
Expand Down Expand Up @@ -1046,6 +1073,26 @@ void parse_args(int argc, const char** argv, SDParams& params) {
return 1;
};

auto on_preview_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
}
const char* preview = argv[index];
int preview_method = -1;
for (int m = 0; m < PREVIEW_COUNT; m++) {
if (!strcmp(preview, previews_str[m])) {
preview_method = m;
}
}
if (preview_method == -1) {
fprintf(stderr, "error: preview method %s\n",
preview);
return -1;
}
params.preview_method = (preview_t)preview_method;
return 1;
};

options.manual_options = {
{"-M",
"--mode",
Expand Down Expand Up @@ -1110,6 +1157,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
"--vae-relative-tile-size",
"relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
on_relative_tile_size_arg},
{"",
"--preview",
std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
on_preview_arg},
};

if (!parse_options(argc, argv, options)) {
Expand Down Expand Up @@ -1452,15 +1503,45 @@ bool load_images_from_dir(const std::string dir,
return true;
}

// Destination file of the preview. This is an owning std::string on purpose:
// main() assigns it from c_str() of strings that do not outlive the assignment
// (including a temporary), so storing a bare pointer here would leave the
// callback reading freed memory.
std::string preview_path;
// Frame rate passed to the AVI writer for multi-frame previews.
float preview_fps = 0.0f;

/**
 * Preview callback: writes the current preview to preview_path.
 *
 * @param step        denoising step that produced the preview (unused here)
 * @param frame_count number of images pointed to by `image`
 * @param image       preview frame(s); a single frame is written as PNG,
 *                    several frames are written as an MJPG AVI at preview_fps
 */
void step_callback(int step, int frame_count, sd_image_t* image) {
    (void)step;
    // Nothing sensible can be written without frames or a destination.
    if (image == nullptr || frame_count < 1 || preview_path.empty()) {
        return;
    }
    if (frame_count == 1) {
        stbi_write_png(preview_path.c_str(), image->width, image->height, image->channel, image->data, 0);
    } else {
        create_mjpg_avi_from_sd_images(preview_path.c_str(), image, frame_count, preview_fps);
    }
}

int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
preview_path = params.preview_path.c_str();
if (params.video_frames > 4) {
size_t last_dot_pos = params.preview_path.find_last_of(".");
std::string base_path = params.preview_path;
std::string file_ext = "";
if (last_dot_pos != std::string::npos) { // filename has extension
base_path = params.preview_path.substr(0, last_dot_pos);
file_ext = params.preview_path.substr(last_dot_pos);
std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
}
if (file_ext == ".png") {
preview_path = (base_path + ".avi").c_str();
}
}
preview_fps = params.fps;
if (params.preview_method == PREVIEW_PROJ)
preview_fps /= 4.0f;

params.sample_params.guidance.slg.layers = params.skip_layers.data();
params.sample_params.guidance.slg.layer_count = params.skip_layers.size();
params.high_noise_sample_params.guidance.slg.layers = params.high_noise_skip_layers.data();
params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();

sd_set_log_callback(sd_log_cb, (void*)&params);
sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);

if (params.verbose) {
print_params(params);
Expand Down Expand Up @@ -1654,6 +1735,7 @@ int main(int argc, const char* argv[]) {
params.control_net_cpu,
params.vae_on_cpu,
params.diffusion_flash_attn,
params.taesd_preview,
params.diffusion_conv_direct,
params.vae_conv_direct,
params.force_sdxl_vae_conv_scale,
Expand Down
2 changes: 1 addition & 1 deletion ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
int num_tiles = num_tiles_x * num_tiles_y;
LOG_INFO("processing %i tiles", num_tiles);
LOG_DEBUG("processing %i tiles", num_tiles);
pretty_progress(0, num_tiles, 0.0f);
int tile_count = 1;
bool last_y = false, last_x = false;
Expand Down
162 changes: 162 additions & 0 deletions latent-preview.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Latent -> RGB projection named for Wan 2.1: 16 rows of {R, G, B} weights plus
// a per-colour bias. The shape matches the latent_rgb_proj / latent_rgb_bias
// parameters of preview_latent_video (one row per latent channel).
const float wan_21_latent_rgb_proj[16][3] = {
    {-0.1299f, -0.1692f, 0.2932f},
    {0.0671f, 0.0406f, 0.0442f},
    {0.3568f, 0.2548f, 0.1747f},
    {0.0372f, 0.2344f, 0.1420f},
    {0.0313f, 0.0189f, -0.0328f},
    {0.0296f, -0.0956f, -0.0665f},
    {-0.3477f, -0.4059f, -0.2925f},
    {0.0166f, 0.1902f, 0.1975f},
    {-0.0412f, 0.0267f, -0.1364f},
    {-0.1293f, 0.0740f, 0.1636f},
    {0.0680f, 0.3019f, 0.1128f},
    {0.0032f, 0.0581f, 0.0639f},
    {-0.1251f, 0.0927f, 0.1699f},
    {0.0060f, -0.0633f, 0.0005f},
    {0.3477f, 0.2275f, 0.2950f},
    {0.1984f, 0.0913f, 0.1861f}};
// `static` keeps this mutable array local to each translation unit; without it
// the header defines an external-linkage object in every file that includes it.
static float wan_21_latent_rgb_bias[3] = {-0.1223f, -0.1889f, -0.1976f};

// Latent -> RGB projection named for Wan 2.2: 48 rows of {R, G, B} weights plus
// a per-colour bias. The shape matches the latent_rgb_proj / latent_rgb_bias
// parameters of preview_latent_video (one row per latent channel).
const float wan_22_latent_rgb_proj[48][3] = {
    {0.0119f, 0.0103f, 0.0046f},
    {-0.1062f, -0.0504f, 0.0165f},
    {0.0140f, 0.0409f, 0.0491f},
    {-0.0813f, -0.0677f, 0.0607f},
    {0.0656f, 0.0851f, 0.0808f},
    {0.0264f, 0.0463f, 0.0912f},
    {0.0295f, 0.0326f, 0.0590f},
    {-0.0244f, -0.0270f, 0.0025f},
    {0.0443f, -0.0102f, 0.0288f},
    {-0.0465f, -0.0090f, -0.0205f},
    {0.0359f, 0.0236f, 0.0082f},
    {-0.0776f, 0.0854f, 0.1048f},
    {0.0564f, 0.0264f, 0.0561f},
    {0.0006f, 0.0594f, 0.0418f},
    {-0.0319f, -0.0542f, -0.0637f},
    {-0.0268f, 0.0024f, 0.0260f},
    {0.0539f, 0.0265f, 0.0358f},
    {-0.0359f, -0.0312f, -0.0287f},
    {-0.0285f, -0.1032f, -0.1237f},
    {0.1041f, 0.0537f, 0.0622f},
    {-0.0086f, -0.0374f, -0.0051f},
    {0.0390f, 0.0670f, 0.2863f},
    {0.0069f, 0.0144f, 0.0082f},
    {0.0006f, -0.0167f, 0.0079f},
    {0.0313f, -0.0574f, -0.0232f},
    {-0.1454f, -0.0902f, -0.0481f},
    {0.0714f, 0.0827f, 0.0447f},
    {-0.0304f, -0.0574f, -0.0196f},
    {0.0401f, 0.0384f, 0.0204f},
    {-0.0758f, -0.0297f, -0.0014f},
    {0.0568f, 0.1307f, 0.1372f},
    {-0.0055f, -0.0310f, -0.0380f},
    {0.0239f, -0.0305f, 0.0325f},
    {-0.0663f, -0.0673f, -0.0140f},
    {-0.0416f, -0.0047f, -0.0023f},
    {0.0166f, 0.0112f, -0.0093f},
    {-0.0211f, 0.0011f, 0.0331f},
    {0.1833f, 0.1466f, 0.2250f},
    {-0.0368f, 0.0370f, 0.0295f},
    {-0.3441f, -0.3543f, -0.2008f},
    {-0.0479f, -0.0489f, -0.0420f},
    {-0.0660f, -0.0153f, 0.0800f},
    {-0.0101f, 0.0068f, 0.0156f},
    {-0.0690f, -0.0452f, -0.0927f},
    {-0.0145f, 0.0041f, 0.0015f},
    {0.0421f, 0.0451f, 0.0373f},
    {0.0504f, -0.0483f, -0.0356f},
    {-0.0837f, 0.0168f, 0.0055f}};
// `static` keeps this mutable array local to each translation unit; without it
// the header defines an external-linkage object in every file that includes it.
static float wan_22_latent_rgb_bias[3] = {0.0317f, -0.0878f, -0.1388f};

// Latent -> RGB projection for Flux: 16 rows of {R, G, B} weights plus a
// per-colour bias, shaped for the latent_rgb_proj / latent_rgb_bias parameters
// of preview_latent_video.
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
const float flux_latent_rgb_proj[16][3] = {
    {-0.0346f, 0.0244f, 0.0681f},
    {0.0034f, 0.0210f, 0.0687f},
    {0.0275f, -0.0668f, -0.0433f},
    {-0.0174f, 0.0160f, 0.0617f},
    {0.0859f, 0.0721f, 0.0329f},
    {0.0004f, 0.0383f, 0.0115f},
    {0.0405f, 0.0861f, 0.0915f},
    {-0.0236f, -0.0185f, -0.0259f},
    {-0.0245f, 0.0250f, 0.1180f},
    {0.1008f, 0.0755f, -0.0421f},
    {-0.0515f, 0.0201f, 0.0011f},
    {0.0428f, -0.0012f, -0.0036f},
    {0.0817f, 0.0765f, 0.0749f},
    {-0.1264f, -0.0522f, -0.1103f},
    {-0.0280f, -0.0881f, -0.0499f},
    {-0.1262f, -0.0982f, -0.0778f}};
// `static` keeps this mutable array local to each translation unit; without it
// the header defines an external-linkage object in every file that includes it.
static float flux_latent_rgb_bias[3] = {-0.0329f, -0.0718f, -0.0851f};

// Latent -> RGB projection for SD3: 16 rows of {R, G, B} weights and an
// all-zero bias, shaped for the latent_rgb_proj / latent_rgb_bias parameters
// of preview_latent_video.
// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
const float sd3_latent_rgb_proj[16][3] = {
    {-0.0645f, 0.0177f, 0.1052f},
    {0.0028f, 0.0312f, 0.0650f},
    {0.1848f, 0.0762f, 0.0360f},
    {0.0944f, 0.0360f, 0.0889f},
    {0.0897f, 0.0506f, -0.0364f},
    {-0.0020f, 0.1203f, 0.0284f},
    {0.0855f, 0.0118f, 0.0283f},
    {-0.0539f, 0.0658f, 0.1047f},
    {-0.0057f, 0.0116f, 0.0700f},
    {-0.0412f, 0.0281f, -0.0039f},
    {0.1106f, 0.1171f, 0.1220f},
    {-0.0248f, 0.0682f, -0.0481f},
    {0.0815f, 0.0846f, 0.1207f},
    {-0.0120f, -0.0055f, -0.0867f},
    {-0.0749f, -0.0634f, -0.0456f},
    {-0.1418f, -0.1457f, -0.1259f},
};
// `static` keeps this mutable array local to each translation unit; without it
// the header defines an external-linkage object in every file that includes it.
static float sd3_latent_rgb_bias[3] = {0.0f, 0.0f, 0.0f};

// Latent -> RGB projection for SDXL: 4 rows of {R, G, B} weights plus a
// per-colour bias, shaped for the latent_rgb_proj / latent_rgb_bias parameters
// of preview_latent_video.
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
const float sdxl_latent_rgb_proj[4][3] = {
    {0.3651f, 0.4232f, 0.4341f},
    {-0.2533f, -0.0042f, 0.1068f},
    {0.1076f, 0.1111f, -0.0362f},
    {-0.3165f, -0.2492f, -0.2188f}};
// `static` keeps this mutable array local to each translation unit; without it
// the header defines an external-linkage object in every file that includes it.
static float sdxl_latent_rgb_bias[3] = {0.1084f, -0.0175f, -0.0011f};

// Latent -> RGB projection for SD (sd_ prefix): 4 rows of {R, G, B} weights and
// an all-zero bias, shaped for the latent_rgb_proj / latent_rgb_bias parameters
// of preview_latent_video.
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
// NOTE(review): this is the same line range as cited for the SDXL table - confirm the link.
const float sd_latent_rgb_proj[4][3] = {
    {0.3512f, 0.2297f, 0.3227f},
    {0.3250f, 0.4974f, 0.2350f},
    {-0.2829f, 0.1762f, 0.2721f},
    {-0.2120f, -0.2616f, -0.7177f}};
// `static` keeps this mutable array local to each translation unit; without it
// the header defines an external-linkage object in every file that includes it.
static float sd_latent_rgb_bias[3] = {0.0f, 0.0f, 0.0f};

/**
 * Render a cheap RGB preview of a latent tensor with a linear projection.
 *
 * For every (frame, row, column) position the `dim` latent channel values are
 * read as floats, combined into R/G/B with `latent_rgb_proj`, offset by
 * `latent_rgb_bias`, remapped with x * 0.5 + 0.5, clamped to [0, 1] and stored
 * as three consecutive bytes.
 *
 * @param buffer          output, filled frame by frame, row by row, 3 bytes per
 *                        pixel; must hold frames * height * width * 3 bytes
 * @param latents         source tensor; addressed through its byte strides
 *                        nb[0] (column), nb[1] (row), nb[2] (frame) and the
 *                        stride of its last non-trivial dimension (channel)
 * @param latent_rgb_proj `dim` rows of {R, G, B} weights
 * @param latent_rgb_bias {R, G, B} offsets; a null pointer is treated as zero
 * @param width           number of columns to read per row
 * @param height          number of rows to read per frame
 * @param frames          number of frames to read
 * @param dim             number of latent channels to project
 *
 * Declared static inline because the definition lives in a header.
 */
static inline void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
    if (buffer == nullptr || latents == nullptr || latents->data == nullptr || latent_rgb_proj == nullptr) {
        return;
    }

    const char* base = static_cast<const char*>(latents->data);
    // The channel axis is the last dimension ggml reports for the tensor; its
    // stride does not change during the loops, so look it up once.
    const size_t channel_stride = latents->nb[ggml_n_dims(latents) - 1];

    // Map a projected value to a byte: change range, clamp to [0,1], quantize.
    // The comparison form also sends NaN to 0.
    auto to_byte = [](float v) -> uint8_t {
        v = v * .5f + .5f;
        v = v >= 0 ? (v <= 1 ? v : 1) : 0;
        return (uint8_t)(v * 255);
    };

    size_t buffer_head = 0;
    for (int k = 0; k < frames; k++) {
        for (int j = 0; j < height; j++) {
            for (int i = 0; i < width; i++) {
                const size_t pixel_offset = i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2];

                float r = 0, g = 0, b = 0;
                for (int d = 0; d < dim; d++) {
                    const float value = *reinterpret_cast<const float*>(base + pixel_offset + d * channel_stride);
                    r += value * latent_rgb_proj[d][0];
                    g += value * latent_rgb_proj[d][1];
                    b += value * latent_rgb_proj[d][2];
                }

                if (latent_rgb_bias != nullptr) {
                    r += latent_rgb_bias[0];
                    g += latent_rgb_bias[1];
                    b += latent_rgb_bias[2];
                }

                buffer[buffer_head++] = to_byte(r);
                buffer[buffer_head++] = to_byte(g);
                buffer[buffer_head++] = to_byte(b);
            }
        }
    }
}
Loading
Loading