@@ -806,6 +806,11 @@ class StableDiffusionGGML {
806806 float skip_layer_start = 0.01 ,
807807 float skip_layer_end = 0.2 ,
808808 ggml_tensor* noise_mask = nullptr ) {
809+
810+ // TODO (Pix2Pix): add a dedicated image-guidance scale parameter; for now img_cfg_scale reuses the distilled guidance value (`guidance`)
811+
812+ float img_cfg_scale = guidance;
813+
809814 LOG_DEBUG (" Sample" );
810815 struct ggml_init_params params;
811816 size_t data_size = ggml_row_size (init_latent->type , init_latent->ne [0 ]);
@@ -828,12 +833,15 @@ class StableDiffusionGGML {
828833 struct ggml_tensor * noised_input = ggml_dup_tensor (work_ctx, noise);
829834
830835 bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL ;
836+ bool has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale;
837+ has_unconditioned = has_unconditioned || has_img_guidance;
831838 bool has_skiplayer = slg_scale != 0.0 && skip_layers.size () > 0 ;
832839
833840 // denoise wrapper
834- struct ggml_tensor * out_cond = ggml_dup_tensor (work_ctx, x);
835- struct ggml_tensor * out_uncond = NULL ;
836- struct ggml_tensor * out_skip = NULL ;
841+ struct ggml_tensor * out_cond = ggml_dup_tensor (work_ctx, x);
842+ struct ggml_tensor * out_uncond = NULL ;
843+ struct ggml_tensor * out_skip = NULL ;
844+ struct ggml_tensor * out_img_cond = NULL ;
837845
838846 if (has_unconditioned) {
839847 out_uncond = ggml_dup_tensor (work_ctx, x);
@@ -846,6 +854,9 @@ class StableDiffusionGGML {
846854 LOG_WARN (" SLG is incompatible with %s models" , model_version_to_str[version]);
847855 }
848856 }
857+ if (has_img_guidance) {
858+ out_img_cond = ggml_dup_tensor (work_ctx, x);
859+ }
849860 struct ggml_tensor * denoised = ggml_dup_tensor (work_ctx, x);
850861
851862 auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -927,6 +938,22 @@ class StableDiffusionGGML {
927938 negative_data = (float *)out_uncond->data ;
928939 }
929940
941+ float * img_cond_data = NULL ;
942+ if (has_img_guidance) {
943+ diffusion_model->compute (n_threads,
944+ noised_input,
945+ timesteps,
946+ uncond.c_crossattn ,
947+ cond.c_concat ,
948+ uncond.c_vector ,
949+ guidance_tensor,
950+ -1 ,
951+ controls,
952+ control_strength,
953+ &out_img_cond);
954+ img_cond_data = (float *)out_img_cond->data ;
955+ }
956+
930957 int step_count = sigmas.size ();
931958 bool is_skiplayer_step = has_skiplayer && step > (int )(skip_layer_start * step_count) && step < (int )(skip_layer_end * step_count);
932959 float * skip_layer_data = NULL ;
@@ -961,7 +988,11 @@ class StableDiffusionGGML {
961988 int64_t i3 = i / out_cond->ne [0 ] * out_cond->ne [1 ] * out_cond->ne [2 ];
962989 float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1 .0f / ne3);
963990 } else {
964- latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
991+ if (has_img_guidance){
992+ latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
993+ } else {
994+ latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
995+ }
965996 }
966997 }
967998 if (is_skiplayer_step) {
@@ -1362,7 +1393,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
13621393 sd_ctx->sd ->diffusion_model ->get_adm_in_channels ());
13631394
13641395 SDCondition uncond;
1365- if (cfg_scale != 1.0 ) {
1396+ if (cfg_scale != 1.0 || sd_ctx-> sd -> version == VERSION_INSTRUCT_PIX2PIX && cfg_scale!=guidance ) {
13661397 bool force_zero_embeddings = false ;
13671398 if (sd_version_is_sdxl (sd_ctx->sd ->version ) && negative_prompt.size () == 0 ) {
13681399 force_zero_embeddings = true ;
0 commit comments