@@ -848,6 +848,7 @@ class StableDiffusionGGML {
848848 int start_merge_step,
849849 SDCondition id_cond,
850850 std::vector<ggml_tensor*> ref_latents = {},
851+ sd_apg_params_t apg_params = {1 , 0 , 0 },
851852 ggml_tensor* denoise_mask = nullptr ) {
852853 std::vector<int > skip_layers (guidance.slg .layers , guidance.slg .layers + guidance.slg .layer_count );
853854
@@ -909,6 +910,10 @@ class StableDiffusionGGML {
909910 }
910911 struct ggml_tensor * denoised = ggml_dup_tensor (work_ctx, x);
911912
913+ std::vector<float > apg_momentum_buffer;
914+ if (apg_params.momentum != 0 )
915+ apg_momentum_buffer.resize ((size_t )ggml_nelements (denoised));
916+
912917 auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
913918 if (step == 1 ) {
914919 pretty_progress (0 , (int )steps, 0 );
@@ -1034,6 +1039,56 @@ class StableDiffusionGGML {
10341039 float * vec_input = (float *)input->data ;
10351040 float * positive_data = (float *)out_cond->data ;
10361041 int ne_elements = (int )ggml_nelements (denoised);
1042+
1043+ float * deltas = vec_denoised;
1044+
1045+ // https://arxiv.org/pdf/2410.02416
1046+ float apg_scale_factor = 1 .;
1047+ float diff_norm = 0 ;
1048+ float cond_norm_sq = 0 ;
1049+ float dot = 0 ;
1050+ if (has_unconditioned || has_img_cond) {
1051+ for (int i = 0 ; i < ne_elements; i++) {
1052+ float delta;
1053+ if (has_img_cond) {
1054+ if (cfg_scale == 1 ) {
1055+ // Weird guidance (important: use img_cfg_scale instead of cfg_scale in the final formula)
1056+ delta = img_cond_data[i] - negative_data[i];
1057+ } else if (has_unconditioned) {
1058+ // 2-conditioning CFG (img_cfg_scale != cfg_scale != 1)
1059+ delta = positive_data[i] + (negative_data[i] * (1 - img_cfg_scale) + img_cond_data[i] * (img_cfg_scale - cfg_scale)) / (cfg_scale - 1 );
1060+ } else {
1061+ // pure img CFG (img_cfg_scale == 1, cfg_scale !=1)
1062+ delta = positive_data[i] - img_cond_data[i];
1063+ }
1064+ } else {
1065+ // classic CFG (img_cfg_scale == cfg_scale != 1)
1066+ delta = positive_data[i] - negative_data[i];
1067+ }
1068+ deltas[i] = delta;
1069+ }
1070+ if (apg_params.norm_treshold > 0 ) {
1071+ diff_norm = sqrtf (diff_norm);
1072+ apg_scale_factor = std::min (1 .0f , apg_params.norm_treshold / diff_norm);
1073+ }
1074+ if (apg_params.eta != 1 .0f ) {
1075+ dot *= apg_scale_factor;
1076+ // pre-normalize (avoids one square root and ne_elements extra divs)
1077+ dot /= cond_norm_sq;
1078+ }
1079+
1080+ for (int i = 0 ; i < ne_elements; i++) {
1081+ deltas[i] *= apg_scale_factor;
1082+ if (apg_params.eta != 1 .0f ) {
1083+ float apg_parallel = dot * positive_data[i];
1084+ float apg_orthogonal = deltas[i] - apg_parallel;
1085+
1086+ // tweak deltas
1087+ deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel;
1088+ }
1089+ }
1090+ }
1091+
10371092 for (int i = 0 ; i < ne_elements; i++) {
10381093 float latent_result = positive_data[i];
10391094 if (has_unconditioned) {
@@ -1043,12 +1098,12 @@ class StableDiffusionGGML {
10431098 int64_t i3 = i / out_cond->ne [0 ] * out_cond->ne [1 ] * out_cond->ne [2 ];
10441099 float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1 .0f / ne3);
10451100 } else {
1046- if (has_img_cond) {
1047- // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
1048- latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
1049- } else {
1050- // img_cfg_scale == cfg_scale
1051- latent_result = negative_data [i] + cfg_scale * (positive_data[i] - negative_data[i]) ;
1101+ float delta = deltas[i];
1102+
1103+ if ( cfg_scale != 1 ) {
1104+ latent_result = positive_data[i] + (cfg_scale - 1 ) * delta;
1105+ } else if (has_img_cond) {
1106+ latent_result = positive_data [i] + (img_cfg_scale - 1 ) * delta ;
10521107 }
10531108 }
10541109 } else if (has_img_cond) {
@@ -1096,7 +1151,8 @@ class StableDiffusionGGML {
10961151 }
10971152
10981153 // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
1099- ggml_tensor* get_first_stage_encoding (ggml_context* work_ctx, ggml_tensor* moments) {
1154+ ggml_tensor*
1155+ get_first_stage_encoding (ggml_context* work_ctx, ggml_tensor* moments) {
11001156 // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
11011157 ggml_tensor* latent = ggml_new_tensor_4d (work_ctx, moments->type , moments->ne [0 ], moments->ne [1 ], moments->ne [2 ] / 2 , moments->ne [3 ]);
11021158 struct ggml_tensor * noise = ggml_dup_tensor (work_ctx, latent);
@@ -1529,6 +1585,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
15291585 std::string input_id_images_path,
15301586 std::vector<ggml_tensor*> ref_latents,
15311587 ggml_tensor* concat_latent = NULL ,
1588+ sd_apg_params_t apg_params = {},
15321589 ggml_tensor* denoise_mask = NULL ) {
15331590 if (seed < 0 ) {
15341591 // Generally, when using the provided command line, the seed is always >0.
@@ -1798,6 +1855,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
17981855 start_merge_step,
17991856 id_cond,
18001857 ref_latents,
1858+ apg_params,
18011859 denoise_mask);
18021860
18031861 // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
@@ -1872,7 +1930,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
18721930 return init_latent;
18731931}
18741932
1875- sd_image_t * generate_image (sd_ctx_t * sd_ctx, const sd_img_gen_params_t * sd_img_gen_params) {
1933+ sd_image_t * generate_image (sd_ctx_t * sd_ctx, const sd_img_gen_params_t * sd_img_gen_params, sd_apg_params_t apg_params ) {
18761934 int width = sd_img_gen_params->width ;
18771935 int height = sd_img_gen_params->height ;
18781936 LOG_DEBUG (" generate_image %dx%d" , width, height);
@@ -2072,6 +2130,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
20722130 sd_img_gen_params->input_id_images_path ,
20732131 ref_latents,
20742132 concat_latent,
2133+ apg_params,
20752134 denoise_mask);
20762135
20772136 size_t t2 = ggml_time_ms ();
0 commit comments