Add gradient estimation sampler

SkutteOleg · SkutteOleg · commit d3a4d5a15b2f · 2025-10-20T03:28:32.000+04:00
diff --git a/denoiser.hpp b/denoiser.hpp
@@ -576,7 +576,8 @@ static void sample_k_diffusion(sample_method_t method,
                                ggml_tensor* x,
                                std::vector<float> sigmas,
                                std::shared_ptr<RNG> rng,
-                               float eta) {
+                               float eta,
+                               float ge_gamma) {
     size_t steps = sigmas.size() - 1;
     // sample_euler_ancestral
     switch (method) {
@@ -1462,7 +1463,52 @@ static void sample_k_diffusion(sample_method_t method,
                 }
             }
         } break;
+        case GRADIENT_ESTIMATION: {
+            // Euler steps corrected by (ge_gamma - 1) * (d - old_d); ge_gamma == 1 is plain Euler.
+            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);  // derivative at the current step
+            struct ggml_tensor* old_d = ggml_dup_tensor(work_ctx, x);  // derivative at the previous step
 
+            for (int i = 0; i < steps; i++) {
+                float sigma      = sigmas[i];
+                float sigma_next = sigmas[i + 1];
+                float dt         = sigma_next - sigma;
+
+                ggml_tensor* denoised = model(x, sigma, i + 1);
+
+                float* vec_d        = (float*)d->data;
+                float* vec_old_d    = (float*)old_d->data;
+                float* vec_x        = (float*)x->data;
+                float* vec_denoised = (float*)denoised->data;
+                const int64_t n     = ggml_nelements(x);
+
+                // d = (x - denoised) / sigma
+                for (int64_t j = 0; j < n; j++) {
+                    vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
+                }
+
+                if (sigma_next == 0) {
+                    // Denoising step: keep the model prediction as-is (no gradient-estimation correction)
+                    for (int64_t j = 0; j < n; j++) {
+                        vec_x[j] = vec_denoised[j];
+                    }
+                } else {
+                    // Euler method
+                    for (int64_t j = 0; j < n; j++) {
+                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
+                    }
+                    if (i > 0) {
+                        // Gradient estimation (old_d is only valid after the first step)
+                        for (int64_t j = 0; j < n; j++) {
+                            float d_bar = (ge_gamma - 1.f) * (vec_d[j] - vec_old_d[j]);
+                            vec_x[j]    = vec_x[j] + d_bar * dt;
+                        }
+                    }
+                }
+
+                // old_d = d
+                copy_ggml_tensor(old_d, d);
+            }
+        } break;
         default:
             LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
             abort();
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -679,7 +679,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "eta in DDIM, only for DDIM and TCD (default: 0)",
          &params.sample_params.eta},
         {"",
-         "--high-noise-cfg-scale",
+         "--ge-gamma", "gamma for the gradient estimation sampler (default: 2.0)", &params.sample_params.ge_gamma},
+        {"", "--high-noise-cfg-scale",
          "(high noise) unconditional guidance scale: (default: 7.0)",
          &params.high_noise_sample_params.guidance.txt_cfg},
         {"",
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -59,6 +59,7 @@ const char* sampling_methods_str[] = {
     "DDIM \"trailing\"",
     "TCD",
     "Euler A",
+    "Gradient Estimation",
 };
 
 /*================================================== Helper Functions ================================================*/
@@ -1124,6 +1125,7 @@ class StableDiffusionGGML {
                         float eta,
                         int shifted_timestep,
                         sample_method_t method,
+                        float ge_gamma,
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
@@ -1354,7 +1356,7 @@ class StableDiffusionGGML {
             return denoised;
         };
 
-        sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta);
+        sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta, ge_gamma);
 
         if (inverse_noise_scaling) {
             x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
@@ -1725,6 +1727,7 @@ const char* sample_method_to_str[] = {
     "ddim_trailing",
     "tcd",
     "euler_a",
+    "gradient_estimation",
 };
 
 const char* sd_sample_method_name(enum sample_method_t sample_method) {
@@ -1896,6 +1899,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
     sample_params->scheduler                   = DEFAULT;
     sample_params->sample_method               = SAMPLE_METHOD_DEFAULT;
     sample_params->sample_steps                = 20;
+    sample_params->ge_gamma                    = 2.0f;
 }
 
 char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
@@ -1916,6 +1920,7 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
              "sample_method: %s, "
              "sample_steps: %d, "
              "eta: %.2f, "
+             "ge_gamma: %.2f, "
              "shifted_timestep: %d)",
              sample_params->guidance.txt_cfg,
              isfinite(sample_params->guidance.img_cfg)
@@ -1930,6 +1935,7 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
              sd_sample_method_name(sample_params->sample_method),
              sample_params->sample_steps,
              sample_params->eta,
+             sample_params->ge_gamma,
              sample_params->shifted_timestep);
 
     return buf;
@@ -2065,6 +2071,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     int width,
                                     int height,
                                     enum sample_method_t sample_method,
+                                    float ge_gamma,
                                     const std::vector<float>& sigmas,
                                     int64_t seed,
                                     int batch_count,
@@ -2353,6 +2360,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                                      eta,
                                                      shifted_timestep,
                                                      sample_method,
+                                                     ge_gamma,
                                                      sigmas,
                                                      start_merge_step,
                                                      id_cond,
@@ -2692,6 +2700,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         width,
                                                         height,
                                                         sample_method,
+                                                        sd_img_gen_params->sample_params.ge_gamma,
                                                         sigmas,
                                                         seed,
                                                         sd_img_gen_params->batch_count,
@@ -3019,6 +3028,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                  sd_vid_gen_params->high_noise_sample_params.eta,
                                  sd_vid_gen_params->high_noise_sample_params.shifted_timestep,
                                  sd_vid_gen_params->high_noise_sample_params.sample_method,
+                                 sd_vid_gen_params->high_noise_sample_params.ge_gamma,
                                  high_noise_sigmas,
                                  -1,
                                  {},
@@ -3055,6 +3065,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                           sd_vid_gen_params->sample_params.eta,
                                           sd_vid_gen_params->sample_params.shifted_timestep,
                                           sd_vid_gen_params->sample_params.sample_method,
+                                          sd_vid_gen_params->sample_params.ge_gamma,
                                           sigmas,
                                           -1,
                                           {},
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -48,6 +48,7 @@ enum sample_method_t {
     DDIM_TRAILING,
     TCD,
     EULER_A,
+    GRADIENT_ESTIMATION,
     SAMPLE_METHOD_COUNT
 };
 
@@ -199,6 +200,7 @@ typedef struct {
     enum sample_method_t sample_method;
     int sample_steps;
     float eta;
+    float ge_gamma;
     int shifted_timestep;
 } sd_sample_params_t;