add skip layer guidance support (mmdit only)

stduhpf · stduhpf · commit 6e42ffacf2a2 · 2024-11-04T18:42:51.000+01:00
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
@@ -17,7 +17,8 @@ struct DiffusionModel {
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength                    = 0.f,
                          struct ggml_tensor** output               = NULL,
-                         struct ggml_context* output_ctx           = NULL)                        = 0;
+                         struct ggml_context* output_ctx           = NULL,
+                         std::vector<int> skip_layers              = std::vector<int>())             = 0;
     virtual void alloc_params_buffer()                                                  = 0;
     virtual void free_params_buffer()                                                   = 0;
     virtual void free_compute_buffer()                                                  = 0;
@@ -70,7 +71,8 @@ struct UNetModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
                  struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL) {
+                 struct ggml_context* output_ctx           = NULL,
+                 std::vector<int> skip_layers              = std::vector<int>()) {
         return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
     }
 };
@@ -119,8 +121,9 @@ struct MMDiTModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
                  struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL) {
-        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx);
+                 struct ggml_context* output_ctx           = NULL,
+                 std::vector<int> skip_layers              = std::vector<int>()) {
+        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
     }
 };
 
@@ -168,7 +171,8 @@ struct FluxModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
                  struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL) {
+                 struct ggml_context* output_ctx           = NULL,
+                 std::vector<int> skip_layers              = std::vector<int>()) {
         return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx);
     }
 };
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -119,6 +119,11 @@ struct SDParams {
     bool canny_preprocess         = false;
     bool color                    = false;
     int upscale_repeats           = 1;
+
+    std::vector<int> skip_layers = {7, 8, 9};
+    float slg_scale              = 2.5;
+    float skip_layer_start       = 0.01;
+    float skip_layer_end         = 0.2;
 };
 
 void print_params(SDParams params) {
@@ -197,6 +202,11 @@ void print_usage(int argc, const char* argv[]) {
     printf("  -p, --prompt [PROMPT]              the prompt to render\n");
     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
     printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
+    printf("  --slg                              enable skip layer guidance (CFG variant)\n");  // flag spellings below must match parse_args()
+    printf("  --skip-layers LAYERS               Layers to skip for skip layer CFG (requires --slg): (default: [7,8,9])\n");
+    printf("  --slg-scale SCALE                  skip layer guidance scale (requires --slg): (default: 2.5)\n");
+    printf("  --skip-layer-start START           skip layer enabling point (* steps) (requires --slg): (default: 0.01)\n");
+    printf("  --skip-layer-end END               skip layer disabling point (* steps) (requires --slg): (default: 0.2)\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
     printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%%)\n");
     printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
@@ -223,6 +233,7 @@ void print_usage(int argc, const char* argv[]) {
 
 void parse_args(int argc, const char** argv, SDParams& params) {
     bool invalid_arg = false;
+    bool cfg_skip    = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -534,6 +545,63 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             params.verbose = true;
         } else if (arg == "--color") {
             params.color = true;
+        } else if (arg == "--slg") {
+            cfg_skip = true;
+        } else if (arg == "--skip-layers") {
+            // LAYERS is a bracketed list such as "[7,8,9]"; an unquoted "[7, 8, 9]" split over several argv entries is re-joined below
+            if (++i >= argc || argv[i][0] != '[') {
+                invalid_arg = true;
+                break;
+            }
+            std::string layers_str = argv[i];
+            while (layers_str.back() != ']') {
+                if (++i >= argc) {
+                    invalid_arg = true;  // ran out of arguments before the closing ']'
+                    break;
+                }
+                layers_str += " " + std::string(argv[i]);
+            }
+            if (invalid_arg) {  // bail out before parsing an unterminated list
+                break;
+            }
+            layers_str = layers_str.substr(1, layers_str.size() - 2);  // strip the surrounding '[' and ']'
+
+            std::regex regex("[, ]+");
+            std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
+            std::sregex_token_iterator end;
+            std::vector<int> layers;
+            for (; iter != end && !invalid_arg; ++iter) {
+                if (iter->length() == 0) {
+                    continue;  // empty token from a leading separator ("[ 7, 8 ]") or an empty list ("[]")
+                }
+                try {
+                    layers.push_back(std::stoi(iter->str()));
+                } catch (const std::logic_error&) {  // std::invalid_argument or std::out_of_range
+                    invalid_arg = true;
+                }
+            }
+            if (invalid_arg) {
+                break;
+            }
+            params.skip_layers = layers;  // only stored once the whole list parsed cleanly
+        } else if (arg == "--slg-scale") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.slg_scale = std::stof(argv[i]);
+        } else if (arg == "--skip-layer-start") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.skip_layer_start = std::stof(argv[i]);
+        } else if (arg == "--skip-layer-end") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.skip_layer_end = std::stof(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -549,6 +617,11 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         params.n_threads = get_num_physical_cores();
     }
 
+    if (!cfg_skip) {
+        // skip layer guidance is opt-in: without --slg, discard the layer list (defaults included) so an empty list is passed on
+        params.skip_layers.clear();
+    }
+
     if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: prompt\n");
         print_usage(argc, argv);
@@ -840,7 +913,11 @@ int main(int argc, const char* argv[]) {
                           params.control_strength,
                           params.style_ratio,
                           params.normalize_input,
-                          params.input_id_images_path.c_str());
+                          params.input_id_images_path.c_str(),
+                          params.skip_layers,
+                          params.slg_scale,
+                          params.skip_layer_start,
+                          params.skip_layer_end);
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
diff --git a/mmdit.hpp b/mmdit.hpp
@@ -801,14 +801,20 @@ struct MMDiT : public GGMLBlock {
     struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
                                                  struct ggml_tensor* x,
                                                  struct ggml_tensor* c_mod,
-                                                 struct ggml_tensor* context) {
+                                                 struct ggml_tensor* context,
+                                                 std::vector<int> skip_layers = std::vector<int>()) {
         // x: [N, H*W, hidden_size]
         // context: [N, n_context, d_context]
         // c: [N, hidden_size]
         // return: [N, N*W, patch_size * patch_size * out_channels]
         auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
 
         for (int i = 0; i < depth; i++) {
+            // skip iteration if i is in skip_layers
+            if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
+                continue;
+            }
+
             auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);
 
             auto context_x = block->forward(ctx, context, x, c_mod);
@@ -824,8 +830,9 @@ struct MMDiT : public GGMLBlock {
     struct ggml_tensor* forward(struct ggml_context* ctx,
                                 struct ggml_tensor* x,
                                 struct ggml_tensor* t,
-                                struct ggml_tensor* y       = NULL,
-                                struct ggml_tensor* context = NULL) {
+                                struct ggml_tensor* y        = NULL,
+                                struct ggml_tensor* context  = NULL,
+                                std::vector<int> skip_layers = std::vector<int>()) {
         // Forward pass of DiT.
         // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
         // t: (N,) tensor of diffusion timesteps
@@ -856,7 +863,7 @@ struct MMDiT : public GGMLBlock {
             context = context_embedder->forward(ctx, context);  // [N, L, D] aka [N, L, 1536]
         }
 
-        x = forward_core_with_concat(ctx, x, c, context);  // (N, H*W, patch_size ** 2 * out_channels)
+        x = forward_core_with_concat(ctx, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)
 
         x = unpatchify(ctx, x, h, w);  // [N, C, H, W]
 
@@ -885,7 +892,8 @@ struct MMDiTRunner : public GGMLRunner {
     struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                     struct ggml_tensor* timesteps,
                                     struct ggml_tensor* context,
-                                    struct ggml_tensor* y) {
+                                    struct ggml_tensor* y,
+                                    std::vector<int> skip_layers = std::vector<int>()) {
         struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
 
         x         = to_backend(x);
@@ -897,7 +905,8 @@ struct MMDiTRunner : public GGMLRunner {
                                                 x,
                                                 timesteps,
                                                 y,
-                                                context);
+                                                context,
+                                                skip_layers);
 
         ggml_build_forward_expand(gf, out);
 
@@ -910,13 +919,14 @@ struct MMDiTRunner : public GGMLRunner {
                  struct ggml_tensor* context,
                  struct ggml_tensor* y,
                  struct ggml_tensor** output     = NULL,
-                 struct ggml_context* output_ctx = NULL) {
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
         // x: [N, in_channels, h, w]
         // timesteps: [N, ]
         // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
         // y: [N, adm_in_channels] or [1, adm_in_channels]
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(x, timesteps, context, y);
+            return build_graph(x, timesteps, context, y, skip_layers);
         };
 
         GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -772,7 +772,11 @@ class StableDiffusionGGML {
                         sample_method_t method,
                         const std::vector<float>& sigmas,
                         int start_merge_step,
-                        SDCondition id_cond) {
+                        SDCondition id_cond,
+                        std::vector<int> skip_layers                                    = {},
+                        float slg_scale                                                 = 2.5,
+                        float skip_layer_start                                          = 0.01,
+                        float skip_layer_end                                            = 0.2) {
         size_t steps = sigmas.size() - 1;
         // noise = load_tensor_from_file(work_ctx, "./rand0.bin");
         // print_ggml_tensor(noise);
@@ -870,6 +874,30 @@ class StableDiffusionGGML {
                                          &out_uncond);
                 negative_data = (float*)out_uncond->data;
             }
+
+            bool has_skiplayer     = skip_layers.size() > 0;  // an empty list turns skip layer guidance off
+            int stepCount          = sigmas.size();  // number of sigmas, i.e. `steps` + 1
+            has_skiplayer          = has_skiplayer && step > (int)(skip_layer_start * stepCount) && step < (int)(skip_layer_end * stepCount);  // both bounds are exclusive and truncated to int
+            float* skip_layer_data = NULL;
+            if (has_skiplayer) {
+                LOG_DEBUG("Skipping layers at step %d\n", step);
+                ggml_tensor* out_skip = ggml_dup_tensor(work_ctx, x);
+                // extra pass using the `cond` inputs, with skip_layers forwarded to the model
+                diffusion_model->compute(n_threads,
+                                         noised_input,
+                                         timesteps,
+                                         cond.c_crossattn,
+                                         cond.c_concat,
+                                         cond.c_vector,
+                                         guidance_tensor,
+                                         -1,
+                                         controls,
+                                         control_strength,
+                                         &out_skip,
+                                         NULL,  // output_ctx
+                                         skip_layers);
+                skip_layer_data = (float*)out_skip->data;
+            }
             float* vec_denoised  = (float*)denoised->data;
             float* vec_input     = (float*)input->data;
             float* positive_data = (float*)out_cond->data;
@@ -886,6 +914,9 @@ class StableDiffusionGGML {
                         latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
                     }
                 }
+                if (has_skiplayer) {
+                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
+                }
                 // v = latent_result, eps = latent_result
                 // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
                 vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
@@ -1112,7 +1143,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float control_strength,
                            float style_ratio,
                            bool normalize_input,
-                           std::string input_id_images_path) {
+                           std::string input_id_images_path,
+                           std::vector<int> skip_layers                                    = {},
+                           float slg_scale                                                 = 2.5,
+                           float skip_layer_start                                          = 0.01,
+                           float skip_layer_end                                            = 0.2) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1321,7 +1356,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      sample_method,
                                                      sigmas,
                                                      start_merge_step,
-                                                     id_cond);
+                                                     id_cond,
+                                                     skip_layers,
+                                                     slg_scale,
+                                                     skip_layer_start,
+                                                     skip_layer_end);
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
         int64_t sampling_end = ggml_time_ms();
@@ -1387,7 +1426,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                     float control_strength,
                     float style_ratio,
                     bool normalize_input,
-                    const char* input_id_images_path_c_str) {
+                    const char* input_id_images_path_c_str,
+                    std::vector<int> skip_layers,
+                    float slg_scale,
+                    float skip_layer_start,
+                    float skip_layer_end) {
     LOG_DEBUG("txt2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1455,7 +1498,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                                                control_strength,
                                                style_ratio,
                                                normalize_input,
-                                               input_id_images_path_c_str);
+                                               input_id_images_path_c_str,
+                                               skip_layers,
+                                               slg_scale,
+                                               skip_layer_start,
+                                               skip_layer_end);
 
     size_t t1 = ggml_time_ms();
 
@@ -1482,7 +1529,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                     float control_strength,
                     float style_ratio,
                     bool normalize_input,
-                    const char* input_id_images_path_c_str) {
+                    const char* input_id_images_path_c_str,
+                    std::vector<int> skip_layers,
+                    float slg_scale,
+                    float skip_layer_start,
+                    float skip_layer_end) {
     LOG_DEBUG("img2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1556,7 +1607,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                control_strength,
                                                style_ratio,
                                                normalize_input,
-                                               input_id_images_path_c_str);
+                                               input_id_images_path_c_str,
+                                               skip_layers,
+                                               slg_scale,
+                                               skip_layer_start,
+                                               skip_layer_end);
 
     size_t t2 = ggml_time_ms();
 
diff --git a/stable-diffusion.h b/stable-diffusion.h