diff --git a/README.md b/README.md index a5b181aad..c69335b24 100644 --- a/README.md +++ b/README.md @@ -319,6 +319,7 @@ arguments: -i, --end-img [IMAGE] path to the end image, required by flf2v --control-image [IMAGE] path to image condition, control net -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) + --increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1). -o, --output OUTPUT path to write result image to (default: ./output.png) -p, --prompt [PROMPT] the prompt to render -n, --negative-prompt PROMPT the negative prompt (default: "") diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 312266e8b..89f31b13e 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -16,6 +16,7 @@ struct DiffusionModel { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, + bool increase_ref_index = false, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -77,6 +78,7 @@ struct UNetModel : public DiffusionModel { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, + bool increase_ref_index = false, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -133,6 +135,7 @@ struct MMDiTModel : public DiffusionModel { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, + bool increase_ref_index = false, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -191,13 +194,14 @@ struct FluxModel : public DiffusionModel { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, + bool increase_ref_index = false, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, struct ggml_tensor** output = NULL, struct ggml_context* output_ctx = NULL, std::vector skip_layers = std::vector()) { - return flux.compute(n_threads, x, 
timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers); + return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, output, output_ctx, skip_layers); } }; @@ -250,6 +254,7 @@ struct WanModel : public DiffusionModel { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, + bool increase_ref_index = false, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 94cdbdb91..8dd29051d 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -74,6 +74,7 @@ struct SDParams { std::string mask_image_path; std::string control_image_path; std::vector ref_image_paths; + bool increase_ref_index = false; std::string prompt; std::string negative_prompt; @@ -156,6 +157,7 @@ void print_params(SDParams params) { for (auto& path : params.ref_image_paths) { printf(" %s\n", path.c_str()); }; + printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" control_net_cpu: %s\n", params.control_net_cpu ? 
"true" : "false"); @@ -222,6 +224,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n"); printf(" --control-image [IMAGE] path to image condition, control net\n"); printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); + printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n"); printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n"); printf(" -p, --prompt [PROMPT] the prompt to render\n"); printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); @@ -536,6 +539,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--color", "", true, &params.color}, {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask}, {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask}, + {"", "--increase-ref-index", "", true, &params.increase_ref_index}, }; auto on_mode_arg = [&](int argc, const char** argv, int index) { @@ -1207,6 +1211,7 @@ int main(int argc, const char* argv[]) { init_image, ref_images.data(), (int)ref_images.size(), + params.increase_ref_index, mask_image, params.width, params.height, diff --git a/flux.hpp b/flux.hpp index ae0cd3755..9d910051c 100644 --- a/flux.hpp +++ b/flux.hpp @@ -960,6 +960,7 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, + bool increase_ref_index = false, std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); @@ -999,6 +1000,7 @@ namespace Flux { x->ne[3], context->ne[1], ref_latents, + increase_ref_index, flux_params.theta, flux_params.axes_dim); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; @@ -1035,6 +1037,7 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, 
std::vector ref_latents = {}, + bool increase_ref_index = false, struct ggml_tensor** output = NULL, struct ggml_context* output_ctx = NULL, std::vector skip_layers = std::vector()) { @@ -1044,7 +1047,7 @@ namespace Flux { // y: [N, adm_in_channels] or [1, adm_in_channels] // guidance: [N, ] auto get_graph = [&]() -> struct ggml_cgraph* { - return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers); + return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers); }; GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); @@ -1084,7 +1087,7 @@ namespace Flux { struct ggml_tensor* out = NULL; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx); + compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); diff --git a/lora.hpp b/lora.hpp index 8d4b5cf73..b7a27306c 100644 --- a/lora.hpp +++ b/lora.hpp @@ -58,6 +58,7 @@ struct LoraModel : public GGMLRunner { {"x_block.attn.proj", "attn.to_out.0"}, {"x_block.attn2.proj", "attn2.to_out.0"}, // flux + {"img_in", "x_embedder"}, // singlestream {"linear2", "proj_out"}, {"modulation.lin", "norm.linear"}, diff --git a/rope.hpp b/rope.hpp index 15889304a..bde075a02 100644 --- a/rope.hpp +++ b/rope.hpp @@ -156,25 +156,33 @@ struct Rope { int patch_size, int bs, int context_len, - std::vector ref_latents) { + std::vector ref_latents, + bool increase_ref_index) { auto txt_ids = gen_txt_ids(bs, context_len); auto img_ids = gen_img_ids(h, w, patch_size, bs); auto ids = concat_ids(txt_ids, img_ids, bs); uint64_t curr_h_offset = 0; uint64_t curr_w_offset = 0; + int index = 1; for (ggml_tensor* ref : ref_latents) { uint64_t h_offset = 0; uint64_t w_offset = 0; - if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) { - w_offset = curr_w_offset; - } else { - h_offset = curr_h_offset; + if (!increase_ref_index) { + if 
(ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) { + w_offset = curr_w_offset; + } else { + h_offset = curr_h_offset; + } } - auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset); + auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset); ids = concat_ids(ids, ref_ids, bs); + if (increase_ref_index) { + index++; + } + curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset); curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset); } @@ -188,9 +196,10 @@ struct Rope { int bs, int context_len, std::vector ref_latents, + bool increase_ref_index, int theta, const std::vector& axes_dim) { - std::vector> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents); + std::vector> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); return embed_nd(ids, bs, theta, axes_dim); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 64164a2fb..31d092958 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -775,7 +775,7 @@ class StableDiffusionGGML { int64_t t0 = ggml_time_ms(); struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); - diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out); + diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, false, -1, {}, 0.f, &out); diffusion_model->free_compute_buffer(); double result = 0.f; @@ -1032,6 +1032,7 @@ class StableDiffusionGGML { int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, + bool increase_ref_index = false, ggml_tensor* denoise_mask = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); @@ -1126,6 +1127,7 @@ class StableDiffusionGGML { cond.c_vector, guidance_tensor, ref_latents, + increase_ref_index, -1, controls, control_strength, @@ -1139,6 +1141,7 @@ class StableDiffusionGGML { id_cond.c_vector, guidance_tensor, 
ref_latents, + increase_ref_index, -1, controls, control_strength, @@ -1160,6 +1163,7 @@ class StableDiffusionGGML { uncond.c_vector, guidance_tensor, ref_latents, + increase_ref_index, -1, controls, control_strength, @@ -1177,6 +1181,7 @@ class StableDiffusionGGML { img_cond.c_vector, guidance_tensor, ref_latents, + increase_ref_index, -1, controls, control_strength, @@ -1198,6 +1203,7 @@ class StableDiffusionGGML { cond.c_vector, guidance_tensor, ref_latents, + increase_ref_index, -1, controls, control_strength, @@ -1710,6 +1716,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { "\n" "batch_count: %d\n" "ref_images_count: %d\n" + "increase_ref_index: %s\n" "control_strength: %.2f\n" "style_strength: %.2f\n" "normalize_input: %s\n" @@ -1724,6 +1731,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->seed, sd_img_gen_params->batch_count, sd_img_gen_params->ref_images_count, + BOOL_STR(sd_img_gen_params->increase_ref_index), sd_img_gen_params->control_strength, sd_img_gen_params->style_strength, BOOL_STR(sd_img_gen_params->normalize_input), @@ -1797,6 +1805,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, bool normalize_input, std::string input_id_images_path, std::vector ref_latents, + bool increase_ref_index, ggml_tensor* concat_latent = NULL, ggml_tensor* denoise_mask = NULL) { if (seed < 0) { @@ -2054,6 +2063,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, + increase_ref_index, denoise_mask); // print_ggml_tensor(x_0); int64_t sampling_end = ggml_time_ms(); @@ -2304,7 +2314,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g LOG_INFO("EDIT mode"); } - std::vector ref_latents; + std::vector ref_latents; for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, @@ -2359,6 +2369,7 @@ sd_image_t* generate_image(sd_ctx_t* 
sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->normalize_input, sd_img_gen_params->input_id_images_path, ref_latents, + sd_img_gen_params->increase_ref_index, concat_latent, denoise_mask); diff --git a/stable-diffusion.h b/stable-diffusion.h index 7bfb52700..34b0d1492 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -182,6 +182,7 @@ typedef struct { sd_image_t init_image; sd_image_t* ref_images; int ref_images_count; + bool increase_ref_index; sd_image_t mask_image; int width; int height;