Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ arguments:
-i, --end-img [IMAGE] path to the end image, required by flf2v
--control-image [IMAGE] path to image condition, control net
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
Expand Down
7 changes: 6 additions & 1 deletion diffusion_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ struct DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
Expand Down Expand Up @@ -77,6 +78,7 @@ struct UNetModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
Expand Down Expand Up @@ -133,6 +135,7 @@ struct MMDiTModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
Expand Down Expand Up @@ -191,13 +194,14 @@ struct FluxModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, output, output_ctx, skip_layers);
}
};

Expand Down Expand Up @@ -250,6 +254,7 @@ struct WanModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
Expand Down
5 changes: 5 additions & 0 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ struct SDParams {
std::string mask_image_path;
std::string control_image_path;
std::vector<std::string> ref_image_paths;
bool increase_ref_index = false;

std::string prompt;
std::string negative_prompt;
Expand Down Expand Up @@ -156,6 +157,7 @@ void print_params(SDParams params) {
for (auto& path : params.ref_image_paths) {
printf(" %s\n", path.c_str());
};
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false");
Expand Down Expand Up @@ -222,6 +224,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
Expand Down Expand Up @@ -536,6 +539,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--color", "", true, &params.color},
{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
{"", "--increase-ref-index", "", true, &params.increase_ref_index},
};

auto on_mode_arg = [&](int argc, const char** argv, int index) {
Expand Down Expand Up @@ -1207,6 +1211,7 @@ int main(int argc, const char* argv[]) {
init_image,
ref_images.data(),
(int)ref_images.size(),
params.increase_ref_index,
mask_image,
params.width,
params.height,
Expand Down
7 changes: 5 additions & 2 deletions flux.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,7 @@ namespace Flux {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
std::vector<int> skip_layers = {}) {
GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
Expand Down Expand Up @@ -999,6 +1000,7 @@ namespace Flux {
x->ne[3],
context->ne[1],
ref_latents,
increase_ref_index,
flux_params.theta,
flux_params.axes_dim);
int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
Expand Down Expand Up @@ -1035,6 +1037,7 @@ namespace Flux {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
Expand All @@ -1044,7 +1047,7 @@ namespace Flux {
// y: [N, adm_in_channels] or [1, adm_in_channels]
// guidance: [N, ]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers);
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
};

GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
Expand Down Expand Up @@ -1084,7 +1087,7 @@ namespace Flux {
struct ggml_tensor* out = NULL;

int t0 = ggml_time_ms();
compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx);
compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx);
int t1 = ggml_time_ms();

print_ggml_tensor(out);
Expand Down
1 change: 1 addition & 0 deletions lora.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ struct LoraModel : public GGMLRunner {
{"x_block.attn.proj", "attn.to_out.0"},
{"x_block.attn2.proj", "attn2.to_out.0"},
// flux
{"img_in", "x_embedder"},
// singlestream
{"linear2", "proj_out"},
{"modulation.lin", "norm.linear"},
Expand Down
23 changes: 16 additions & 7 deletions rope.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,25 +156,33 @@ struct Rope {
int patch_size,
int bs,
int context_len,
std::vector<ggml_tensor*> ref_latents) {
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index) {
auto txt_ids = gen_txt_ids(bs, context_len);
auto img_ids = gen_img_ids(h, w, patch_size, bs);

auto ids = concat_ids(txt_ids, img_ids, bs);
uint64_t curr_h_offset = 0;
uint64_t curr_w_offset = 0;
int index = 1;
for (ggml_tensor* ref : ref_latents) {
uint64_t h_offset = 0;
uint64_t w_offset = 0;
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset;
} else {
h_offset = curr_h_offset;
if (!increase_ref_index) {
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset;
} else {
h_offset = curr_h_offset;
}
}

auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);
ids = concat_ids(ids, ref_ids, bs);

if (increase_ref_index) {
index++;
}

curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
}
Expand All @@ -188,9 +196,10 @@ struct Rope {
int bs,
int context_len,
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents);
std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}

Expand Down
15 changes: 13 additions & 2 deletions stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ class StableDiffusionGGML {

int64_t t0 = ggml_time_ms();
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, false, -1, {}, 0.f, &out);
diffusion_model->free_compute_buffer();

double result = 0.f;
Expand Down Expand Up @@ -1032,6 +1032,7 @@ class StableDiffusionGGML {
int start_merge_step,
SDCondition id_cond,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
ggml_tensor* denoise_mask = nullptr) {
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);

Expand Down Expand Up @@ -1126,6 +1127,7 @@ class StableDiffusionGGML {
cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
Expand All @@ -1139,6 +1141,7 @@ class StableDiffusionGGML {
id_cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
Expand All @@ -1160,6 +1163,7 @@ class StableDiffusionGGML {
uncond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
Expand All @@ -1177,6 +1181,7 @@ class StableDiffusionGGML {
img_cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
Expand All @@ -1198,6 +1203,7 @@ class StableDiffusionGGML {
cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
Expand Down Expand Up @@ -1710,6 +1716,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"\n"
"batch_count: %d\n"
"ref_images_count: %d\n"
"increase_ref_index: %s\n"
"control_strength: %.2f\n"
"style_strength: %.2f\n"
"normalize_input: %s\n"
Expand All @@ -1724,6 +1731,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->seed,
sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->increase_ref_index),
sd_img_gen_params->control_strength,
sd_img_gen_params->style_strength,
BOOL_STR(sd_img_gen_params->normalize_input),
Expand Down Expand Up @@ -1797,6 +1805,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
bool normalize_input,
std::string input_id_images_path,
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index,
ggml_tensor* concat_latent = NULL,
ggml_tensor* denoise_mask = NULL) {
if (seed < 0) {
Expand Down Expand Up @@ -2054,6 +2063,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
start_merge_step,
id_cond,
ref_latents,
increase_ref_index,
denoise_mask);
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
Expand Down Expand Up @@ -2304,7 +2314,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
LOG_INFO("EDIT mode");
}

std::vector<struct ggml_tensor*> ref_latents;
std::vector<ggml_tensor*> ref_latents;
for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
GGML_TYPE_F32,
Expand Down Expand Up @@ -2359,6 +2369,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_img_gen_params->normalize_input,
sd_img_gen_params->input_id_images_path,
ref_latents,
sd_img_gen_params->increase_ref_index,
concat_latent,
denoise_mask);

Expand Down
1 change: 1 addition & 0 deletions stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ typedef struct {
sd_image_t init_image;
sd_image_t* ref_images;
int ref_images_count;
bool increase_ref_index;
sd_image_t mask_image;
int width;
int height;
Expand Down
Loading