diff --git a/examples/cli/README.md b/examples/cli/README.md index 6e8ddd48..a4e8d52c 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -35,8 +35,9 @@ arguments: -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) --control-video [PATH] path to control video frames, It must be a directory path. The video frames inside should be stored as images in lexicographical (character) order - For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, 鈥?etc. + For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc. --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). + --disable-auto-resize-ref-image disable auto resize of ref images -o, --output OUTPUT path to write result image to (default: ./output.png) -p, --prompt [PROMPT] the prompt to render -n, --negative-prompt PROMPT the negative prompt (default: "") diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ff36cea2..a64172be 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -80,7 +80,8 @@ struct SDParams { std::string control_image_path; std::vector<std::string> ref_image_paths; std::string control_video_path; - bool increase_ref_index = false; + bool auto_resize_ref_image = true; + bool increase_ref_index = false; std::string prompt; std::string negative_prompt; @@ -175,6 +176,7 @@ void print_params(SDParams params) { printf(" %s\n", path.c_str()); }; printf(" control_video_path: %s\n", params.control_video_path.c_str()); + printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? 
"true" : "false"); @@ -244,9 +246,10 @@ void print_usage(int argc, const char* argv[]) { printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n"); printf(" --control-image [IMAGE] path to image condition, control net\n"); printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); + printf(" --disable-auto-resize-ref-image disable auto resize of ref images\n"); printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n"); printf(" The video frames inside should be stored as images in lexicographical (character) order\n"); - printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n"); + printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc.\n"); printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n"); printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n"); printf(" -p, --prompt [PROMPT] the prompt to render\n"); @@ -579,6 +582,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask}, {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask}, {"", "--increase-ref-index", "", true, &params.increase_ref_index}, + {"", "--disable-auto-resize-ref-image", "", false, &params.auto_resize_ref_image}, }; auto on_mode_arg = [&](int argc, const char** argv, int index) { @@ -1428,6 +1432,7 @@ int main(int argc, const char* argv[]) { init_image, ref_images.data(), (int)ref_images.size(), + params.auto_resize_ref_image, params.increase_ref_index, mask_image, params.width, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 87b6a377..a520522d 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1981,6 +1981,7 @@ char* 
sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { "seed: %" PRId64 "batch_count: %d\n" "ref_images_count: %d\n" + "auto_resize_ref_image: %s\n" "increase_ref_index: %s\n" "control_strength: %.2f\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" @@ -1995,6 +1996,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->seed, sd_img_gen_params->batch_count, sd_img_gen_params->ref_images_count, + BOOL_STR(sd_img_gen_params->auto_resize_ref_image), BOOL_STR(sd_img_gen_params->increase_ref_index), sd_img_gen_params->control_strength, sd_img_gen_params->pm_params.style_strength, @@ -2635,14 +2637,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g std::vector<ggml_tensor*> ref_latents; for (int i = 0; i < ref_images.size(); i++) { ggml_tensor* img; - if (sd_version_is_qwen_image(sd_ctx->sd->version)) { + if (sd_img_gen_params->auto_resize_ref_image) { + LOG_DEBUG("auto resize ref images"); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_height = vae_width * ref_image.height / ref_image.width; - vae_height = round(vae_height / 32) * 32; - vae_width = round(vae_width / 32) * 32; + int factor = 16; + if (sd_version_is_qwen_image(sd_ctx->sd->version)) { + factor = 32; + } + + vae_height = round(vae_height / factor) * factor; + vae_width = round(vae_width / factor) * factor; sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height)); free(ref_image.data); diff --git a/stable-diffusion.h b/stable-diffusion.h index a891a58f..f618d457 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -216,6 +216,7 @@ typedef struct { sd_image_t init_image; sd_image_t* ref_images; int ref_images_count; + bool auto_resize_ref_image; bool 
increase_ref_index; sd_image_t mask_image; int width;