README.md: 1 addition & 1 deletion
@@ -17,7 +17,6 @@ API and command-line option may change frequently.***
- Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- [SD3/SD3.5](./docs/sd3.md)
- [Flux-dev/Flux-schnell](./docs/flux.md)
- [Chroma](./docs/chroma.md)
@@ -364,6 +363,7 @@ arguments:
--vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
--vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
--vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
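For context, the new flag slots into an ordinary SDXL run. A hypothetical invocation (the model filename, prompt, and output path are placeholders, and -m/-p/-o are assumed from the CLI's usual options; only --force-sdxl-vae-conv-scale and --vae-tiling appear in this diff):

    sd -m sd_xl_base_1.0.safetensors -p "a lighthouse at dawn" \
       --vae-tiling --force-sdxl-vae-conv-scale -o output.png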
conditioner.hpp: 1 addition & 1 deletion
@@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
const ConditionerParams& conditioner_params) {
std::string prompt;
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
size_t system_prompt_length = 0;
size_t system_prompt_length = 0;
int prompt_template_encode_start_idx = 34;
if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
LOG_INFO("QwenImageEditPlusPipeline");
examples/cli/main.cpp: 5 additions & 0 deletions
@@ -128,6 +128,7 @@ struct SDParams {
float flow_shift = INFINITY;

sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
bool force_sdxl_vae_conv_scale = false;

SDParams() {
sd_sample_params_init(&sample_params);
@@ -194,6 +195,7 @@ void print_params(SDParams params) {
printf(" seed: %zd\n", params.seed);
printf(" batch_count: %d\n", params.batch_count);
printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false");
printf(" force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false");
printf(" upscale_repeats: %d\n", params.upscale_repeats);
printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
@@ -287,6 +289,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n");
printf(" --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
printf(" --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
printf(" --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae\n");
printf(" --vae-on-cpu keep vae in cpu (for low vram)\n");
printf(" --clip-on-cpu keep clip in cpu (for low vram)\n");
printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n");
@@ -557,6 +560,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {

options.bool_options = {
{"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
{"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
{"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
{"", "--control-net-cpu", "", true, &params.control_net_cpu},
{"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -1361,6 +1365,7 @@ int main(int argc, const char* argv[]) {
params.diffusion_flash_attn,
params.diffusion_conv_direct,
params.vae_conv_direct,
params.force_sdxl_vae_conv_scale,
params.chroma_use_dit_mask,
params.chroma_use_t5_mask,
params.chroma_t5_mask_pad,
ggml_extend.hpp: 36 additions & 34 deletions
@@ -975,38 +975,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
int s0 = 1,
int s1 = 1,
int p0 = 0,
int p1 = 0,
int d0 = 1,
int d1 = 1) {
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
if (b != NULL) {
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
// b = ggml_repeat(ctx, b, x);
x = ggml_add_inplace(ctx, x, b);
int s0 = 1,
int s1 = 1,
int p0 = 0,
int p1 = 0,
int d0 = 1,
int d1 = 1,
bool direct = false,
float scale = 1.f) {
if (scale != 1.f) {
x = ggml_scale(ctx, x, scale);
}
if (direct) {
x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
} else {
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
}
if (scale != 1.f) {
x = ggml_scale(ctx, x, 1.f / scale);
}
return x;
}

// w: [OC*IC, KD, KH, KW]
// x: [N*IC, ID, IH, IW]
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
int s0 = 1,
int s1 = 1,
int p0 = 0,
int p1 = 0,
int d0 = 1,
int d1 = 1) {
x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
if (b != NULL) {
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
// b = ggml_repeat(ctx, b, x);
x = ggml_add(ctx, x, b);
x = ggml_add_inplace(ctx, x, b);
}
return x;
}
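Why the scale parameter helps: ggml_conv_2d runs its im2col and matrix multiply in FP16, and convolution is linear in its input, so conv(s*x, w) = s*conv(x, w). Scaling the input by s < 1 before the convolution and by 1/s afterwards is therefore an identity in exact arithmetic, but it keeps the FP16 intermediates below the half-precision maximum of 65504; note the bias is added only after the result has been scaled back, which keeps the identity exact. A minimal standalone sketch of the numerics (hypothetical magnitudes, plain C++ rather than ggml):

    #include <cstdio>

    int main() {
        const float fp16_max = 65504.0f;  // largest finite half-precision value
        // Hypothetical accumulation for one conv output element:
        // (channels * kernel taps) products of activation * weight.
        float activation = 40.0f;
        float weight     = 8.0f;
        int   taps       = 512;
        float raw    = activation * weight * taps;             // 163840: overflows FP16
        float scale  = 1.0f / 32.0f;                           // value used for the SDXL VAE below
        float scaled = (activation * scale) * weight * taps;   // 5120: fits in FP16
        printf("raw=%.0f (%s)\n", raw, raw > fp16_max ? "overflows fp16" : "fits");
        printf("scaled=%.0f recovered=%.0f\n", scaled, scaled / scale);
        return 0;
    }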
@@ -2067,6 +2057,7 @@ class Conv2d : public UnaryBlock {
std::pair<int, int> dilation;
bool bias;
bool direct = false;
float scale = 1.f;

void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
enum ggml_type wtype = GGML_TYPE_F16;
@@ -2097,6 +2088,10 @@
direct = true;
}

void set_scale(float scale_value) {
scale = scale_value;
}

std::string get_desc() {
return "Conv2d";
}
@@ -2107,11 +2102,18 @@
if (bias) {
b = params["bias"];
}
if (direct) {
return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
} else {
return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
}
return ggml_nn_conv_2d(ctx,
x,
w,
b,
stride.second,
stride.first,
padding.second,
padding.first,
dilation.second,
dilation.first,
direct,
scale);
}
};

qwen_image.hpp: 1 addition & 1 deletion
@@ -535,7 +535,7 @@ namespace Qwen {
}
}
LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
qwen_image = QwenImageModel(qwen_image_params);
qwen_image = QwenImageModel(qwen_image_params);
qwen_image.init(params_ctx, tensor_types, prefix);
}

stable-diffusion.cpp: 9 additions & 7 deletions
@@ -330,13 +330,6 @@ class StableDiffusionGGML {

if (sd_version_is_sdxl(version)) {
scale_factor = 0.13025f;
if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) {
LOG_WARN(
"!!!It looks like you are using SDXL model. "
"If you find that the generated images are completely black, "
"try specifying SDXL VAE FP16 Fix with the --vae parameter. "
"You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
}
} else if (sd_version_is_sd3(version)) {
scale_factor = 1.5305f;
} else if (sd_version_is_flux(version)) {
@@ -517,6 +510,15 @@ class StableDiffusionGGML {
LOG_INFO("Using Conv2d direct in the vae model");
first_stage_model->enable_conv2d_direct();
}
if (version == VERSION_SDXL &&
(strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
float vae_conv_2d_scale = 1.f / 32.f;
LOG_WARN(
"No VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
"using Conv2D scale %.3f",
vae_conv_2d_scale);
first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
}
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
} else {
stable-diffusion.h: 1 addition & 0 deletions
@@ -153,6 +153,7 @@ typedef struct {
bool diffusion_flash_attn;
bool diffusion_conv_direct;
bool vae_conv_direct;
bool force_sdxl_vae_conv_scale;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
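On the library side, the new member travels with the rest of sd_ctx_params_t. A hedged sketch of how a consumer might opt in; sd_ctx_params_init, new_sd_ctx, and the model_path member are assumed from the surrounding API and are not part of this diff:

    // Hypothetical usage; only force_sdxl_vae_conv_scale is introduced by this change.
    sd_ctx_params_t ctx_params;
    sd_ctx_params_init(&ctx_params);                       // assumed to mirror sd_sample_params_init
    ctx_params.model_path = "sd_xl_base_1.0.safetensors";  // placeholder model
    ctx_params.force_sdxl_vae_conv_scale = true;           // apply the conv scale even with a custom VAE
    sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);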
vae.hpp: 12 additions & 0 deletions
@@ -530,6 +530,7 @@ struct VAE : public GGMLRunner {
struct ggml_context* output_ctx) = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
virtual void enable_conv2d_direct(){};
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
};

struct AutoEncoderKL : public VAE {
@@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE {
}
}

void set_conv2d_scale(float scale) {
std::vector<GGMLBlock*> blocks;
ae.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->set_scale(scale);
}
}
}

std::string get_desc() {
return "vae";
}