Skip to content

Commit 34a6fd4

Browse files
authored
feat: add z-image support (#1020)
* add z-image support
* use flux_latent_rgb_proj for z-image
* fix qwen3 rope type
* add support for qwen3 4b gguf
* add support for diffusers format lora
* fix nan issue that occurs when using CUDA with k-quants weights
* add z-image docs
1 parent 3c1187c commit 34a6fd4

20 files changed

+993
-24
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ API and command-line option may change frequently.***
4545
- [Chroma](./docs/chroma.md)
4646
- [Chroma1-Radiance](./docs/chroma_radiance.md)
4747
- [Qwen Image](./docs/qwen_image.md)
48+
- [Z-Image](./docs/z_image.md)
4849
- Image Edit Models
4950
- [FLUX.1-Kontext-dev](./docs/kontext.md)
5051
- [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
@@ -129,6 +130,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
129130
- [🔥Qwen Image](./docs/qwen_image.md)
130131
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
131132
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
133+
- [🔥Z-Image](./docs/z_image.md)
132134
- [LoRA](./docs/lora.md)
133135
- [LCM/LCM-LoRA](./docs/lcm.md)
134136
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

assets/z_image/bf16.png

1.01 MB
Loading

assets/z_image/q2_K.png

1.15 MB
Loading

assets/z_image/q3_K.png

1.07 MB
Loading

assets/z_image/q4_0.png

1.01 MB
Loading

assets/z_image/q4_K.png

1.02 MB
Loading

assets/z_image/q5_0.png

1.01 MB
Loading

assets/z_image/q6_K.png

1.02 MB
Loading

assets/z_image/q8_0.png

1.01 MB
Loading

conditioner.hpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,6 +1638,8 @@ struct LLMEmbedder : public Conditioner {
16381638
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
16391639
if (sd_version_is_flux2(version)) {
16401640
arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
1641+
} else if (sd_version_is_z_image(version)) {
1642+
arch = LLM::LLMArch::QWEN3;
16411643
}
16421644
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
16431645
tokenizer = std::make_shared<LLM::MistralTokenizer>();
@@ -1785,9 +1787,31 @@ struct LLMEmbedder : public Conditioner {
17851787
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
17861788
prompt += img_prompt;
17871789

1788-
prompt_attn_range.first = prompt.size();
1790+
prompt_attn_range.first = static_cast<int>(prompt.size());
17891791
prompt += conditioner_params.text;
1790-
prompt_attn_range.second = prompt.size();
1792+
prompt_attn_range.second = static_cast<int>(prompt.size());
1793+
1794+
prompt += "<|im_end|>\n<|im_start|>assistant\n";
1795+
} else if (sd_version_is_flux2(version)) {
1796+
prompt_template_encode_start_idx = 0;
1797+
out_layers = {10, 20, 30};
1798+
1799+
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
1800+
1801+
prompt_attn_range.first = static_cast<int>(prompt.size());
1802+
prompt += conditioner_params.text;
1803+
prompt_attn_range.second = static_cast<int>(prompt.size());
1804+
1805+
prompt += "[/INST]";
1806+
} else if (sd_version_is_z_image(version)) {
1807+
prompt_template_encode_start_idx = 0;
1808+
out_layers = {35}; // -2
1809+
1810+
prompt = "<|im_start|>user\n";
1811+
1812+
prompt_attn_range.first = static_cast<int>(prompt.size());
1813+
prompt += conditioner_params.text;
1814+
prompt_attn_range.second = static_cast<int>(prompt.size());
17911815

17921816
prompt += "<|im_end|>\n<|im_start|>assistant\n";
17931817
} else if (sd_version_is_flux2(version)) {
@@ -1806,9 +1830,9 @@ struct LLMEmbedder : public Conditioner {
18061830

18071831
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
18081832

1809-
prompt_attn_range.first = prompt.size();
1833+
prompt_attn_range.first = static_cast<int>(prompt.size());
18101834
prompt += conditioner_params.text;
1811-
prompt_attn_range.second = prompt.size();
1835+
prompt_attn_range.second = static_cast<int>(prompt.size());
18121836

18131837
prompt += "<|im_end|>\n<|im_start|>assistant\n";
18141838
}

0 commit comments

Comments
 (0)