
Conversation

@yiyixuxu
Collaborator

@yiyixuxu yiyixuxu commented Aug 22, 2025

Qwen-Image

  • Text2Image
  • controlnet
  • inpaint
  • controlnet + inpaint
  • img2img
  • controlnet + img2img
  • diffdiff (next PR!)
Test Script for Qwen-Image Auto Pipeline
# test modular auto (qwen image)
# use standard repo
import os
import torch

from diffusers import ModularPipeline, ComponentsManager
from diffusers.modular_pipelines.qwenimage import ALL_BLOCKS

from diffusers.utils import load_image
from image_gen_aux import DepthPreprocessor
import numpy as np
from PIL import Image

import logging
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("diffusers").setLevel(logging.INFO)

device = "cuda:2"
output_name_prefix = "test_modular_qwen_out"

components = ComponentsManager()
components.enable_auto_cpu_offload(device=device)

pipeline = ModularPipeline.from_pretrained("Qwen/Qwen-Image", components_manager=components)
print(pipeline)

pipeline.load_components(torch_dtype=torch.bfloat16)

print("pipeline loaded")
print(pipeline)
print(f" ")
print(f"pipeline.blocks")
print(pipeline.blocks)
print(f" ")

print(f"components:")
print(components)
print(f" ")


# test1: text2image with custom height/width

prompt = "现实主义风格的人像摄影作品,画面主体是一位容貌惊艳的女性面部特写。她拥有一头自然微卷的短发,发丝根根分明,蓬松的刘海修饰着额头,增添俏皮感。头上佩戴一顶绿色格子蕾丝边头巾,增添复古与柔美气息。身着一件简约绿色背心裙,在纯白色背景下格外突出。两只手分别握着半个红色桃子,双手轻轻贴在脸颊两侧,营造出可爱又富有创意的视觉效果。  人物表情生动,一只眼睛睁开,另一只微微闭合,展现出调皮与自信的神态。整体构图采用个性视角、非对称构图,聚焦人物主体,增强现场感和既视感。背景虚化处理,层次丰富,景深效果强烈,营造出低光氛围下浓厚的情绪张力。  画面细节精致,色彩生动饱满却不失柔和,呈现出富士胶片独有的温润质感。光影运用充满美学张力,带有轻微超现实的光效处理,提升整体画面高级感。整体风格为现实主义人像摄影,强调细腻的纹理与艺术化的光线表现,堪称一幅细节丰富、氛围拉满的杰作。超清,4K,电影级构图"
inputs = {
    "prompt": prompt,
    "generator": torch.manual_seed(0),
    "negative_prompt": " ",
    "height": 1328,
    "width": 1328,
    "num_inference_steps": 50,
    "num_images_per_prompt": 1,
}

output_images = pipeline(**inputs, output="images")
for i, image in enumerate(output_images):
    assert image.size == (1328, 1328)
    image.save(f"{output_name_prefix}_1_text2image_1328_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_1_text2image_1328_{i}.png')}")


# test2: text2image with default height and width
prompt = "现实主义风格的人像摄影作品,画面主体是一位容貌惊艳的女性面部特写。她拥有一头自然微卷的短发,发丝根根分明,蓬松的刘海修饰着额头,增添俏皮感。头上佩戴一顶绿色格子蕾丝边头巾,增添复古与柔美气息。身着一件简约绿色背心裙,在纯白色背景下格外突出。两只手分别握着半个红色桃子,双手轻轻贴在脸颊两侧,营造出可爱又富有创意的视觉效果。  人物表情生动,一只眼睛睁开,另一只微微闭合,展现出调皮与自信的神态。整体构图采用个性视角、非对称构图,聚焦人物主体,增强现场感和既视感。背景虚化处理,层次丰富,景深效果强烈,营造出低光氛围下浓厚的情绪张力。  画面细节精致,色彩生动饱满却不失柔和,呈现出富士胶片独有的温润质感。光影运用充满美学张力,带有轻微超现实的光效处理,提升整体画面高级感。整体风格为现实主义人像摄影,强调细腻的纹理与艺术化的光线表现,堪称一幅细节丰富、氛围拉满的杰作。超清,4K,电影级构图"
inputs = {
    "prompt": prompt,
    "generator": torch.manual_seed(0),
    "negative_prompt": " ",
    "num_inference_steps": 50,
    "num_images_per_prompt": 1,
}

output_images = pipeline(**inputs, output="images")
for i, image in enumerate(output_images):
    assert image.size == (1024, 1024)
    image.save(f"{output_name_prefix}_1_text2image_1024_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_1_text2image_1024_{i}.png')}")

# test3: inpaint

prompt = "cat wizard with red hat, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney"
negative_prompt = " "
source = load_image("https://github.com/Trgtuan10/Image_storage/blob/main/cute_cat.png?raw=true")
mask = load_image("https://github.com/Trgtuan10/Image_storage/blob/main/mask_cat.png?raw=true")

strengths = [0.9]

print(f"source.size: {source.size}")

for strength in strengths:
    image = pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=source.size[1],
        width=source.size[0],
        image=source,
        mask_image=mask,
        strength=strength,
        num_inference_steps=35,
        generator=torch.Generator(device="cuda").manual_seed(42),
        output="images"
    )[0]
    image.save(f"{output_name_prefix}_2_inpaint_{strength}.png")
    assert image.size == source.size
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_2_inpaint_{strength}.png')}")


# test4: controlnet

print("test controlnet")

# canny
from diffusers import QwenImageControlNetModel, QwenImageMultiControlNetModel

controlnet_spec = pipeline.get_component_spec("controlnet")
controlnet_spec.repo = "InstantX/Qwen-Image-ControlNet-Union"
controlnet = controlnet_spec.load(torch_dtype=torch.bfloat16)

pipeline.update_components(controlnet=controlnet)

print("pipeline (with controlnet)")
print(pipeline)
print(f" ")
print("components (with controlnet)")
print(components)
print(f" ")


control_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/qwencond_input.png")
prompt = "Aesthetics art, traditional asian pagoda, elaborate golden accents, sky blue and white color palette, swirling cloud pattern, digital illustration, east asian architecture, ornamental rooftop, intricate detailing on building, cultural representation."
controlnet_conditioning_scale = 1.0

print(f"control_image.size: {control_image.size}")

images = pipeline(
    prompt=prompt,
    negative_prompt=" ",
    control_image=control_image,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    width=control_image.size[0],
    height=control_image.size[1],
    generator=torch.Generator(device="cuda").manual_seed(42),
    output="images"
)
for i, image in enumerate(images):
    assert image.size == control_image.size
    image.save(f"{output_name_prefix}_3_controlnet_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_3_controlnet_{i}.png')}")

print(f" components:")
print(components)
print(f" ")

# test5: multi-controlnet 
multi_controlnet = QwenImageMultiControlNetModel([controlnet])
pipeline.update_components(controlnet=multi_controlnet)

images = pipeline(
    prompt=prompt,
    negative_prompt=" ",
    control_image=[control_image, control_image],
    controlnet_conditioning_scale=[controlnet_conditioning_scale/2, controlnet_conditioning_scale/2],
    width=control_image.size[0],
    height=control_image.size[1],
    generator=torch.Generator(device="cuda").manual_seed(42),
    output="images"
)
for i, image in enumerate(images):
    assert image.size == control_image.size
    image.save(f"{output_name_prefix}_3_controlnet_multi_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_3_controlnet_multi_{i}.png')}")



# test6: multi-controlnet, default height/width and num_images_per_prompt = 2
multi_controlnet = QwenImageMultiControlNetModel([controlnet])
pipeline.update_components(controlnet=multi_controlnet)

images = pipeline(
    prompt=prompt,
    negative_prompt=" ",
    control_image=[control_image, control_image],
    controlnet_conditioning_scale=[controlnet_conditioning_scale/2, controlnet_conditioning_scale/2],
    num_images_per_prompt=2,
    generator=torch.Generator(device="cuda").manual_seed(42),
    output="images"
)
for i, image in enumerate(images):
    assert image.size == (1024, 1024)
    image.save(f"{output_name_prefix}_3_controlnet_multi_2_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_3_controlnet_multi_2_{i}.png')}")

# test7: controlnet + inpaint

pipeline.update_components(controlnet=controlnet)

prompt = "a blue robot singing opera with human-like expressions"
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")

head_mask = np.zeros_like(image)
head_mask[65:580,300:642] = 255
mask_image = Image.fromarray(head_mask)

processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
control_image = processor(image)[0].convert("RGB")

print(f"image.size: {image.size}")
print(f"control_image.size: {control_image.size}")
print(f"mask_image.size: {mask_image.size}")

image_output = pipeline(
    prompt=prompt,
    image=image,
    mask_image=mask_image,
    control_image=control_image,
    strength=0.9,
    num_inference_steps=30,
    output="images",
)
for i, image in enumerate(image_output):
    assert image.size == (1024, 1024)
    image.save(f"{output_name_prefix}_4_controlnet_inpaint_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_4_controlnet_inpaint_{i}.png')}")



# test8: update guider (PAG)

from diffusers import LayerSkipConfig, PerturbedAttentionGuidance

# make a copy of the cfg guider spec to switch back later
cfg_guider_spec = pipeline.get_component_spec("guider")

pag_config = LayerSkipConfig(indices=[2, 9], skip_attention=False, skip_attention_scores=True, skip_ff=False)
pag_guider = PerturbedAttentionGuidance(
    guidance_scale=5.0, perturbed_guidance_scale=2.5, perturbed_guidance_config=pag_config
)
pipeline.update_components(guider=pag_guider)

print("pipeline.guider")
print(pipeline.guider)


# prompt = "A painting of a squirrel eating a burger"
prompt = "现实主义风格的人像摄影作品,画面主体是一位容貌惊艳的女性面部特写。她拥有一头自然微卷的短发,发丝根根分明,蓬松的刘海修饰着额头,增添俏皮感。头上佩戴一顶绿色格子蕾丝边头巾,增添复古与柔美气息。身着一件简约绿色背心裙,在纯白色背景下格外突出。两只手分别握着半个红色桃子,双手轻轻贴在脸颊两侧,营造出可爱又富有创意的视觉效果。  人物表情生动,一只眼睛睁开,另一只微微闭合,展现出调皮与自信的神态。整体构图采用个性视角、非对称构图,聚焦人物主体,增强现场感和既视感。背景虚化处理,层次丰富,景深效果强烈,营造出低光氛围下浓厚的情绪张力。  画面细节精致,色彩生动饱满却不失柔和,呈现出富士胶片独有的温润质感。光影运用充满美学张力,带有轻微超现实的光效处理,提升整体画面高级感。整体风格为现实主义人像摄影,强调细腻的纹理与艺术化的光线表现,堪称一幅细节丰富、氛围拉满的杰作。超清,4K,电影级构图"
inputs = {
    "prompt": prompt,
    "generator": torch.manual_seed(0),
    "negative_prompt": " ",
    "height": 1328,
    "width": 1328,
    "num_inference_steps": 50,
    "num_images_per_prompt": 1,
}

output_images = pipeline(**inputs, output="images")
for i, image in enumerate(output_images):
    assert image.size == (1328, 1328)
    image.save(f"{output_name_prefix}_5_guider_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_5_guider_{i}.png')}")



# test9: img2img

print(f"pipeline.guider")
print(pipeline.guider)
pipeline.update_components(guider=cfg_guider_spec)
print(f"pipeline.guider")
print(pipeline.guider)


init_image = load_image("https://github.com/Trgtuan10/Image_storage/blob/main/cute_cat.png?raw=true")
prompt = "wizard dog, Gandalf-inspired, Lord of the Rings aesthetic, majestic yet cute, Studio Ghibli style"

strengths = [0.6, 0.9, 1.0]

for s in strengths:
    out = pipeline(
        prompt=prompt,
        image=init_image,
        height=init_image.size[1],
        width=init_image.size[0],
        strength=s,
        num_inference_steps=35,
        generator=torch.Generator(device="cuda").manual_seed(42),
    )
    out.images[0].save(f"yiyi_test_5_output_6_img2img_{s}.png")
    print(f"image saved at {os.path.abspath(f'yiyi_test_5_output_6_img2img_{s}.png')}")


# test10: img2img + controlnet

# extract canny
get_image_step = ModularPipeline.from_pretrained("YiYiXu/image_inputs", trust_remote_code=True)
control_image = get_image_step(image=init_image, processor_id="canny", output="image")
controlnet_conditioning_scale = 1.0

strengths = [0.6, 0.9, 1.0]

for s in strengths:
    out = pipeline(
        prompt=prompt,
        image=init_image,
        control_image=control_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        height=init_image.size[1],
        width=init_image.size[0],
        strength=s,
        num_inference_steps=35,
        generator=torch.Generator(device="cuda").manual_seed(42),
    )
    out.images[0].save(f"yiyi_test_5_output_6_img2img_controlnet_{s}.png")
    print(f"image saved at {os.path.abspath(f'yiyi_test_5_output_6_img2img_controlnet_{s}.png')}")

print(f" components:")
print(components)
print(f" ")

QwenImage Edit

  • Edit
  • Edit + Inpaint
  • diffdiff (next PR!)
Test script for QwenImage-Edit in Modular
# test modular auto (qwen image edit)
# use standard repo
import os
import torch

from diffusers import ModularPipeline, ComponentsManager
from diffusers.modular_pipelines.qwenimage import ALL_BLOCKS

from diffusers.utils import load_image
from image_gen_aux import DepthPreprocessor
import numpy as np
from PIL import Image

import logging
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("diffusers").setLevel(logging.INFO)

device = "cuda:2"
output_name_prefix = "test_modular_qwen_edit_output"

components = ComponentsManager()
components.enable_auto_cpu_offload(device=device)

pipeline = ModularPipeline.from_pretrained("Qwen/Qwen-Image-Edit", components_manager=components)
print(pipeline)

pipeline.load_components(torch_dtype=torch.bfloat16)

print("pipeline loaded")
print(pipeline)
print(f" ")
print(f"pipeline.blocks")
print(pipeline.blocks)
print(f" ")

print(f"components:")
print(components)
print(f" ")


# edit

prompt = "change the hat to red"
negative_prompt = " "
source = load_image("https://github.com/Trgtuan10/Image_storage/blob/main/cute_cat.png?raw=true")
mask = load_image("https://github.com/Trgtuan10/Image_storage/blob/main/mask_cat.png?raw=true")

# edit
print(f"source size: {source.size}")
print(f"mask size: {mask.size}")
output_images = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=source,
    num_inference_steps=35,
    generator=torch.Generator(device="cuda").manual_seed(42),
).images
for i, image in enumerate(output_images):
    image.save(f"{output_name_prefix}_1_edit_{source.size[1]}_{source.size[0]}_{i}.png")
    print(f"image size: {image.size}")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_1_edit_{source.size[1]}_{source.size[0]}_{i}.png')}")

# edit + update guider (guidance_scale=4.5)

cfg_guider_spec = pipeline.get_component_spec("guider")
cfg_guider_spec.config["guidance_scale"] = 4.5
pipeline.update_components(guider=cfg_guider_spec)

print(f" print pipeline.guider")
print(pipeline.guider)

output_images = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=source,
    num_inference_steps=35,
    generator=torch.Generator(device="cuda").manual_seed(42),
).images
for i, image in enumerate(output_images):
    image.save(f"{output_name_prefix}_2_edit_guidance_scale_4.5_{i}.png")
    print(f"image size: {image.size}")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_2_edit_guidance_scale_4.5_{i}.png')}")

# edit + num_images_per_prompt==2
output_images = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=source,
    num_inference_steps=35,
    generator=torch.Generator(device="cuda").manual_seed(42),
    num_images_per_prompt=2,
).images
for i, image in enumerate(output_images):
    image.save(f"{output_name_prefix}_3_edit_num_images_per_prompt_2_{i}.png")
    print(f"image size: {image.size}")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_3_edit_num_images_per_prompt_2_{i}.png')}")

# edit + pag 
from diffusers import LayerSkipConfig, PerturbedAttentionGuidance

pag_config = LayerSkipConfig(indices=[2, 9], skip_attention=False, skip_attention_scores=True, skip_ff=False)
pag_guider = PerturbedAttentionGuidance(
    guidance_scale=5.0, perturbed_guidance_scale=2.5, perturbed_guidance_config=pag_config
)
pipeline.update_components(guider=pag_guider)

print(f" print pipeline.guider")
print(pipeline.guider)

output_images = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=source,
    num_inference_steps=35,
    generator=torch.Generator(device="cuda").manual_seed(42),
).images
for i, image in enumerate(output_images):
    image.save(f"{output_name_prefix}_4_edit_pag_{i}.png")
    print(f"image size: {image.size}")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_4_edit_pag_{i}.png')}")


# inpaint

strengths = [0.9, 1.0]
print(f" pipeline.guider")
print(pipeline.guider)

for strength in strengths:
    image_output = pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=source,
        mask_image=mask,
        strength=strength,
        num_inference_steps=35,
        generator=torch.Generator(device="cuda").manual_seed(42),
    ).images[0]
    image_output.save(f"{output_name_prefix}_5_inpaint_pag_{strength}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_5_inpaint_pag_{strength}.png')}")


# edit + cfg guider

pipeline.update_components(guider=cfg_guider_spec)

print(f" print pipeline.guider")
print(pipeline.guider)

input_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/qwenedit_input.png")
seed = 43
prompt = "The woman is displaying a plush toy product in her hand, while preserving her exact facial features, expression, clothing, and pose. Maintain the same background, natural lighting, and overall photographic composition and style."
inputs = {
    "image": input_image,
    "prompt": prompt,
    "generator": torch.Generator(device=device).manual_seed(seed),
    "num_inference_steps": 50,
    # "height": 1024,
    # "width": 1024,
}

output_images = pipeline(**inputs, output="images")
for i, image in enumerate(output_images):
    image.save(f"{output_name_prefix}_6_inpaint_cfg_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_6_inpaint_cfg_{i}.png')}")




# edit + cfg guider + custom size

pipeline.update_components(guider=cfg_guider_spec)

print(f" print pipeline.guider")
print(pipeline.guider)

input_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/qwenedit_input.png")
seed = 43
prompt = "The woman is displaying a plush toy product in her hand, while preserving her exact facial features, expression, clothing, and pose. Maintain the same background, natural lighting, and overall photographic composition and style."
inputs = {
    "image": input_image,
    "prompt": prompt,
    "generator": torch.Generator(device=device).manual_seed(seed),
    "num_inference_steps": 50,
    "height": 1024,
    "width": 1024,
}

output_images = pipeline(**inputs, output="images")
for i, image in enumerate(output_images):
    assert image.size == (1024, 1024)
    image.save(f"{output_name_prefix}_7_inpaint_cfg_custom_size_{i}.png")
    print(f"image saved at {os.path.abspath(f'{output_name_prefix}_7_inpaint_cfg_custom_size_{i}.png')}")

How to Use

The short version; see the test scripts above for complete, runnable examples.

To load from a standard repo:

import torch
from diffusers import ModularPipeline, ComponentsManager

repo_id = "Qwen/Qwen-Image"
# repo_id = "Qwen/Qwen-Image-Edit"

components = ComponentsManager()
components.enable_auto_cpu_offload(device="cuda")
pipeline = ModularPipeline.from_pretrained(repo_id, components_manager=components)
pipeline.load_components(torch_dtype=torch.bfloat16)
print(pipeline)

Add a ControlNet (we currently only have a ControlNet for Qwen-Image):

from diffusers import QwenImageControlNetModel, QwenImageMultiControlNetModel

controlnet_spec = pipeline.get_component_spec("controlnet")
controlnet_spec.repo = "InstantX/Qwen-Image-ControlNet-Union"
controlnet = controlnet_spec.load(torch_dtype=torch.bfloat16)
pipeline.update_components(controlnet=controlnet)
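
For multi-ControlNet, wrap the model in QwenImageMultiControlNetModel and pass one control image and one conditioning scale per ControlNet, as in the test script above:

from diffusers import QwenImageMultiControlNetModel

multi_controlnet = QwenImageMultiControlNetModel([controlnet])
pipeline.update_components(controlnet=multi_controlnet)

images = pipeline(
    prompt=prompt,
    negative_prompt=" ",
    control_image=[control_image, control_image],
    controlnet_conditioning_scale=[0.5, 0.5],
    output="images",
)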

Update the guider

To change guidance_scale:

cfg_guider_spec = pipeline.get_component_spec("guider")
cfg_guider_spec.config["guidance_scale"] = 4.5
pipeline.update_components(guider=cfg_guider_spec)

To use a different guidance method:

from diffusers import LayerSkipConfig, PerturbedAttentionGuidance

pag_config = LayerSkipConfig(indices=[2, 9], skip_attention=False, skip_attention_scores=True, skip_ff=False)
pag_guider = PerturbedAttentionGuidance(
    guidance_scale=5.0, perturbed_guidance_scale=2.5, perturbed_guidance_config=pag_config
)
pipeline.update_components(guider=pag_guider)
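
If you want to switch back to classifier-free guidance later, keep a copy of the original guider spec before swapping it out (this is what the test scripts above do):

# save the CFG guider spec before replacing it
cfg_guider_spec = pipeline.get_component_spec("guider")
pipeline.update_components(guider=pag_guider)

# ... run inference with PAG ...

# restore the CFG guider
pipeline.update_components(guider=cfg_guider_spec)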

To run inference

You can use the same pipeline to run all the tasks we support; the code is pretty much the same as in the regular pipelines.

# text2image
pipeline(prompt=prompt, ...).images[0]
# image2image
pipeline(prompt=prompt, image=...,  strength=..., ...).images[0]
# inpaint 
pipeline(prompt=prompt, image=...,  mask_image=..., strength=...,).images[0]

To add ControlNet to text2image, img2img, or inpaint, just pass control_image along with any other ControlNet-related arguments:

# text2image + controlnet
pipeline(prompt=prompt, control_image=..., ...).images[0]
# image2image + controlnet
pipeline(prompt=prompt, image=...,  strength=..., control_image=..., ...).images[0]
# inpaint + controlnet
pipeline(prompt=prompt, image=...,  mask_image=..., strength=..., control_image=..., ...).images[0]
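
The examples above access outputs via .images; alternatively, you can ask the modular pipeline to return a specific output directly with the output argument, as the test scripts do (the filename here is just an example):

# return the decoded images directly instead of the full pipeline state
images = pipeline(prompt=prompt, output="images")
images[0].save("text2image.png")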

@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

@yiyixuxu yiyixuxu requested review from DN6, asomoza and sayakpaul August 27, 2025 10:46
Member

@sayakpaul sayakpaul left a comment

Thank you!

_skip_proc_output_fn_Attention_WanAttnProcessor2_0 = _skip_attention___ret___hidden_states
# not sure what this is yet.
_skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states
Member

For my understanding. This one is for?

Collaborator Author

for guiders/hooks

return image


class InpaintProcessor(ConfigMixin):
Member

Really nice!

(not for this PR, we could attempt to have an example of the processor for an inpaint pipeline)

ComponentSpec(
    "guider",
    ClassifierFreeGuidance,
    config=FrozenDict({"guidance_scale": 4.0}),
Member

For the QwenImage pipeline, guidance_scale is akin to the one we have in Flux. However, I think we want to enable CFG with this which is done through true_cfg_scale. Should this be taken into account?

Collaborator Author

good questions,

the true_cfg_scale in flux/qwen is actually just guidance_scale in every other pipeline - it is part of the guider and should be set on the guider

we had to use a different name (true_cfg_scale) for flux because guidance_scale was already taken as an input for the distilled model. I think it would have been a lot better if we had given the distilled guidance a different name, so that we could keep the definition of guidance_scale consistent across all pipelines

I'd like to fix it here in modular. IMO it won't confuse users too much, because they can't pass guidance_scale or true_cfg_scale at runtime in modular as it is, so they will have to take some time to figure out how to use guidance properly, and we will have a chance to explain.

cc @DN6 @asomoza too, let me know if you have any thoughts around this
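
To make the distinction concrete, here is a minimal sketch based on the snippets in the test scripts above (the standard-pipeline call is only illustrative): in the standard QwenImage pipeline, CFG strength is passed at call time as true_cfg_scale, while in modular it is configured on the guider component.

# standard pipeline (illustrative): CFG strength is passed at runtime
# pipe(prompt=prompt, negative_prompt=" ", true_cfg_scale=4.0, ...)

# modular: the same knob lives on the guider component
cfg_guider_spec = pipeline.get_component_spec("guider")
cfg_guider_spec.config["guidance_scale"] = 4.0
pipeline.update_components(guider=cfg_guider_spec)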

Member

I think it would have been a lot better if we had given the distilled guidance a different name, so that we could keep the definition of guidance_scale consistent across all pipelines

I like this point a lot! However, we have guidance_scale in Flux (without the use of the Guider component):

InputParam("guidance_scale", default=3.5),

Maybe we could change that to something better suited (something like distilled_guidance_scale). This way, we can keep the meaning of guidance_scale consistent across the pipelines.
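
A minimal sketch of the proposed rename (distilled_guidance_scale is only the suggestion above, not a settled name):

# current Flux modular input
InputParam("guidance_scale", default=3.5),

# proposed: reserve guidance_scale for CFG and use a distinct name
# for the distilled-model guidance embedding input
InputParam("distilled_guidance_scale", default=3.5),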

Member

I completely agree, let's keep the guidance_scale consistent and use a different one for the distilled models.

Collaborator

So the proposal is that guidance_scale would always imply the CFG guidance scale?

I would argue that keeping guidance_scale for all guidance methods makes sense, since it implies how large of a step you want to take in the guidance direction.

Alternatively, we could introduce the concept of a DistilledGuidance guider, which is effectively a no-op. That makes it more explicit what's happening with the latents, rather than having to introduce new scale parameters, internal checks for negative embeds, or checks like self._is_cfg_enabled.
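
A rough, hypothetical sketch of that idea (not the actual diffusers guider interface; class name and methods are made up just to illustrate the no-op behavior):

# Hypothetical illustration only - class name and interface are made up.
class DistilledGuidance:
    """A guider that performs no prediction mixing.

    The scale is only forwarded to the model (as the distilled guidance
    embedding input); there is no unconditional branch and no CFG combination.
    """

    def __init__(self, guidance_scale: float = 3.5):
        self.guidance_scale = guidance_scale

    @property
    def num_conditions(self) -> int:
        # only the conditional forward pass is needed
        return 1

    def __call__(self, pred_cond, pred_uncond=None):
        # effectively a no-op: return the conditional prediction unchanged
        return pred_cond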

Member

Cool. Just for clarity, guidance_scale here would mean what true_cfg_scale means in the QwenImage pipelines, right?

Collaborator Author

yes

Collaborator Author

and we don't require passing a negative prompt to use it in modular

Member

Just a few notes for understanding.

I see that the default negative prompt we're using in modular is "":

negative_prompt = block_state.negative_prompt or ""

However, Qwen usually uses " ". So, since we don't require the user to pass a negative prompt to enable CFG, maybe we could use " " instead of "" as the default.
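
A minimal sketch of the suggested default (hypothetical; the exact default is up to this PR):

# current default in the text-encoder block
negative_prompt = block_state.negative_prompt or ""

# suggestion: match what the Qwen pipelines usually use
negative_prompt = block_state.negative_prompt or " "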

Collaborator Author

sounds good!

@DN6 DN6 added the roadmap Add to current release roadmap label Sep 2, 2025
@jferments

Will there be any features added soon for training Qwen models with HF libraries? What are the major barriers right now to making this happen?

@sayakpaul
Member

This is not the right PR to discuss Qwen training.

You can check out https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_qwen.md as well as https://github.com/ostris/ai-toolkit for training with HF libs.

Comment on lines 526 to 528
else:
raise ValueError(f"Unsupported image type: {type(image)}")

Collaborator Author

Suggested change
else:
raise ValueError(f"Unsupported image type: {type(image)}")

remove this for now, will update in a separate PR since it requires changes to the regular qwen/flux pipelines

@yiyixuxu yiyixuxu merged commit f50b18e into main Sep 8, 2025
17 checks passed
@github-project-automation github-project-automation bot moved this from In Progress to Done in Diffusers Roadmap 0.36 Sep 8, 2025
@yiyixuxu yiyixuxu deleted the modular-qwen branch September 8, 2025 10:31