
Commit 3c57672

Merge remote-tracking branch 'origin/wan22-lightx2v' into wan22-lightx2v

2 parents 2a5b07d + 64d9b04

File tree: 11 files changed (+129, -14 lines)


docs/source/en/api/pipelines/qwenimage.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -86,6 +86,12 @@ image.save("qwen_fewsteps.png")
 
 </details>
 
+<Tip>
+
+The `guidance_scale` parameter in the pipeline is there to support future guidance-distilled models. Note that passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt like " " enables the classifier-free guidance computations.
+
+</Tip>
+
 ## QwenImagePipeline
 
 [[autodoc]] QwenImagePipeline
```
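For illustration, a minimal sketch of the behavior the tip describes, using the `Qwen/Qwen-Image` checkpoint named in this commit's README change; the prompt, step count, and scale values are illustrative, not recommendations:

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# `guidance_scale` alone has no effect here; `true_cfg_scale` plus a
# negative prompt (even just " ") is what turns on classifier-free guidance.
image = pipe(
    prompt="a corgi wearing sunglasses",  # illustrative prompt
    negative_prompt=" ",
    true_cfg_scale=4.0,
    num_inference_steps=50,
).images[0]
image.save("qwen_cfg.png")
```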

examples/dreambooth/README_qwen.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -75,9 +75,9 @@ Now, we can launch training using:
 ```bash
 export MODEL_NAME="Qwen/Qwen-Image"
 export INSTANCE_DIR="dog"
-export OUTPUT_DIR="trained-sana-lora"
+export OUTPUT_DIR="trained-qwenimage-lora"
 
-accelerate launch train_dreambooth_lora_sana.py \
+accelerate launch train_dreambooth_lora_qwenimage.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --instance_data_dir=$INSTANCE_DIR \
   --output_dir=$OUTPUT_DIR \
````

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 68 additions & 0 deletions
```diff
@@ -2129,6 +2129,74 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
 
 
 def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
+    has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
+    if has_lora_unet:
+        state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()}
+
+        def convert_key(key: str) -> str:
+            prefix = "transformer_blocks"
+            if "." in key:
+                base, suffix = key.rsplit(".", 1)
+            else:
+                base, suffix = key, ""
+
+            start = f"{prefix}_"
+            rest = base[len(start) :]
+
+            if "." in rest:
+                head, tail = rest.split(".", 1)
+                tail = "." + tail
+            else:
+                head, tail = rest, ""
+
+            # Protected n-grams that must keep their internal underscores
+            protected = {
+                # pairs
+                ("to", "q"),
+                ("to", "k"),
+                ("to", "v"),
+                ("to", "out"),
+                ("add", "q"),
+                ("add", "k"),
+                ("add", "v"),
+                ("txt", "mlp"),
+                ("img", "mlp"),
+                ("txt", "mod"),
+                ("img", "mod"),
+                # triplets
+                ("add", "q", "proj"),
+                ("add", "k", "proj"),
+                ("add", "v", "proj"),
+                ("to", "add", "out"),
+            }
+
+            prot_by_len = {}
+            for ng in protected:
+                prot_by_len.setdefault(len(ng), set()).add(ng)
+
+            parts = head.split("_")
+            merged = []
+            i = 0
+            lengths_desc = sorted(prot_by_len.keys(), reverse=True)
+
+            while i < len(parts):
+                matched = False
+                for L in lengths_desc:
+                    if i + L <= len(parts) and tuple(parts[i : i + L]) in prot_by_len[L]:
+                        merged.append("_".join(parts[i : i + L]))
+                        i += L
+                        matched = True
+                        break
+                if not matched:
+                    merged.append(parts[i])
+                    i += 1
+
+            head_converted = ".".join(merged)
+            converted_base = f"{prefix}.{head_converted}{tail}"
+            return converted_base + (("." + suffix) if suffix else "")
+
+        state_dict = {convert_key(k): v for k, v in state_dict.items()}
+
     converted_state_dict = {}
     all_keys = list(state_dict.keys())
     down_key = ".lora_down.weight"
```
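To make the effect of `convert_key` concrete, here is an illustrative before/after mapping on hypothetical Kohya-style keys (after the `lora_unet_` prefix is stripped): protected n-grams keep their underscores, while every other underscore becomes a module separator.

```python
# Hypothetical input keys -> the diffusers-style keys convert_key produces
expected = {
    "transformer_blocks_0_attn_to_q.lora_down.weight":
        "transformer_blocks.0.attn.to_q.lora_down.weight",       # ("to", "q") kept
    "transformer_blocks_12_attn_add_q_proj.lora_up.weight":
        "transformer_blocks.12.attn.add_q_proj.lora_up.weight",  # ("add", "q", "proj") kept
    "transformer_blocks_3_img_mlp_net_2.alpha":
        "transformer_blocks.3.img_mlp.net.2.alpha",              # ("img", "mlp") kept
}
```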

src/diffusers/loaders/lora_pipeline.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -6685,7 +6685,8 @@ def lora_state_dict(
         state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
 
         has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
-        if has_alphas_in_sd:
+        has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
+        if has_alphas_in_sd or has_lora_unet:
             state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)
 
         out = (state_dict, metadata) if return_lora_metadata else state_dict
```
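In practice, this broadened detection means a Kohya-style Qwen-Image LoRA whose keys start with `lora_unet_` is now converted automatically even when it ships no `.alpha` keys. A minimal sketch; the repo id and file name below are placeholders, not verified artifacts:

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)

# Either detection path routes through _convert_non_diffusers_qwen_lora_to_diffusers:
# keys ending in ".alpha" OR keys prefixed with "lora_unet_".
pipe.load_lora_weights(
    "some-user/qwen-image-lora",     # placeholder repo id
    weight_name="lora.safetensors",  # placeholder file name
)
```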

src/diffusers/models/autoencoders/autoencoder_dc.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -299,6 +299,7 @@ def __init__(
         act_fn: Union[str, Tuple[str]] = "silu",
         upsample_block_type: str = "pixel_shuffle",
         in_shortcut: bool = True,
+        conv_act_fn: str = "relu",
     ):
         super().__init__()
 
@@ -349,7 +350,7 @@ def __init__(
         channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
 
         self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
-        self.conv_act = nn.ReLU()
+        self.conv_act = get_activation(conv_act_fn)
         self.conv_out = None
 
         if layers_per_block[0] > 0:
@@ -414,6 +415,12 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             The normalization type(s) to use in the decoder.
         decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`):
             The activation function(s) to use in the decoder.
+        encoder_out_shortcut (`bool`, defaults to `True`):
+            Whether to use a shortcut at the end of the encoder.
+        decoder_in_shortcut (`bool`, defaults to `True`):
+            Whether to use a shortcut at the beginning of the decoder.
+        decoder_conv_act_fn (`str`, defaults to `"relu"`):
+            The activation function to use at the end of the decoder.
         scaling_factor (`float`, defaults to `1.0`):
             The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent
             space to have unit variance when training the diffusion model. The latents are scaled with the formula `z =
@@ -441,6 +448,9 @@ def __init__(
         downsample_block_type: str = "pixel_unshuffle",
         decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
         decoder_act_fns: Union[str, Tuple[str]] = "silu",
+        encoder_out_shortcut: bool = True,
+        decoder_in_shortcut: bool = True,
+        decoder_conv_act_fn: str = "relu",
         scaling_factor: float = 1.0,
     ) -> None:
         super().__init__()
@@ -454,6 +464,7 @@ def __init__(
             layers_per_block=encoder_layers_per_block,
             qkv_multiscales=encoder_qkv_multiscales,
             downsample_block_type=downsample_block_type,
+            out_shortcut=encoder_out_shortcut,
         )
         self.decoder = Decoder(
             in_channels=in_channels,
@@ -466,6 +477,8 @@ def __init__(
             norm_type=decoder_norm_types,
             act_fn=decoder_act_fns,
             upsample_block_type=upsample_block_type,
+            in_shortcut=decoder_in_shortcut,
+            conv_act_fn=decoder_conv_act_fn,
         )
 
         self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
```
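A minimal sketch of how the new flags surface on the public config, keeping library defaults for everything else; the flag values below are illustrative, not a released checkpoint config:

```python
import torch
from diffusers import AutoencoderDC

dc_ae = AutoencoderDC(
    encoder_out_shortcut=False,  # new: drop the shortcut at the encoder output
    decoder_in_shortcut=False,   # new: drop the shortcut at the decoder input
    decoder_conv_act_fn="silu",  # new: replaces the previously hard-coded nn.ReLU()
)

x = torch.randn(1, 3, 256, 256)
latent = dc_ae.encode(x).latent  # default config compresses 32x spatially
recon = dc_ae.decode(latent).sample
```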

src/diffusers/models/model_loading_utils.py

Lines changed: 15 additions & 9 deletions
```diff
@@ -726,23 +726,29 @@ def _caching_allocator_warmup(
     very large margin.
     """
     factor = 2 if hf_quantizer is None else hf_quantizer.get_cuda_warm_up_factor()
-    # Remove disk and cpu devices, and cast to proper torch.device
+
+    # Keep only accelerator devices
     accelerator_device_map = {
         param: torch.device(device)
         for param, device in expanded_device_map.items()
         if str(device) not in ["cpu", "disk"]
     }
-    total_byte_count = defaultdict(lambda: 0)
+    if not accelerator_device_map:
+        return
+
+    elements_per_device = defaultdict(int)
     for param_name, device in accelerator_device_map.items():
         try:
-            param = model.get_parameter(param_name)
+            p = model.get_parameter(param_name)
         except AttributeError:
-            param = model.get_buffer(param_name)
-        # The dtype of different parameters may be different with composite models or `keep_in_fp32_modules`
-        param_byte_count = param.numel() * param.element_size()
+            try:
+                p = model.get_buffer(param_name)
+            except AttributeError:
+                raise AttributeError(f"Parameter or buffer with name={param_name} not found in model")
         # TODO: account for TP when needed.
-        total_byte_count[device] += param_byte_count
+        elements_per_device[device] += p.numel()
 
     # This will kick off the caching allocator to avoid having to Malloc afterwards
-    for device, byte_count in total_byte_count.items():
-        _ = torch.empty(byte_count // factor, dtype=dtype, device=device, requires_grad=False)
+    for device, elem_count in elements_per_device.items():
+        warmup_elems = max(1, elem_count // factor)
+        _ = torch.empty(warmup_elems, dtype=dtype, device=device, requires_grad=False)
```
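The core of the fix: `torch.empty(n, dtype=dtype)` allocates `n` elements, not `n` bytes, so sizing the warmup tensor by byte count over-allocated by a factor of the element size. A small sketch of the arithmetic, with illustrative values:

```python
import torch

num_elems = 1_000_000  # total parameter elements on one device
dtype = torch.float16  # 2 bytes per element
factor = 2             # non-quantized warmup divisor

# Old sizing: a byte count used as an element count -> 2x over-allocation for fp16
byte_count = num_elems * torch.finfo(dtype).bits // 8  # 2_000_000 bytes
old = torch.empty(byte_count // factor, dtype=dtype)   # 1_000_000 elems = 2 MB

# New sizing: element count divided by factor -> the intended 1 MB
new = torch.empty(max(1, num_elems // factor), dtype=dtype)  # 500_000 elems = 1 MB
```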

src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -480,6 +480,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -597,6 +597,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -568,6 +568,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -698,6 +698,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```
