
Commit 3c57672

Merge remote-tracking branch 'origin/wan22-lightx2v' into wan22-lightx2v

2 parents 2a5b07d + 64d9b04

File tree: 11 files changed (+129, -14 lines)


docs/source/en/api/pipelines/qwenimage.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -86,6 +86,12 @@ image.save("qwen_fewsteps.png")
 
 </details>
 
+<Tip>
+
+The `guidance_scale` parameter in the pipeline is there to support future guidance-distilled models. Note that passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt like " " enables the classifier-free guidance computations.
+
+</Tip>
+
 ## QwenImagePipeline
 
 [[autodoc]] QwenImagePipeline
```
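For illustration, a minimal sketch of the behavior the tip describes, using the `Qwen/Qwen-Image` checkpoint named in this commit's README change; the prompt, step count, and scale values are illustrative, not recommendations:

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# `guidance_scale` alone has no effect here; `true_cfg_scale` plus a
# negative prompt (even just " ") is what turns on classifier-free guidance.
image = pipe(
    prompt="a corgi wearing sunglasses",  # illustrative prompt
    negative_prompt=" ",
    true_cfg_scale=4.0,
    num_inference_steps=50,
).images[0]
image.save("qwen_cfg.png")
```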

examples/dreambooth/README_qwen.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -75,9 +75,9 @@ Now, we can launch training using:
 ```bash
 export MODEL_NAME="Qwen/Qwen-Image"
 export INSTANCE_DIR="dog"
-export OUTPUT_DIR="trained-sana-lora"
+export OUTPUT_DIR="trained-qwenimage-lora"
 
-accelerate launch train_dreambooth_lora_sana.py \
+accelerate launch train_dreambooth_lora_qwenimage.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --instance_data_dir=$INSTANCE_DIR \
   --output_dir=$OUTPUT_DIR \
````

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 68 additions & 0 deletions
```diff
@@ -2129,6 +2129,74 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
 
 
 def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
+    has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
+    if has_lora_unet:
+        state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()}
+
+        def convert_key(key: str) -> str:
+            prefix = "transformer_blocks"
+            if "." in key:
+                base, suffix = key.rsplit(".", 1)
+            else:
+                base, suffix = key, ""
+
+            start = f"{prefix}_"
+            rest = base[len(start) :]
+
+            if "." in rest:
+                head, tail = rest.split(".", 1)
+                tail = "." + tail
+            else:
+                head, tail = rest, ""
+
+            # Protected n-grams that must keep their internal underscores
+            protected = {
+                # pairs
+                ("to", "q"),
+                ("to", "k"),
+                ("to", "v"),
+                ("to", "out"),
+                ("add", "q"),
+                ("add", "k"),
+                ("add", "v"),
+                ("txt", "mlp"),
+                ("img", "mlp"),
+                ("txt", "mod"),
+                ("img", "mod"),
+                # triplets
+                ("add", "q", "proj"),
+                ("add", "k", "proj"),
+                ("add", "v", "proj"),
+                ("to", "add", "out"),
+            }
+
+            prot_by_len = {}
+            for ng in protected:
+                prot_by_len.setdefault(len(ng), set()).add(ng)
+
+            parts = head.split("_")
+            merged = []
+            i = 0
+            lengths_desc = sorted(prot_by_len.keys(), reverse=True)
+
+            while i < len(parts):
+                matched = False
+                for L in lengths_desc:
+                    if i + L <= len(parts) and tuple(parts[i : i + L]) in prot_by_len[L]:
+                        merged.append("_".join(parts[i : i + L]))
+                        i += L
+                        matched = True
+                        break
+                if not matched:
+                    merged.append(parts[i])
+                    i += 1
+
+            head_converted = ".".join(merged)
+            converted_base = f"{prefix}.{head_converted}{tail}"
+            return converted_base + (("." + suffix) if suffix else "")
+
+        state_dict = {convert_key(k): v for k, v in state_dict.items()}
+
     converted_state_dict = {}
     all_keys = list(state_dict.keys())
     down_key = ".lora_down.weight"
```
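To make the effect of `convert_key` concrete, here is an illustrative before/after mapping on hypothetical Kohya-style keys (after the `lora_unet_` prefix is stripped): protected n-grams keep their underscores, while every other underscore becomes a module separator.

```python
# Hypothetical input keys -> the diffusers-style keys convert_key produces
expected = {
    "transformer_blocks_0_attn_to_q.lora_down.weight":
        "transformer_blocks.0.attn.to_q.lora_down.weight",       # ("to", "q") kept
    "transformer_blocks_12_attn_add_q_proj.lora_up.weight":
        "transformer_blocks.12.attn.add_q_proj.lora_up.weight",  # ("add", "q", "proj") kept
    "transformer_blocks_3_img_mlp_net_2.alpha":
        "transformer_blocks.3.img_mlp.net.2.alpha",              # ("img", "mlp") kept
}
```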

src/diffusers/loaders/lora_pipeline.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -6685,7 +6685,8 @@ def lora_state_dict(
         state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
 
         has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
-        if has_alphas_in_sd:
+        has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
+        if has_alphas_in_sd or has_lora_unet:
             state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)
 
         out = (state_dict, metadata) if return_lora_metadata else state_dict
```
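In practice, this broadened detection means a Kohya-style Qwen-Image LoRA whose keys start with `lora_unet_` is now converted automatically even when it ships no `.alpha` keys. A minimal sketch; the repo id and file name below are placeholders, not verified artifacts:

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)

# Either detection path routes through _convert_non_diffusers_qwen_lora_to_diffusers:
# keys ending in ".alpha" OR keys prefixed with "lora_unet_".
pipe.load_lora_weights(
    "some-user/qwen-image-lora",     # placeholder repo id
    weight_name="lora.safetensors",  # placeholder file name
)
```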

src/diffusers/models/autoencoders/autoencoder_dc.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -299,6 +299,7 @@ def __init__(
         act_fn: Union[str, Tuple[str]] = "silu",
         upsample_block_type: str = "pixel_shuffle",
         in_shortcut: bool = True,
+        conv_act_fn: str = "relu",
     ):
         super().__init__()
 
@@ -349,7 +350,7 @@ def __init__(
         channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
 
         self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
-        self.conv_act = nn.ReLU()
+        self.conv_act = get_activation(conv_act_fn)
         self.conv_out = None
 
         if layers_per_block[0] > 0:
@@ -414,6 +415,12 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             The normalization type(s) to use in the decoder.
         decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`):
             The activation function(s) to use in the decoder.
+        encoder_out_shortcut (`bool`, defaults to `True`):
+            Whether to use a shortcut at the end of the encoder.
+        decoder_in_shortcut (`bool`, defaults to `True`):
+            Whether to use a shortcut at the beginning of the decoder.
+        decoder_conv_act_fn (`str`, defaults to `"relu"`):
+            The activation function to use at the end of the decoder.
         scaling_factor (`float`, defaults to `1.0`):
             The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent
             space to have unit variance when training the diffusion model. The latents are scaled with the formula `z =
@@ -441,6 +448,9 @@ def __init__(
         downsample_block_type: str = "pixel_unshuffle",
         decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
         decoder_act_fns: Union[str, Tuple[str]] = "silu",
+        encoder_out_shortcut: bool = True,
+        decoder_in_shortcut: bool = True,
+        decoder_conv_act_fn: str = "relu",
         scaling_factor: float = 1.0,
     ) -> None:
         super().__init__()
@@ -454,6 +464,7 @@ def __init__(
             layers_per_block=encoder_layers_per_block,
             qkv_multiscales=encoder_qkv_multiscales,
             downsample_block_type=downsample_block_type,
+            out_shortcut=encoder_out_shortcut,
         )
         self.decoder = Decoder(
             in_channels=in_channels,
@@ -466,6 +477,8 @@ def __init__(
             norm_type=decoder_norm_types,
             act_fn=decoder_act_fns,
             upsample_block_type=upsample_block_type,
+            in_shortcut=decoder_in_shortcut,
+            conv_act_fn=decoder_conv_act_fn,
         )
 
         self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
```
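A minimal sketch of how the new flags surface on the public config, keeping library defaults for everything else; the flag values below are illustrative, not a released checkpoint config:

```python
import torch
from diffusers import AutoencoderDC

dc_ae = AutoencoderDC(
    encoder_out_shortcut=False,  # new: drop the shortcut at the encoder output
    decoder_in_shortcut=False,   # new: drop the shortcut at the decoder input
    decoder_conv_act_fn="silu",  # new: replaces the previously hard-coded nn.ReLU()
)

x = torch.randn(1, 3, 256, 256)
latent = dc_ae.encode(x).latent  # default config compresses 32x spatially
recon = dc_ae.decode(latent).sample
```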

src/diffusers/models/model_loading_utils.py

Lines changed: 15 additions & 9 deletions
```diff
@@ -726,23 +726,29 @@ def _caching_allocator_warmup(
     very large margin.
     """
     factor = 2 if hf_quantizer is None else hf_quantizer.get_cuda_warm_up_factor()
-    # Remove disk and cpu devices, and cast to proper torch.device
+
+    # Keep only accelerator devices
     accelerator_device_map = {
         param: torch.device(device)
         for param, device in expanded_device_map.items()
         if str(device) not in ["cpu", "disk"]
     }
-    total_byte_count = defaultdict(lambda: 0)
+    if not accelerator_device_map:
+        return
+
+    elements_per_device = defaultdict(int)
     for param_name, device in accelerator_device_map.items():
         try:
-            param = model.get_parameter(param_name)
+            p = model.get_parameter(param_name)
         except AttributeError:
-            param = model.get_buffer(param_name)
-        # The dtype of different parameters may be different with composite models or `keep_in_fp32_modules`
-        param_byte_count = param.numel() * param.element_size()
+            try:
+                p = model.get_buffer(param_name)
+            except AttributeError:
+                raise AttributeError(f"Parameter or buffer with name={param_name} not found in model")
         # TODO: account for TP when needed.
-        total_byte_count[device] += param_byte_count
+        elements_per_device[device] += p.numel()
 
     # This will kick off the caching allocator to avoid having to Malloc afterwards
-    for device, byte_count in total_byte_count.items():
-        _ = torch.empty(byte_count // factor, dtype=dtype, device=device, requires_grad=False)
+    for device, elem_count in elements_per_device.items():
+        warmup_elems = max(1, elem_count // factor)
+        _ = torch.empty(warmup_elems, dtype=dtype, device=device, requires_grad=False)
```
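The core of the fix: `torch.empty(n, dtype=dtype)` allocates `n` elements, not `n` bytes, so sizing the warmup tensor by byte count over-allocated by a factor of the element size. A small sketch of the arithmetic, with illustrative values:

```python
import torch

num_elems = 1_000_000  # total parameter elements on one device
dtype = torch.float16  # 2 bytes per element
factor = 2             # non-quantized warmup divisor

# Old sizing: a byte count used as an element count -> 2x over-allocation for fp16
byte_count = num_elems * torch.finfo(dtype).bits // 8  # 2_000_000 bytes
old = torch.empty(byte_count // factor, dtype=dtype)   # 1_000_000 elems = 2 MB

# New sizing: element count divided by factor -> the intended 1 MB
new = torch.empty(max(1, num_elems // factor), dtype=dtype)  # 500_000 elems = 1 MB
```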

src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -480,6 +480,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -597,6 +597,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -568,6 +568,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -698,6 +698,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models. Note that
+                passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` along with a `negative_prompt`; even an empty negative prompt
+                like " " enables the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
```
