Commit 7881392

Merge branch 'unet-1d-time-embed-debugging' of https://github.com/SammyAgrawal/diffusers into unet-1d-time-embed-debugging
2 parents: 9335c36 + ff2df6e

29 files changed: +1099 -1163 lines

docs/source/en/api/pipelines/qwenimage.md

Lines changed: 57 additions & 0 deletions
@@ -24,6 +24,63 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 
 </Tip>
 
+## LoRA for faster inference
+
+Use a LoRA from `lightx2v/Qwen-Image-Lightning` to speed up inference by reducing the
+number of steps. Refer to the code snippet below:
+
+<details>
+<summary>Code</summary>
+
+```py
+from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+import torch
+import math
+
+ckpt_id = "Qwen/Qwen-Image"
+
+# From
+# https://github.com/ModelTC/Qwen-Image-Lightning/blob/342260e8f5468d2f24d084ce04f55e101007118b/generate_with_diffusers.py#L82C9-L97C10
+scheduler_config = {
+    "base_image_seq_len": 256,
+    "base_shift": math.log(3),  # We use shift=3 in distillation
+    "invert_sigmas": False,
+    "max_image_seq_len": 8192,
+    "max_shift": math.log(3),  # We use shift=3 in distillation
+    "num_train_timesteps": 1000,
+    "shift": 1.0,
+    "shift_terminal": None,  # set shift_terminal to None
+    "stochastic_sampling": False,
+    "time_shift_type": "exponential",
+    "use_beta_sigmas": False,
+    "use_dynamic_shifting": True,
+    "use_exponential_sigmas": False,
+    "use_karras_sigmas": False,
+}
+scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+pipe = DiffusionPipeline.from_pretrained(
+    ckpt_id, scheduler=scheduler, torch_dtype=torch.bfloat16
+).to("cuda")
+pipe.load_lora_weights(
+    "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors"
+)
+
+prompt = "a tiny astronaut hatching from an egg on the moon, Ultra HD, 4K, cinematic composition."
+negative_prompt = " "
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=1024,
+    height=1024,
+    num_inference_steps=8,
+    true_cfg_scale=1.0,
+    generator=torch.manual_seed(0),
+).images[0]
+image.save("qwen_fewsteps.png")
+```
+
+</details>
+
 ## QwenImagePipeline
 
 [[autodoc]] QwenImagePipeline

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 36 additions & 0 deletions
@@ -2077,3 +2077,39 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
     converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
     converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
     return converted_state_dict
+
+
+def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
+    converted_state_dict = {}
+    all_keys = list(state_dict.keys())
+    down_key = ".lora_down.weight"
+    up_key = ".lora_up.weight"
+
+    def get_alpha_scales(down_weight, alpha_key):
+        rank = down_weight.shape[0]
+        alpha = state_dict.pop(alpha_key).item()
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+        return scale_down, scale_up
+
+    for k in all_keys:
+        if k.endswith(down_key):
+            diffusers_down_key = k.replace(down_key, ".lora_A.weight")
+            diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
+            alpha_key = k.replace(down_key, ".alpha")
+
+            down_weight = state_dict.pop(k)
+            up_weight = state_dict.pop(k.replace(down_key, up_key))
+            scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+            converted_state_dict[diffusers_down_key] = down_weight * scale_down
+            converted_state_dict[diffusers_up_key] = up_weight * scale_up
+
+    if len(state_dict) > 0:
+        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
+
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict
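
The converter rescales each LoRA pair by `alpha / rank` and splits that factor between the `lora_A` (down) and `lora_B` (up) matrices. Below is a minimal sketch of what it produces for a single toy layer; the layer name, rank, alpha, and feature size are invented, and the import assumes the private module shown above.

```py
# Toy illustration of _convert_non_diffusers_qwen_lora_to_diffusers; names and shapes are made up.
import torch

from diffusers.loaders.lora_conversion_utils import _convert_non_diffusers_qwen_lora_to_diffusers

rank, alpha, features = 4, 8.0, 64
state_dict = {
    "transformer_blocks.0.attn.to_q.lora_down.weight": torch.randn(rank, features),
    "transformer_blocks.0.attn.to_q.lora_up.weight": torch.randn(features, rank),
    "transformer_blocks.0.attn.to_q.alpha": torch.tensor(alpha),
}

converted = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)
print(sorted(converted))
# ['transformer.transformer_blocks.0.attn.to_q.lora_A.weight',
#  'transformer.transformer_blocks.0.attn.to_q.lora_B.weight']

# alpha / rank = 2.0 here; get_alpha_scales leaves the whole factor on the down
# weight (scale_down=2.0, scale_up=1.0) because the balancing loop only runs
# while scale_down * 2 < scale_up.
```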

src/diffusers/loaders/lora_pipeline.py

Lines changed: 5 additions & 1 deletion
@@ -49,6 +49,7 @@
     _convert_non_diffusers_lora_to_diffusers,
     _convert_non_diffusers_ltxv_lora_to_diffusers,
     _convert_non_diffusers_lumina2_lora_to_diffusers,
+    _convert_non_diffusers_qwen_lora_to_diffusers,
     _convert_non_diffusers_wan_lora_to_diffusers,
     _convert_xlabs_flux_lora_to_diffusers,
     _maybe_map_sgm_blocks_to_diffusers,
@@ -6548,7 +6549,6 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin):
 
     @classmethod
     @validate_hf_hub_args
-    # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.lora_state_dict
     def lora_state_dict(
         cls,
         pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
@@ -6642,6 +6642,10 @@ def lora_state_dict(
             logger.warning(warn_msg)
             state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
 
+        has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
+        if has_alphas_in_sd:
+            state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)
+
         out = (state_dict, metadata) if return_lora_metadata else state_dict
         return out
 
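
With this check in place, a checkpoint stored in the original `lora_down`/`lora_up`/`.alpha` layout is converted on the fly when it is read through the Qwen loader. A minimal sketch follows; it assumes the Lightning repository and weight name from the docs above and downloads that checkpoint.

```py
# Sketch: lora_state_dict routes alpha-style checkpoints through the Qwen converter.
from diffusers import QwenImagePipeline

state_dict = QwenImagePipeline.lora_state_dict(
    "lightx2v/Qwen-Image-Lightning",
    weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors",
)
# Keys come back in diffusers form, e.g. "transformer.<...>.lora_A.weight" / ".lora_B.weight".
print(next(iter(state_dict)))
```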

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 31 additions & 24 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+import functools
 import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -162,15 +163,15 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
         self.axes_dim = axes_dim
         pos_index = torch.arange(1024)
         neg_index = torch.arange(1024).flip(0) * -1 - 1
-        self.pos_freqs = torch.cat(
+        pos_freqs = torch.cat(
             [
                 self.rope_params(pos_index, self.axes_dim[0], self.theta),
                 self.rope_params(pos_index, self.axes_dim[1], self.theta),
                 self.rope_params(pos_index, self.axes_dim[2], self.theta),
             ],
             dim=1,
         )
-        self.neg_freqs = torch.cat(
+        neg_freqs = torch.cat(
             [
                 self.rope_params(neg_index, self.axes_dim[0], self.theta),
                 self.rope_params(neg_index, self.axes_dim[1], self.theta),
@@ -179,6 +180,8 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
             dim=1,
         )
         self.rope_cache = {}
+        self.register_buffer("pos_freqs", pos_freqs, persistent=False)
+        self.register_buffer("neg_freqs", neg_freqs, persistent=False)
 
         # whether to use scale rope
         self.scale_rope = scale_rope
@@ -198,33 +201,17 @@ def forward(self, video_fhw, txt_seq_lens, device):
         Args: video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video Args:
         txt_length: [bs] a list of 1 integers representing the length of the text
         """
-        if self.pos_freqs.device != device:
-            self.pos_freqs = self.pos_freqs.to(device)
-            self.neg_freqs = self.neg_freqs.to(device)
-
         if isinstance(video_fhw, list):
             video_fhw = video_fhw[0]
         frame, height, width = video_fhw
         rope_key = f"{frame}_{height}_{width}"
 
-        if rope_key not in self.rope_cache:
-            seq_lens = frame * height * width
-            freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-            freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-            freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
-            if self.scale_rope:
-                freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
-                freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
-                freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
-                freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
-
-            else:
-                freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
-                freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
-
-            freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-            self.rope_cache[rope_key] = freqs.clone().contiguous()
-        vid_freqs = self.rope_cache[rope_key]
+        if not torch.compiler.is_compiling():
+            if rope_key not in self.rope_cache:
+                self.rope_cache[rope_key] = self._compute_video_freqs(frame, height, width)
+            vid_freqs = self.rope_cache[rope_key]
+        else:
+            vid_freqs = self._compute_video_freqs(frame, height, width)
 
         if self.scale_rope:
             max_vid_index = max(height // 2, width // 2)
@@ -236,6 +223,25 @@ def forward(self, video_fhw, txt_seq_lens, device):
 
         return vid_freqs, txt_freqs
 
+    @functools.lru_cache(maxsize=None)
+    def _compute_video_freqs(self, frame, height, width):
+        seq_lens = frame * height * width
+        freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+        freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+
+        freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+        if self.scale_rope:
+            freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
+            freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
+            freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
+            freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
+        else:
+            freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
+            freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
+
+        freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
+        return freqs.clone().contiguous()
+
 
 class QwenDoubleStreamAttnProcessor2_0:
     """
@@ -482,6 +488,7 @@ class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
     _supports_gradient_checkpointing = True
     _no_split_modules = ["QwenImageTransformerBlock"]
     _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+    _repeated_blocks = ["QwenImageTransformerBlock"]
 
     @register_to_config
    def __init__(
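
The RoPE refactor replaces manual `.to(device)` bookkeeping with non-persistent buffers, so `pos_freqs`/`neg_freqs` follow the module's device automatically without entering the checkpoint, and it skips the Python dict cache while `torch.compile` is tracing, recomputing the frequencies instead so the compiled graph carries no cache mutation. A minimal standalone sketch of that pattern is below; the module and shapes are invented for illustration and are not part of the diffusers API.

```py
import torch
import torch.nn as nn


class CachedFreqs(nn.Module):
    """Toy module illustrating the buffer + compile-aware caching pattern above."""

    def __init__(self, dim: int = 16):
        super().__init__()
        # Non-persistent buffers move with .to(device) but are not written to the state dict.
        self.register_buffer("freqs", torch.randn(1024, dim), persistent=False)
        self.cache = {}

    def _compute(self, length: int) -> torch.Tensor:
        # Derived tensor that depends only on `length`.
        return self.freqs[:length].clone().contiguous()

    def forward(self, length: int) -> torch.Tensor:
        if not torch.compiler.is_compiling():
            # Eager mode: memoize per length in a plain dict.
            if length not in self.cache:
                self.cache[length] = self._compute(length)
            return self.cache[length]
        # Under torch.compile, recompute instead of mutating Python state.
        return self._compute(length)


m = CachedFreqs()
assert m(64) is m(64)  # second call is served from the dict cache
```

In the diff itself, `_compute_video_freqs` is additionally wrapped in `functools.lru_cache(maxsize=None)`, which memoizes per `(self, frame, height, width)` tuple.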

src/diffusers/modular_pipelines/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -25,7 +25,6 @@
     _import_structure["modular_pipeline"] = [
         "ModularPipelineBlocks",
         "ModularPipeline",
-        "PipelineBlock",
         "AutoPipelineBlocks",
         "SequentialPipelineBlocks",
         "LoopSequentialPipelineBlocks",
@@ -59,7 +58,6 @@
         LoopSequentialPipelineBlocks,
         ModularPipeline,
         ModularPipelineBlocks,
-        PipelineBlock,
         PipelineState,
         SequentialPipelineBlocks,
     )
