
Commit 1c4be59

Merge branch 'main' into qwen
2 parents: 4ead0a5 + 4d9b822

21 files changed: +1502 −390 lines

docs/source/en/_toctree.yml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
   - local: installation
     title: Installation
   - local: quicktour
-    title: Quicktour
+    title: Quickstart
   - local: stable_diffusion
     title: Basic performance

docs/source/en/api/pipelines/qwenimage.md

Lines changed: 16 additions & 5 deletions
@@ -16,7 +16,12 @@

 Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.

-Check out the model card [here](https://huggingface.co/Qwen/Qwen-Image) to learn more.
+Qwen-Image comes in the following variants:
+
+| model type | model id |
+|:----------:|:--------:|
+| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
+| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |

 <Tip>

@@ -87,10 +92,6 @@ image.save("qwen_fewsteps.png")
 - all
 - __call__

-## QwenImagePipelineOutput
-
-[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
-
 ## QwenImageImg2ImgPipeline

 [[autodoc]] QwenImageImg2ImgPipeline

@@ -102,3 +103,13 @@ image.save("qwen_fewsteps.png")
 [[autodoc]] QwenImageInpaintPipeline
 - all
 - __call__
+
+## QwenImageEditPipeline
+
+[[autodoc]] QwenImageEditPipeline
+- all
+- __call__
+
+## QwenImagePipelineOutput
+
+[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
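The newly documented `QwenImageEditPipeline` pairs a source image with an edit instruction. A minimal usage sketch, assuming the edit pipeline follows the usual diffusers image-plus-prompt calling convention (argument names may differ from the final API):

```py
import torch
from PIL import Image
from diffusers import QwenImageEditPipeline

# Load the new edit variant in half precision on the GPU.
pipeline = QwenImageEditPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16
).to("cuda")

# Edit an existing image according to a text instruction.
source = Image.open("input.png").convert("RGB")
edited = pipeline(
    image=source,
    prompt="Replace the sign text with 'Hello World'",
    num_inference_steps=50,
).images[0]
edited.save("qwen_edit.png")
```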

docs/source/en/quicktour.md

Lines changed: 155 additions & 249 deletions
Large diffs are not rendered by default.

docs/source/en/stable_diffusion.md

Lines changed: 15 additions & 9 deletions
@@ -22,14 +22,17 @@ This guide recommends some basic performance tips for using the [`DiffusionPipel

 Reducing the amount of memory used indirectly speeds up generation and can help a model fit on device.

+The [`~DiffusionPipeline.enable_model_cpu_offload`] method moves a model to the CPU when it is not in use to save GPU memory.
+
 ```py
 import torch
 from diffusers import DiffusionPipeline

 pipeline = DiffusionPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.bfloat16
-).to("cuda")
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
+)
 pipeline.enable_model_cpu_offload()

 prompt = """

@@ -44,7 +47,7 @@ print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} G

 Denoising is the most computationally demanding process during diffusion. Methods that optimize this process accelerate inference. Try the following methods for a speed up.

-- Add `.to("cuda")` to place the pipeline on a GPU. Placing a model on an accelerator, like a GPU, increases speed because it performs computations in parallel.
+- Add `device_map="cuda"` to place the pipeline on a GPU. Placing a model on an accelerator, like a GPU, increases speed because it performs computations in parallel.
 - Set `torch_dtype=torch.bfloat16` to execute the pipeline in half-precision. Reducing the data type precision increases speed because it takes less time to perform computations in a lower precision.

 ```py
@@ -54,8 +57,9 @@ from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

 pipeline = DiffusionPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.bfloat16
-).to("cuda")
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
+)
 ```

 - Use a faster scheduler, such as [`DPMSolverMultistepScheduler`], which only requires ~20-25 steps.

@@ -88,8 +92,9 @@ Many modern diffusion models deliver high-quality images out-of-the-box. However

 pipeline = DiffusionPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.bfloat16
-).to("cuda")
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
+)

 prompt = """
 cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California

@@ -109,8 +114,9 @@ Many modern diffusion models deliver high-quality images out-of-the-box. However

 pipeline = DiffusionPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.bfloat16
-).to("cuda")
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
+)
 pipeline.scheduler = HeunDiscreteScheduler.from_config(pipeline.scheduler.config)

 prompt = """

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -489,6 +489,7 @@
         "PixArtAlphaPipeline",
         "PixArtSigmaPAGPipeline",
         "PixArtSigmaPipeline",
+        "QwenImageEditPipeline",
         "QwenImageImg2ImgPipeline",
         "QwenImageInpaintPipeline",
         "QwenImagePipeline",
@@ -1123,6 +1124,7 @@
         PixArtAlphaPipeline,
         PixArtSigmaPAGPipeline,
         PixArtSigmaPipeline,
+        QwenImageEditPipeline,
         QwenImageImg2ImgPipeline,
         QwenImageInpaintPipeline,
         QwenImagePipeline,

src/diffusers/models/model_loading_utils.py

Lines changed: 11 additions & 6 deletions
@@ -17,7 +17,6 @@
 import functools
 import importlib
 import inspect
-import math
 import os
 from array import array
 from collections import OrderedDict, defaultdict

@@ -717,27 +716,33 @@ def _expand_device_map(device_map, param_names):


 # Adapted from: https://github.com/huggingface/transformers/blob/0687d481e2c71544501ef9cb3eef795a6e79b1de/src/transformers/modeling_utils.py#L5859
-def _caching_allocator_warmup(model, expanded_device_map: Dict[str, torch.device], dtype: torch.dtype) -> None:
+def _caching_allocator_warmup(
+    model, expanded_device_map: Dict[str, torch.device], dtype: torch.dtype, hf_quantizer: Optional[DiffusersQuantizer]
+) -> None:
     """
     This function warms up the caching allocator based on the size of the model tensors that will reside on each
     device. It allows one large call to malloc, instead of recursively calling it later when loading the model,
     which is actually the loading speed bottleneck. Calling this function can cut the model loading time by a
     very large margin.
     """
+    factor = 2 if hf_quantizer is None else hf_quantizer.get_cuda_warm_up_factor()
     # Remove disk and cpu devices, and cast to proper torch.device
     accelerator_device_map = {
         param: torch.device(device)
         for param, device in expanded_device_map.items()
         if str(device) not in ["cpu", "disk"]
     }
-    parameter_count = defaultdict(lambda: 0)
+    total_byte_count = defaultdict(lambda: 0)
     for param_name, device in accelerator_device_map.items():
         try:
             param = model.get_parameter(param_name)
         except AttributeError:
             param = model.get_buffer(param_name)
-        parameter_count[device] += math.prod(param.shape)
+        # The dtype of different parameters may differ with composite models or `keep_in_fp32_modules`
+        param_byte_count = param.numel() * param.element_size()
+        # TODO: account for TP when needed.
+        total_byte_count[device] += param_byte_count

     # This will kick off the caching allocator to avoid having to Malloc afterwards
-    for device, param_count in parameter_count.items():
-        _ = torch.empty(param_count, dtype=dtype, device=device, requires_grad=False)
+    for device, byte_count in total_byte_count.items():
+        _ = torch.empty(byte_count // factor, dtype=dtype, device=device, requires_grad=False)
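The switch from element counts to byte counts matters because a composite model can hold parameters in several dtypes (for example via `keep_in_fp32_modules`), so a single `parameter_count * dtype_size` estimate would be wrong. A standalone sketch of the idea, not the library function itself (it walks `model.parameters()` instead of an expanded device map):

```py
import torch
from collections import defaultdict

def warmup_allocator(model: torch.nn.Module, dtype=torch.bfloat16, factor: int = 2) -> None:
    # Sum the actual byte footprint per accelerator device. Counting bytes
    # (numel * element_size) stays correct even when parameters mix dtypes.
    total_bytes = defaultdict(int)
    for param in model.parameters():
        if param.device.type != "cpu":
            total_bytes[param.device] += param.numel() * param.element_size()

    # One large allocation per device primes the caching allocator, so the
    # many small allocations made while loading weights can reuse this pool
    # instead of each triggering a separate malloc.
    for device, byte_count in total_bytes.items():
        _ = torch.empty(byte_count // factor, dtype=dtype, device=device, requires_grad=False)
```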

src/diffusers/models/modeling_utils.py

Lines changed: 2 additions & 3 deletions
@@ -1532,10 +1532,9 @@ def _load_pretrained_model(
         # tensors using their expected shape and not performing any initialization of the memory (empty data).
         # When the actual device allocations happen, the allocator already has a pool of unused device memory
         # that it can re-use for faster loading of the model.
-        # TODO: add support for warmup with hf_quantizer
-        if device_map is not None and hf_quantizer is None:
+        if device_map is not None:
             expanded_device_map = _expand_device_map(device_map, expected_keys)
-            _caching_allocator_warmup(model, expanded_device_map, dtype)
+            _caching_allocator_warmup(model, expanded_device_map, dtype, hf_quantizer)

         offload_index = {} if device_map is not None and "disk" in device_map.values() else None
         state_dict_folder, state_dict_index = None, None

src/diffusers/models/transformers/transformer_cogview4.py

Lines changed: 34 additions & 2 deletions
@@ -28,7 +28,7 @@
 from ..embeddings import CogView3CombinedTimestepSizeEmbeddings
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous
+from ..normalization import LayerNorm, RMSNorm


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@@ -584,6 +584,38 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens
         return (freqs.cos(), freqs.sin())


+class CogView4AdaLayerNormContinuous(nn.Module):
+    """
+    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
+    Linear on the conditioning embedding.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        bias: bool = True,
+        norm_type: str = "layer_norm",
+    ):
+        super().__init__()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # *** NO SiLU here ***
+        emb = self.linear(conditioning_embedding.to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
 class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
     r"""
     Args:

@@ -666,7 +698,7 @@ def __init__(
         )

         # 4. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
+        self.norm_out = CogView4AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)

         self.gradient_checkpointing = False
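The motivation for the new class is the missing non-linearity: diffusers' generic `AdaLayerNormContinuous` runs the conditioning embedding through SiLU before the projection, which does not match the Megatron-trained CogView4 checkpoint. A self-contained sketch of the difference (shapes and the scale/shift order follow the new class; this is illustrative, not the module itself):

```py
import torch
import torch.nn as nn
import torch.nn.functional as F

dim, cond_dim, batch, seq = 8, 16, 2, 4
x = torch.randn(batch, seq, dim)
cond = torch.randn(batch, cond_dim)

linear = nn.Linear(cond_dim, dim * 2)
norm = nn.LayerNorm(dim, elementwise_affine=False)

# Generic AdaLayerNormContinuous: activation before the projection.
scale_g, shift_g = linear(F.silu(cond)).chunk(2, dim=1)

# CogView4 variant: no activation, so weights converted from the Megatron
# checkpoint reproduce the original outputs.
scale_c, shift_c = linear(cond).chunk(2, dim=1)

out = norm(x) * (1 + scale_c)[:, None, :] + shift_c[:, None, :]
print(out.shape)  # torch.Size([2, 4, 8])
```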

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 33 additions & 23 deletions
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 import functools
 import math
 from typing import Any, Dict, List, Optional, Tuple, Union

@@ -161,17 +160,17 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
         super().__init__()
         self.theta = theta
         self.axes_dim = axes_dim
-        pos_index = torch.arange(1024)
-        neg_index = torch.arange(1024).flip(0) * -1 - 1
-        pos_freqs = torch.cat(
+        pos_index = torch.arange(4096)
+        neg_index = torch.arange(4096).flip(0) * -1 - 1
+        self.pos_freqs = torch.cat(
             [
                 self.rope_params(pos_index, self.axes_dim[0], self.theta),
                 self.rope_params(pos_index, self.axes_dim[1], self.theta),
                 self.rope_params(pos_index, self.axes_dim[2], self.theta),
             ],
             dim=1,
         )
-        neg_freqs = torch.cat(
+        self.neg_freqs = torch.cat(
             [
                 self.rope_params(neg_index, self.axes_dim[0], self.theta),
                 self.rope_params(neg_index, self.axes_dim[1], self.theta),

@@ -180,10 +179,8 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
             dim=1,
         )
         self.rope_cache = {}
-        self.register_buffer("pos_freqs", pos_freqs, persistent=False)
-        self.register_buffer("neg_freqs", neg_freqs, persistent=False)

-        # whether to use scale rope
+        # DO NOT use register_buffer here; it would cause the complex numbers to lose their imaginary part
         self.scale_rope = scale_rope

     def rope_params(self, index, dim, theta=10000):

@@ -201,35 +198,48 @@ def forward(self, video_fhw, txt_seq_lens, device):
         Args: video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video Args:
         txt_length: [bs] a list of 1 integers representing the length of the text
         """
+        if self.pos_freqs.device != device:
+            self.pos_freqs = self.pos_freqs.to(device)
+            self.neg_freqs = self.neg_freqs.to(device)
+
         if isinstance(video_fhw, list):
             video_fhw = video_fhw[0]
-        frame, height, width = video_fhw
-        rope_key = f"{frame}_{height}_{width}"
-
-        if not torch.compiler.is_compiling():
-            if rope_key not in self.rope_cache:
-                self.rope_cache[rope_key] = self._compute_video_freqs(frame, height, width)
-            vid_freqs = self.rope_cache[rope_key]
-        else:
-            vid_freqs = self._compute_video_freqs(frame, height, width)
+        if not isinstance(video_fhw, list):
+            video_fhw = [video_fhw]
+
+        vid_freqs = []
+        max_vid_index = 0
+        for idx, fhw in enumerate(video_fhw):
+            frame, height, width = fhw
+            rope_key = f"{idx}_{height}_{width}"
+
+            if not torch.compiler.is_compiling():
+                if rope_key not in self.rope_cache:
+                    self.rope_cache[rope_key] = self._compute_video_freqs(frame, height, width, idx)
+                video_freq = self.rope_cache[rope_key]
+            else:
+                video_freq = self._compute_video_freqs(frame, height, width, idx)
+            video_freq = video_freq.to(device)
+            vid_freqs.append(video_freq)

-        if self.scale_rope:
-            max_vid_index = max(height // 2, width // 2)
-        else:
-            max_vid_index = max(height, width)
+            if self.scale_rope:
+                max_vid_index = max(height // 2, width // 2, max_vid_index)
+            else:
+                max_vid_index = max(height, width, max_vid_index)

         max_len = max(txt_seq_lens)
         txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...]
+        vid_freqs = torch.cat(vid_freqs, dim=0)

         return vid_freqs, txt_freqs

     @functools.lru_cache(maxsize=None)
-    def _compute_video_freqs(self, frame, height, width):
+    def _compute_video_freqs(self, frame, height, width, idx=0):
         seq_lens = frame * height * width
         freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
         freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)

-        freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+        freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
         if self.scale_rope:
             freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
             freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
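The comment about `register_buffer` is terse, but the failure mode is real: buffers follow module-wide dtype casts, and casting a complex tensor to a real floating dtype discards its imaginary part, which would corrupt the RoPE frequencies when the transformer is loaded in bf16/fp16. A small illustration of the behavior (not library code):

```py
import torch
import torch.nn as nn

class Rope(nn.Module):
    def __init__(self):
        super().__init__()
        freqs = torch.polar(torch.ones(4), torch.arange(4.0))  # complex64 frequencies
        self.register_buffer("freqs_buffer", freqs, persistent=False)  # follows .to(dtype)
        self.freqs_attr = freqs  # plain attribute, untouched by module casts

module = Rope().to(torch.bfloat16)  # e.g. a pipeline loaded in half precision
print(module.freqs_buffer.dtype)  # real dtype: imaginary part was dropped (with a warning)
print(module.freqs_attr.dtype)    # still torch.complex64
```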

src/diffusers/pipelines/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -391,6 +391,7 @@
         "QwenImagePipeline",
         "QwenImageImg2ImgPipeline",
         "QwenImageInpaintPipeline",
+        "QwenImageEditPipeline",
     ]
     try:
         if not is_onnx_available():

@@ -708,7 +709,12 @@
         from .paint_by_example import PaintByExamplePipeline
         from .pia import PIAPipeline
         from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
-        from .qwenimage import QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, QwenImagePipeline
+        from .qwenimage import (
+            QwenImageEditPipeline,
+            QwenImageImg2ImgPipeline,
+            QwenImageInpaintPipeline,
+            QwenImagePipeline,
+        )
         from .sana import SanaControlNetPipeline, SanaPipeline, SanaSprintImg2ImgPipeline, SanaSprintPipeline
         from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
         from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
