
Commit 0590444

Fix/qwen image (#197)
* several fixes for qwen image
* fix gelu
* fix batch_cfg with padding
1 parent 4ae8f2c commit 0590444

7 files changed: 134 additions & 35 deletions

diffsynth_engine/configs/pipeline.py

Lines changed: 2 additions & 7 deletions
@@ -262,16 +262,11 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
     encoder_dtype: torch.dtype = torch.bfloat16
     vae_dtype: torch.dtype = torch.float32

+    load_encoder: bool = True
+
     # override OptimizationConfig
     fbcache_relative_l1_threshold = 0.009

-    # override BaseConfig
-    vae_tiled: bool = True
-    vae_tile_size: Tuple[int, int] = (34, 34)
-    vae_tile_stride: Tuple[int, int] = (18, 16)
-
-    load_encoder: bool = True
-
     @classmethod
     def basic_config(
         cls,

diffsynth_engine/models/basic/transformer_helper.py

Lines changed: 36 additions & 2 deletions
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import math


@@ -91,8 +92,8 @@ class NewGELUActivation(nn.Module):
     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """

-    def forward(self, input: "torch.Tensor") -> "torch.Tensor":
-        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


 class ApproximateGELU(nn.Module):
@@ -115,3 +116,36 @@ def __init__(
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         return x * torch.sigmoid(1.702 * x)
+
+
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        approximate: str = "none",
+        bias: bool = True,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.float16,
+    ):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias, device=device, dtype=dtype)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        return F.gelu(gate, approximate=self.approximate)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        x = self.gelu(x)
+        return x

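Note on the activation change: the DiT feed-forward previously used ApproximateGELU (the sigmoid form x * torch.sigmoid(1.702 * x)) and now uses GELU(..., approximate="tanh"), which matches the tanh formula in NewGELUActivation above. A minimal numerical check (the input range is illustrative):

import math
import torch
import torch.nn.functional as F

x = torch.linspace(-4.0, 4.0, steps=257)

# Formula from NewGELUActivation, which is also what F.gelu(..., approximate="tanh") computes
tanh_gelu = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
print(torch.allclose(tanh_gelu, F.gelu(x, approximate="tanh"), atol=1e-6))  # True

# The old sigmoid approximation differs from the tanh form by roughly 1e-2 at its peak,
# so the two activations are not numerically interchangeable.
print((x * torch.sigmoid(1.702 * x) - F.gelu(x, approximate="tanh")).abs().max())
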
diffsynth_engine/models/qwen_image/qwen_image_dit.py

Lines changed: 13 additions & 17 deletions
@@ -6,7 +6,7 @@
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
 from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
-from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, ApproximateGELU, RMSNorm
+from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, GELU, RMSNorm
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.parallel import (
@@ -144,7 +144,7 @@ def __init__(
         super().__init__()
         inner_dim = int(dim * 4)
         self.net = nn.ModuleList([])
-        self.net.append(ApproximateGELU(dim, inner_dim, device=device, dtype=dtype))
+        self.net.append(GELU(dim, inner_dim, approximate="tanh", device=device, dtype=dtype))
         self.net.append(nn.Dropout(dropout))
         self.net.append(nn.Linear(inner_dim, dim_out, device=device, dtype=dtype))

@@ -155,8 +155,8 @@ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:


 def apply_rotary_emb_qwen(x: torch.Tensor, freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]]):
-    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-    x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # (b, s, h, d) -> (b, s, h, d/2, 2)
+    x_out = torch.view_as_real(x_rotated * freqs_cis.unsqueeze(1)).flatten(3)  # (b, s, h, d/2, 2) -> (b, s, h, d)
     return x_out.type_as(x)


@@ -200,13 +200,13 @@ def forward(
         img_q, img_k, img_v = self.to_q(image), self.to_k(image), self.to_v(image)
         txt_q, txt_k, txt_v = self.add_q_proj(text), self.add_k_proj(text), self.add_v_proj(text)

-        img_q = rearrange(img_q, "b s (h d) -> b h s d", h=self.num_heads)
-        img_k = rearrange(img_k, "b s (h d) -> b h s d", h=self.num_heads)
-        img_v = rearrange(img_v, "b s (h d) -> b h s d", h=self.num_heads)
+        img_q = rearrange(img_q, "b s (h d) -> b s h d", h=self.num_heads)
+        img_k = rearrange(img_k, "b s (h d) -> b s h d", h=self.num_heads)
+        img_v = rearrange(img_v, "b s (h d) -> b s h d", h=self.num_heads)

-        txt_q = rearrange(txt_q, "b s (h d) -> b h s d", h=self.num_heads)
-        txt_k = rearrange(txt_k, "b s (h d) -> b h s d", h=self.num_heads)
-        txt_v = rearrange(txt_v, "b s (h d) -> b h s d", h=self.num_heads)
+        txt_q = rearrange(txt_q, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_k = rearrange(txt_k, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_v = rearrange(txt_v, "b s (h d) -> b s h d", h=self.num_heads)

         img_q, img_k = self.norm_q(img_q), self.norm_k(img_k)
         txt_q, txt_k = self.norm_added_q(txt_q), self.norm_added_k(txt_k)
@@ -218,13 +218,9 @@ def forward(
         txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs)
         txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs)

-        joint_q = torch.cat([txt_q, img_q], dim=2)
-        joint_k = torch.cat([txt_k, img_k], dim=2)
-        joint_v = torch.cat([txt_v, img_v], dim=2)
-
-        joint_q = joint_q.transpose(1, 2)
-        joint_k = joint_k.transpose(1, 2)
-        joint_v = joint_v.transpose(1, 2)
+        joint_q = torch.cat([txt_q, img_q], dim=1)
+        joint_k = torch.cat([txt_k, img_k], dim=1)
+        joint_v = torch.cat([txt_v, img_v], dim=1)

         attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
         joint_attn_out = attention_ops.attention(joint_q, joint_k, joint_v, attn_mask=attn_mask, **attn_kwargs)

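With this change the q/k/v tensors stay in (b, s, h, d) layout end to end: the joint text+image concatenation happens along dim=1 and the extra transpose is gone, while apply_rotary_emb_qwen broadcasts freqs_cis over the head dimension via unsqueeze(1). A shape check of the updated helper, assuming freqs_cis is a complex tensor of shape (s, d/2) (its exact construction is not part of this diff):

import torch

def apply_rotary_emb_qwen(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    # Mirrors the updated helper: x is (b, s, h, d); freqs_cis broadcasts over heads.
    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # (b, s, h, d/2) complex
    x_out = torch.view_as_real(x_rotated * freqs_cis.unsqueeze(1)).flatten(3)   # back to (b, s, h, d)
    return x_out.type_as(x)

b, s, h, d = 2, 16, 4, 8
x = torch.randn(b, s, h, d)
# Assumed rotary table: one complex rotation per (position, channel pair).
angles = torch.outer(torch.arange(s, dtype=torch.float32), torch.arange(d // 2, dtype=torch.float32))
freqs_cis = torch.polar(torch.ones_like(angles), angles)  # (s, d/2), complex64
print(apply_rotary_emb_qwen(x, freqs_cis).shape)  # torch.Size([2, 16, 4, 8])
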
diffsynth_engine/pipelines/qwen_image.py

Lines changed: 13 additions & 5 deletions
@@ -24,7 +24,7 @@
 from diffsynth_engine.models.qwen_image import QwenImageVAE
 from diffsynth_engine.tokenizers import Qwen2TokenizerFast, Qwen2VLProcessor
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
-from diffsynth_engine.pipelines.utils import calculate_shift
+from diffsynth_engine.pipelines.utils import calculate_shift, pad_and_concat
 from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
 from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
 from diffsynth_engine.utils.constants import (
@@ -148,9 +148,17 @@ def __init__(
         self.prompt_template_encode_start_idx = 34
         # qwen image edit
         self.edit_system_prompt = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
-        self.edit_prompt_template_encode = "<|im_start|>system\n" + self.edit_system_prompt + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        )
         # qwen image edit plus
-        self.edit_plus_prompt_template_encode = "<|im_start|>system\n" + self.edit_system_prompt + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_plus_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        )

         self.edit_prompt_template_encode_start_idx = 64

@@ -490,8 +498,8 @@ def predict_noise_with_cfg(
         else:
             # cfg by predict noise in one batch
             bs, _, h, w = latents.shape
-            prompt_emb = torch.cat([prompt_emb, negative_prompt_emb], dim=0)
-            prompt_emb_mask = torch.cat([prompt_emb_mask, negative_prompt_emb_mask], dim=0)
+            prompt_emb = pad_and_concat(prompt_emb, negative_prompt_emb)
+            prompt_emb_mask = pad_and_concat(prompt_emb_mask, negative_prompt_emb_mask)
             if entity_prompt_embs is not None:
                 entity_prompt_embs = [
                     torch.cat([x, y], dim=0) for x, y in zip(entity_prompt_embs, negative_entity_prompt_embs)

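Why pad_and_concat: the one-batch CFG path stacks the positive and negative prompt embeddings along the batch dimension, but the two prompts usually tokenize to different sequence lengths, so the previous torch.cat call can fail. A small illustration with made-up shapes:

import torch

prompt_emb = torch.randn(1, 20, 3584)           # positive prompt embedding (lengths/dims illustrative)
negative_prompt_emb = torch.randn(1, 7, 3584)   # negative prompt embedding, shorter sequence

try:
    torch.cat([prompt_emb, negative_prompt_emb], dim=0)  # old behaviour
except RuntimeError as err:
    print("torch.cat rejects mismatched sequence lengths:", err)

pad_and_concat (added in diffsynth_engine/pipelines/utils.py below) zero-pads the shorter tensor along the sequence dimension first; padding prompt_emb_mask the same way keeps the padded positions masked out.
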
diffsynth_engine/pipelines/utils.py

Lines changed: 52 additions & 0 deletions
@@ -1,3 +1,7 @@
+import torch
+import torch.nn.functional as F
+
+
 def accumulate(result, new_item):
     if result is None:
         return new_item
@@ -17,3 +21,51 @@ def calculate_shift(
     b = base_shift - m * base_seq_len
     mu = image_seq_len * m + b
     return mu
+
+
+def pad_and_concat(
+    tensor1: torch.Tensor,
+    tensor2: torch.Tensor,
+    concat_dim: int = 0,
+    pad_dim: int = 1,
+) -> torch.Tensor:
+    """
+    Concatenate two tensors along a specified dimension after padding along another dimension.
+
+    Assumes input tensors have shape (b, s, d), where:
+    - b: batch dimension
+    - s: sequence dimension (may differ)
+    - d: feature dimension
+
+    Args:
+        tensor1: First tensor with shape (b1, s1, d)
+        tensor2: Second tensor with shape (b2, s2, d)
+        concat_dim: Dimension to concatenate along, default is 0 (batch dimension)
+        pad_dim: Dimension to pad along, default is 1 (sequence dimension)
+
+    Returns:
+        Concatenated tensor, shape depends on concat_dim and pad_dim choices
+    """
+    assert tensor1.dim() == tensor2.dim(), "Both tensors must have the same number of dimensions"
+    assert concat_dim != pad_dim, "concat_dim and pad_dim cannot be the same"
+
+    len1, len2 = tensor1.shape[pad_dim], tensor2.shape[pad_dim]
+    max_len = max(len1, len2)
+
+    # Calculate the position of pad_dim in the padding list
+    # Padding format: from the last dimension, each pair represents (dim_n_left, dim_n_right, ..., dim_0_left, dim_0_right)
+    ndim = tensor1.dim()
+    padding = [0] * (2 * ndim)
+    pad_right_idx = -2 * pad_dim - 1
+
+    if len1 < max_len:
+        pad_len = max_len - len1
+        padding[pad_right_idx] = pad_len
+        tensor1 = F.pad(tensor1, padding, mode="constant", value=0)
+    elif len2 < max_len:
+        pad_len = max_len - len2
+        padding[pad_right_idx] = pad_len
+        tensor2 = F.pad(tensor2, padding, mode="constant", value=0)
+
+    # Concatenate along the specified dimension
+    return torch.cat([tensor1, tensor2], dim=concat_dim)

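A usage sketch of the new helper, using the same import path as qwen_image.py above (shapes are illustrative):

import torch
from diffsynth_engine.pipelines.utils import pad_and_concat

pos = torch.randn(1, 20, 3584)  # (b, s1, d)
neg = torch.randn(1, 7, 3584)   # (b, s2, d), shorter sequence

out = pad_and_concat(pos, neg)        # neg is right-padded with zeros along dim 1, then concatenated on dim 0
print(out.shape)                      # torch.Size([2, 20, 3584])
print(out[1, 7:].abs().sum().item())  # 0.0 -> the padded region is all zeros
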
diffsynth_engine/tokenizers/base.py

Lines changed: 6 additions & 0 deletions
@@ -1,10 +1,16 @@
 # Modified from transformers.tokenization_utils_base
 from typing import Dict, List, Union, overload
+from enum import Enum


 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"


+class PaddingStrategy(str, Enum):
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+
+
 class BaseTokenizer:
     SPECIAL_TOKENS_ATTRIBUTES = [
         "bos_token",

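Because PaddingStrategy subclasses str, a plain string such as "longest" compares equal to the corresponding enum member, so call sites can pass either form (as qwen2.py below does). A quick check:

from enum import Enum

class PaddingStrategy(str, Enum):  # same definition as above
    LONGEST = "longest"
    MAX_LENGTH = "max_length"

print(PaddingStrategy.LONGEST == "longest")                          # True: str subclass compares equal to raw strings
print(PaddingStrategy("max_length") is PaddingStrategy.MAX_LENGTH)   # True: members can be looked up by value
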
diffsynth_engine/tokenizers/qwen2.py

Lines changed: 12 additions & 4 deletions
@@ -4,7 +4,7 @@
 from typing import Dict, List, Union, Optional
 from tokenizers import Tokenizer as TokenizerFast, AddedToken

-from diffsynth_engine.tokenizers.base import BaseTokenizer, TOKENIZER_CONFIG_FILE
+from diffsynth_engine.tokenizers.base import BaseTokenizer, PaddingStrategy, TOKENIZER_CONFIG_FILE


 VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
@@ -165,22 +165,28 @@ def __call__(
         texts: Union[str, List[str]],
         max_length: Optional[int] = None,
         padding_side: Optional[str] = None,
+        padding_strategy: Union[PaddingStrategy, str] = "longest",
         **kwargs,
     ) -> Dict[str, "torch.Tensor"]:
         """
         Tokenize text and prepare for model inputs.

         Args:
-            text (`str`, `List[str]`, *optional*):
+            texts (`str`, `List[str]`):
                 The sequence or batch of sequences to be encoded.

             max_length (`int`, *optional*):
-                Each encoded sequence will be truncated or padded to max_length.
+                Maximum length of the encoded sequences.

             padding_side (`str`, *optional*):
                 The side on which the padding should be applied. Should be selected between `"right"` and `"left"`.
                 Defaults to `"right"`.

+            padding_strategy (`PaddingStrategy`, `str`, *optional*):
+                If `"longest"`, will pad the sequences to the longest sequence in the batch.
+                If `"max_length"`, will pad the sequences to the `max_length` argument.
+                Defaults to `"longest"`.
+
         Returns:
             `Dict[str, "torch.Tensor"]`: tensor dict compatible with model_input_names.
         """
@@ -190,7 +196,9 @@ def __call__(

         batch_ids = self.batch_encode(texts)
         ids_lens = [len(ids_) for ids_ in batch_ids]
-        max_length = max_length if max_length is not None else min(max(ids_lens), self.model_max_length)
+        max_length = max_length if max_length is not None else self.model_max_length
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = min(max(ids_lens), max_length)
         padding_side = padding_side if padding_side is not None else self.padding_side

         encoded = torch.zeros(len(texts), max_length, dtype=torch.long)

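Effect of the new padding_strategy argument: with the default "longest" the batch is padded to its longest sequence (capped by max_length or model_max_length), while "max_length" pads every sequence to the requested length. A standalone sketch of that length resolution (token counts and model_max_length are invented for illustration):

# Mirrors the updated length-resolution logic in the tokenizer's __call__ above.
ids_lens = [9, 23, 17]   # token counts per sequence in the batch (illustrative)
model_max_length = 1024  # illustrative

def resolve_pad_length(max_length=None, padding_strategy="longest"):
    max_length = max_length if max_length is not None else model_max_length
    if padding_strategy == "longest":
        max_length = min(max(ids_lens), max_length)
    return max_length

print(resolve_pad_length())                                              # 23 -> pad to the longest sequence
print(resolve_pad_length(max_length=64, padding_strategy="max_length"))  # 64 -> fixed-length padding
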