Support sequence parallelism and custom image size for Qwen Image (#186)

akaitsuki-ii · web-flow · commit d0763b2e86ca · 2025-10-22T18:03:58.000+08:00
* support Qwen Image sequence parallelism

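* use the custom image size (height/width) when provided; otherwise fall back to the input image size, or 1328x1328 when there is no input image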
diff --git a/diffsynth_engine/models/qwen_image/qwen_image_dit.py b/diffsynth_engine/models/qwen_image/qwen_image_dit.py
@@ -9,7 +9,12 @@
 from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, ApproximateGELU, RMSNorm
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
-from diffsynth_engine.utils.parallel import cfg_parallel, cfg_parallel_unshard
+from diffsynth_engine.utils.parallel import (
+    cfg_parallel,
+    cfg_parallel_unshard,
+    sequence_parallel,
+    sequence_parallel_unshard,
+)
 
 
 class QwenImageDiTStateDictConverter(StateDictConverter):
@@ -498,14 +503,18 @@ def forward(
                     image.dtype,
                 )
 
-            for block in self.transformer_blocks:
-                text, image = block(
-                    image=image, text=text, temb=conditioning, rotary_emb=rotary_emb, attn_mask=attn_mask
-                )
-            image = self.norm_out(image, conditioning)
-            image = self.proj_out(image)
+            # warning: Eligen does not work with sequence parallel because long context attention does not support attention masks
+            img_freqs, txt_freqs = rotary_emb
+            with sequence_parallel((image, text, img_freqs, txt_freqs), seq_dims=(1, 1, 0, 0)):
+                rotary_emb = (img_freqs, txt_freqs)
+                for block in self.transformer_blocks:
+                    text, image = block(
+                        image=image, text=text, temb=conditioning, rotary_emb=rotary_emb, attn_mask=attn_mask
+                    )
+                image = self.norm_out(image, conditioning)
+                image = self.proj_out(image)
+                (image,) = sequence_parallel_unshard((image,), seq_dims=(1,), seq_lens=(image_seq_len,))
             image = image[:, :image_seq_len]
-
             image = self.unpatchify(image, h, w)
 
         (image,) = cfg_parallel_unshard((image,), use_cfg=use_cfg)
diff --git a/diffsynth_engine/pipelines/qwen_image.py b/diffsynth_engine/pipelines/qwen_image.py
@@ -561,8 +561,8 @@ def __call__(
         # single image for edit, list for edit plus(QwenImageEdit2509)
         input_image: List[Image.Image] | Image.Image | None = None,
         cfg_scale: float = 4.0,  # true cfg
-        height: int = 1328,
-        width: int = 1328,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         seed: int | None = None,
         controlnet_params: List[QwenImageControlNetParams] | QwenImageControlNetParams = [],
@@ -571,7 +571,9 @@ def __call__(
         entity_prompts: Optional[List[str]] = None,
         entity_masks: Optional[List[Image.Image]] = None,
     ):
+        assert (height is None) == (width is None), "height and width should be set together"
         is_edit_plus = isinstance(input_image, list)
+
         if input_image is not None:
             if not isinstance(input_image, list):
                 input_image = [input_image]
@@ -583,9 +585,11 @@ def __call__(
                 vae_width, vae_height = self.calculate_dimensions(1024 * 1024, img_width / img_height)
                 condition_images.append(img.resize((condition_width, condition_height), Image.LANCZOS))
                 vae_images.append(img.resize((vae_width, vae_height), Image.LANCZOS))
+            if width is None and height is None:
+                width, height = vae_images[-1].size
 
-            width, height = vae_images[-1].size
-
+        if width is None and height is None:
+            width, height = 1328, 1328
         self.validate_image_size(height, width, minimum=64, multiple_of=16)
 
         if not isinstance(controlnet_params, list):