
Commit 4215ce4

fix qwen image 2511 cfg parallel && scheduler default param (#214)

add log
fix
remove debug log
fix image resize sample & sigmas for qwen edit 2511
set shift_terminal=0.02
adjust VAE computation order
remove debug code
remove unused code
1 parent f3d1dc4 commit 4215ce4

File tree

4 files changed: +11 −11 lines changed

diffsynth_engine/models/basic/attention.py

Lines changed: 2 additions & 2 deletions

@@ -343,7 +343,7 @@ def long_context_attention(
         f"head_dim={q.shape[-1]}, but aiter_flash_attn only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
     )
     if SDPA_AVAILABLE:
-        return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if FLASH_ATTN_2_AVAILABLE:
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     raise ValueError("No available long context attention implementation")
@@ -379,7 +379,7 @@ def long_context_attention(
     if attn_impl == "fa2":
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     if attn_impl == "sdpa":
-        return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if attn_impl == "sage":
         return LongContextAttention(attn_type=AttnType.SAGE_AUTO)(q, k, v, softmax_scale=scale)
     if attn_impl == "sparge":

diffsynth_engine/models/qwen_image/qwen_image_dit.py

Lines changed: 5 additions & 5 deletions

@@ -286,16 +286,15 @@ def _modulate(self, x, mod_params, index=None):
             shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:]
             scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:]
             gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:]
-            index_expanded = index.unsqueeze(-1)
             shift_0_exp = shift_0.unsqueeze(1)
             shift_1_exp = shift_1.unsqueeze(1)
             scale_0_exp = scale_0.unsqueeze(1)
             scale_1_exp = scale_1.unsqueeze(1)
             gate_0_exp = gate_0.unsqueeze(1)
             gate_1_exp = gate_1.unsqueeze(1)
-            shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp)
-            scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp)
-            gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp)
+            shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
+            scale_result = torch.where(index == 0, scale_0_exp, scale_1_exp)
+            gate_result = torch.where(index == 0, gate_0_exp, gate_1_exp)
         else:
             shift_result = shift.unsqueeze(1)
             scale_result = scale.unsqueeze(1)
@@ -514,6 +513,7 @@ def forward(
             device=timestep.device,
             dtype=torch.int,
         )
+        modulate_index = modulate_index.unsqueeze(-1)
         rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)

         image = self.img_in(image)
@@ -535,7 +535,7 @@ def forward(

         # warning: Eligen does not work with sequence parallel because long context attention does not support attention masks
         img_freqs, txt_freqs = rotary_emb
-        with sequence_parallel((image, text, img_freqs, txt_freqs), seq_dims=(1, 1, 0, 0)):
+        with sequence_parallel((image, text, img_freqs, txt_freqs, modulate_index), seq_dims=(1, 1, 0, 0, 1)):
             rotary_emb = (img_freqs, txt_freqs)
             for block in self.transformer_blocks:
                 text, image = block(
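
These three hunks are the cfg-parallel fix from the commit title. _modulate selects, per token, between two sets of modulation parameters packed along the batch dimension via torch.where; the selection index is now unsqueezed once in forward, before the sequence_parallel context, and is sharded along its own sequence dimension together with the image and text tokens (seq_dims=(1, 1, 0, 0, 1)), so each rank's torch.where mask lines up with its local token shard. A self-contained sketch of the broadcast involved (shapes and names are illustrative, not the repository's):

import torch

batch, seq, dim = 2, 8, 16
shift = torch.randn(2 * batch, dim)                  # two parameter sets packed along the batch
index = torch.zeros(batch, seq, 1, dtype=torch.int)  # (B, S, 1): already unsqueezed, shardable on dim 1
index[:, seq // 2:] = 1                              # pretend the second half of the tokens uses set 1

shift_0, shift_1 = shift[:batch], shift[batch:]
shift_0_exp = shift_0.unsqueeze(1)                   # (B, 1, D), broadcasts over the sequence
shift_1_exp = shift_1.unsqueeze(1)

# A (B, S, 1) condition against (B, 1, D) operands broadcasts to (B, S, D);
# splitting index on dim 1 keeps this consistent under sequence parallelism.
shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
print(shift_result.shape)                            # torch.Size([2, 8, 16])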

diffsynth_engine/models/qwen_image/qwen_image_vae.py

Lines changed: 0 additions & 1 deletion

@@ -685,7 +685,6 @@ def encode(self, x, scale):
         x = patchify(x, patch_size=2 if self.in_channels == 12 else 1)
         t = x.shape[2]
         iter_ = 1 + (t - 1) // 4
-
         for i in range(iter_):
             if i == 0:
                 out = self.encoder(x[:, :, :1, :, :], feat_cache=feat_cache)
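
This hunk only removes a blank line, but for readers tracing encode: the chunking arithmetic in view processes the first latent frame on its own and the remaining frames in groups of four. A quick check of that formula (illustration only, assuming the usual 1 + 4k frame counts for this kind of causal video VAE):

# iter_ = 1 + (t - 1) // 4: one pass for the first frame, then one per group of four frames.
for t, expected in [(1, 1), (5, 2), (9, 3), (13, 4)]:
    assert 1 + (t - 1) // 4 == expected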

diffsynth_engine/pipelines/qwen_image.py

Lines changed: 4 additions & 3 deletions

@@ -165,7 +165,7 @@ def __init__(
         self.edit_prompt_template_encode_start_idx = 64

         # sampler
-        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
+        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True, shift_terminal=0.02)
         self.sampler = FlowMatchEulerSampler()
         # models
         self.tokenizer = tokenizer
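
The scheduler is now constructed with shift_terminal=0.02, matching the "set shift_terminal=0.02" and "fix image resize sample & sigmas for qwen edit 2511" lines of the commit message. RecifitedFlowScheduler's implementation is not part of this diff, but the conventional meaning of a terminal shift in flow-matching schedulers is to stretch the sigma schedule so its last value lands on shift_terminal rather than running to its unshifted endpoint. A minimal sketch of that convention (an assumption about the parameter's semantics, not this scheduler's code):

import torch

def stretch_to_terminal(sigmas: torch.Tensor, shift_terminal: float) -> torch.Tensor:
    # Rescale so the final sigma lands exactly on shift_terminal.
    one_minus = 1.0 - sigmas
    scale = one_minus[-1] / (1.0 - shift_terminal)
    return 1.0 - one_minus / scale

sigmas = torch.linspace(1.0, 0.0, steps=11)
stretched = stretch_to_terminal(sigmas, 0.02)
print(stretched[0].item(), stretched[-1].item())  # ~1.0 and ~0.02: the last sigma no longer reaches 0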
@@ -690,8 +690,9 @@ def __call__(
             img_width, img_height = img.size
             condition_width, condition_height = self.calculate_dimensions(384 * 384, img_width / img_height)
             vae_width, vae_height = self.calculate_dimensions(1024 * 1024, img_width / img_height)
-            condition_images.append(img.resize((condition_width, condition_height), Image.LANCZOS))
-            vae_images.append(img.resize((vae_width, vae_height), Image.LANCZOS))
+            condition_images.append(img.resize((condition_width, condition_height)))
+            vae_images.append(img.resize((vae_width, vae_height)))
+
         if width is None and height is None:
             width, height = vae_images[-1].size
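
Dropping the explicit Image.LANCZOS argument lets both resizes fall back to Pillow's default filter for Image.resize (bicubic for ordinary RGB images in current Pillow releases), which appears to be the "fix image resize" item from the commit message. Illustration only, not pipeline code:

from PIL import Image

img = Image.new("RGB", (1024, 768))
resized_default = img.resize((512, 384))                 # new behaviour: Pillow's default filter
resized_lanczos = img.resize((512, 384), Image.LANCZOS)  # previous behaviour: forced LANCZOS
print(resized_default.size, resized_lanczos.size)        # (512, 384) (512, 384)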