
Commit 4215ce4

fix qwen image 2511 cfg parallel && scheduler default param (#214)

add log
fix
remove debug log
fix image resize sample & sigmas for qwen edit 2511
set shift_terminal=0.02
adjust VAE computation order
remove debug code
remove unused code
1 parent f3d1dc4 commit 4215ce4

File tree

4 files changed: +11 −11 lines changed

diffsynth_engine/models/basic/attention.py

Lines changed: 2 additions & 2 deletions

@@ -343,7 +343,7 @@ def long_context_attention(
         f"head_dim={q.shape[-1]}, but aiter_flash_attn only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
     )
     if SDPA_AVAILABLE:
-        return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if FLASH_ATTN_2_AVAILABLE:
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     raise ValueError("No available long context attention implementation")
@@ -379,7 +379,7 @@ def long_context_attention(
     if attn_impl == "fa2":
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     if attn_impl == "sdpa":
-        return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if attn_impl == "sage":
         return LongContextAttention(attn_type=AttnType.SAGE_AUTO)(q, k, v, softmax_scale=scale)
     if attn_impl == "sparge":

diffsynth_engine/models/qwen_image/qwen_image_dit.py

Lines changed: 5 additions & 5 deletions

@@ -286,16 +286,15 @@ def _modulate(self, x, mod_params, index=None):
             shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:]
             scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:]
             gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:]
-            index_expanded = index.unsqueeze(-1)
             shift_0_exp = shift_0.unsqueeze(1)
             shift_1_exp = shift_1.unsqueeze(1)
             scale_0_exp = scale_0.unsqueeze(1)
             scale_1_exp = scale_1.unsqueeze(1)
             gate_0_exp = gate_0.unsqueeze(1)
             gate_1_exp = gate_1.unsqueeze(1)
-            shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp)
-            scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp)
-            gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp)
+            shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
+            scale_result = torch.where(index == 0, scale_0_exp, scale_1_exp)
+            gate_result = torch.where(index == 0, gate_0_exp, gate_1_exp)
         else:
             shift_result = shift.unsqueeze(1)
             scale_result = scale.unsqueeze(1)
@@ -514,6 +513,7 @@ def forward(
             device=timestep.device,
             dtype=torch.int,
         )
+        modulate_index = modulate_index.unsqueeze(-1)
         rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)

         image = self.img_in(image)
@@ -535,7 +535,7 @@ def forward(

         # warning: Eligen does not work with sequence parallel because long context attention does not support attention masks
         img_freqs, txt_freqs = rotary_emb
-        with sequence_parallel((image, text, img_freqs, txt_freqs), seq_dims=(1, 1, 0, 0)):
+        with sequence_parallel((image, text, img_freqs, txt_freqs, modulate_index), seq_dims=(1, 1, 0, 0, 1)):
             rotary_emb = (img_freqs, txt_freqs)
             for block in self.transformer_blocks:
                 text, image = block(
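
These three hunks are the cfg-parallel fix from the commit title. _modulate selects, per token, between two sets of modulation parameters packed along the batch dimension via torch.where; the selection index is now unsqueezed once in forward, before the sequence_parallel context, and is sharded along its own sequence dimension together with the image and text tokens (seq_dims=(1, 1, 0, 0, 1)), so each rank's torch.where mask lines up with its local token shard. A self-contained sketch of the broadcast involved (shapes and names are illustrative, not the repository's):

import torch

batch, seq, dim = 2, 8, 16
shift = torch.randn(2 * batch, dim)                  # two parameter sets packed along the batch
index = torch.zeros(batch, seq, 1, dtype=torch.int)  # (B, S, 1): already unsqueezed, shardable on dim 1
index[:, seq // 2:] = 1                              # pretend the second half of the tokens uses set 1

shift_0, shift_1 = shift[:batch], shift[batch:]
shift_0_exp = shift_0.unsqueeze(1)                   # (B, 1, D), broadcasts over the sequence
shift_1_exp = shift_1.unsqueeze(1)

# A (B, S, 1) condition against (B, 1, D) operands broadcasts to (B, S, D);
# splitting index on dim 1 keeps this consistent under sequence parallelism.
shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
print(shift_result.shape)                            # torch.Size([2, 8, 16])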

diffsynth_engine/models/qwen_image/qwen_image_vae.py

Lines changed: 0 additions & 1 deletion

@@ -685,7 +685,6 @@ def encode(self, x, scale):
         x = patchify(x, patch_size=2 if self.in_channels == 12 else 1)
         t = x.shape[2]
         iter_ = 1 + (t - 1) // 4
-
         for i in range(iter_):
             if i == 0:
                 out = self.encoder(x[:, :, :1, :, :], feat_cache=feat_cache)
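
This hunk only removes a blank line, but for readers tracing encode: the chunking arithmetic in view processes the first latent frame on its own and the remaining frames in groups of four. A quick check of that formula (illustration only, assuming the usual 1 + 4k frame counts for this kind of causal video VAE):

# iter_ = 1 + (t - 1) // 4: one pass for the first frame, then one per group of four frames.
for t, expected in [(1, 1), (5, 2), (9, 3), (13, 4)]:
    assert 1 + (t - 1) // 4 == expected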

diffsynth_engine/pipelines/qwen_image.py

Lines changed: 4 additions & 3 deletions

@@ -165,7 +165,7 @@ def __init__(
         self.edit_prompt_template_encode_start_idx = 64

         # sampler
-        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
+        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True, shift_terminal=0.02)
         self.sampler = FlowMatchEulerSampler()
         # models
         self.tokenizer = tokenizer
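
The scheduler is now constructed with shift_terminal=0.02, matching the "set shift_terminal=0.02" and "fix image resize sample & sigmas for qwen edit 2511" lines of the commit message. RecifitedFlowScheduler's implementation is not part of this diff, but the conventional meaning of a terminal shift in flow-matching schedulers is to stretch the sigma schedule so its last value lands on shift_terminal rather than running to its unshifted endpoint. A minimal sketch of that convention (an assumption about the parameter's semantics, not this scheduler's code):

import torch

def stretch_to_terminal(sigmas: torch.Tensor, shift_terminal: float) -> torch.Tensor:
    # Rescale so the final sigma lands exactly on shift_terminal.
    one_minus = 1.0 - sigmas
    scale = one_minus[-1] / (1.0 - shift_terminal)
    return 1.0 - one_minus / scale

sigmas = torch.linspace(1.0, 0.0, steps=11)
stretched = stretch_to_terminal(sigmas, 0.02)
print(stretched[0].item(), stretched[-1].item())  # ~1.0 and ~0.02: the last sigma no longer reaches 0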
@@ -690,8 +690,9 @@ def __call__(
             img_width, img_height = img.size
             condition_width, condition_height = self.calculate_dimensions(384 * 384, img_width / img_height)
             vae_width, vae_height = self.calculate_dimensions(1024 * 1024, img_width / img_height)
-            condition_images.append(img.resize((condition_width, condition_height), Image.LANCZOS))
-            vae_images.append(img.resize((vae_width, vae_height), Image.LANCZOS))
+            condition_images.append(img.resize((condition_width, condition_height)))
+            vae_images.append(img.resize((vae_width, vae_height)))
+
         if width is None and height is None:
             width, height = vae_images[-1].size
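
Dropping the explicit Image.LANCZOS argument lets both resizes fall back to Pillow's default filter for Image.resize (bicubic for ordinary RGB images in current Pillow releases), which appears to be the "fix image resize" item from the commit message. Illustration only, not pipeline code:

from PIL import Image

img = Image.new("RGB", (1024, 768))
resized_default = img.resize((512, 384))                 # new behaviour: Pillow's default filter
resized_lanczos = img.resize((512, 384), Image.LANCZOS)  # previous behaviour: forced LANCZOS
print(resized_default.size, resized_lanczos.size)        # (512, 384) (512, 384)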