@@ -170,8 +170,14 @@ def forward(
         joint_query = apply_rope1(joint_query, image_rotary_emb)
         joint_key = apply_rope1(joint_key, image_rotary_emb)

+        if encoder_hidden_states_mask is not None:
+            attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=hidden_states.dtype, device=hidden_states.device)
+            attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask
+        else:
+            attn_mask = None
+
         joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads,
-                                                         attention_mask, transformer_options=transformer_options,
+                                                         attn_mask, transformer_options=transformer_options,
                                                          skip_reshape=True)

         txt_attn_output = joint_hidden_states[:, :seq_txt, :]
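A minimal, standalone sketch (not part of the commit) of what the added mask construction does, assuming the encoder mask arriving here is already the additive form produced in the second hunk below; the sizes and tensors are stand-ins:

    import torch

    # Stand-in sizes; in the model these come from the text and image token sequences.
    batch_size, seq_txt, seq_img = 2, 5, 16
    dtype = torch.float16

    # Additive encoder mask: 0 for real prompt tokens, -finfo.max for padding
    # (here the last two text tokens are pretend padding).
    encoder_hidden_states_mask = torch.zeros(batch_size, seq_txt, dtype=dtype)
    encoder_hidden_states_mask[:, 3:] = -torch.finfo(dtype).max

    # Joint mask as built in the hunk above: zero bias for every image token,
    # the encoder mask for the text tokens at the front of the joint sequence.
    attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=dtype)
    attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask

    # Shape (batch, 1, seq_txt + seq_img) broadcasts over heads and query positions,
    # so padded text keys end up with ~zero attention weight after softmax.
    print(attn_mask.shape)  # torch.Size([2, 1, 21])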
@@ -430,6 +436,9 @@ def _forward(
         encoder_hidden_states = context
         encoder_hidden_states_mask = attention_mask

+        if encoder_hidden_states_mask is not None and not torch.is_floating_point(encoder_hidden_states_mask):
+            encoder_hidden_states_mask = (encoder_hidden_states_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max
+
         hidden_states, img_ids, orig_shape = self.process_img(x)
         num_embeds = hidden_states.shape[1]