
Commit ac5ac24: remove txt_seq_lens and use bool mask
1 parent 88cee8b

4 files changed, +59 -42 lines changed


examples/dreambooth/train_dreambooth_lora_qwen_image.py

Lines changed: 0 additions & 2 deletions
@@ -1513,14 +1513,12 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     height=model_input.shape[3],
                     width=model_input.shape[4],
                 )
-                print(f"{prompt_embeds_mask.sum(dim=1).tolist()=}")
                 model_pred = transformer(
                     hidden_states=packed_noisy_model_input,
                     encoder_hidden_states=prompt_embeds,
                     encoder_hidden_states_mask=prompt_embeds_mask,
                     timestep=timesteps / 1000,
                     img_shapes=img_shapes,
-                    txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(),
                     return_dict=False,
                 )[0]
                 model_pred = QwenImagePipeline._unpack_latents(
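For context, a minimal sketch (not from this commit) of the training-side inputs after the change: the per-sample text lengths no longer need to be passed, because the 1.0/0.0 mask alone tells the attention processor which tokens are padding. The shapes and embedding width below are hypothetical placeholders.

import torch

# Hypothetical batch: 2 prompts padded to 77 tokens, embedding width 3584 (illustrative only).
prompt_embeds = torch.randn(2, 77, 3584)
prompt_embeds_mask = torch.ones(2, 77)   # 1.0 = valid token, 0.0 = padding
prompt_embeds_mask[0, 50:] = 0.0         # first sample uses only 50 tokens

# Before this commit the lengths were duplicated as a Python list:
#     txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()
# After it, the mask is the single source of truth and is passed as
# encoder_hidden_states_mask=prompt_embeds_mask, as shown in the diff above.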

src/diffusers/models/controlnets/controlnet_qwenimage.py

Lines changed: 13 additions & 22 deletions
@@ -189,12 +189,11 @@ def forward(
         encoder_hidden_states_mask: torch.Tensor = None,
         timestep: torch.LongTensor = None,
         img_shapes: Optional[List[Tuple[int, int, int]]] = None,
-        txt_seq_lens: Optional[List[int]] = None,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
     ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
         """
-        The [`FluxTransformer2DModel`] forward method.
+        The [`QwenImageControlNetModel`] forward method.
 
         Args:
             hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
@@ -205,26 +204,24 @@ def forward(
                 The scale factor for ControlNet outputs.
             encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                 Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
-                from the embeddings of input conditions.
+            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`, *optional*):
+                Mask for the encoder hidden states. Expected to have 1.0 for valid tokens and 0.0 for padding tokens.
+                Used in the attention processor to prevent attending to padding tokens. The mask can have any pattern
+                (not just contiguous valid tokens followed by padding) since it's applied element-wise in attention.
             timestep ( `torch.LongTensor`):
                 Used to indicate denoising step.
-            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
-                A list of tensors that if specified are added to the residuals of transformer blocks.
-            txt_seq_lens (`List[int]`, *optional*):
-                Optional text sequence lengths. If omitted, or shorter than the encoder hidden states length, the model
-                derives the length from the encoder hidden states (or their mask).
+            img_shapes (`List[Tuple[int, int, int]]`, *optional*):
+                Image shapes for RoPE computation.
             joint_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
-                tuple.
+                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
 
         Returns:
-            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-            `tuple` where the first element is the sample tensor.
+            If `return_dict` is True, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a `tuple` where
+            the first element is the controlnet block samples.
         """
         if joint_attention_kwargs is not None:
             joint_attention_kwargs = joint_attention_kwargs.copy()
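To make the documented mask semantics concrete, here is a small standalone sketch (mine, not part of the diff) of a valid encoder_hidden_states_mask; only zero vs. non-zero matters, so non-contiguous patterns are fine:

import torch

# Hypothetical batch of two prompts padded to 8 positions.
encoder_hidden_states_mask = torch.tensor(
    [
        [1, 1, 1, 1, 1, 0, 0, 0],  # 5 valid tokens followed by padding
        [1, 0, 1, 0, 1, 0, 0, 0],  # a non-contiguous pattern is also allowed
    ],
    dtype=torch.float32,
)

# The attention processor only distinguishes zero from non-zero, so .bool()
# recovers "attend" (True) vs. "don't attend" (False) per text position.
print(encoder_hidden_states_mask.bool())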
@@ -247,13 +244,9 @@ def forward(
 
         temb = self.time_text_embed(timestep, hidden_states)
 
-        batch_size, text_seq_len = encoder_hidden_states.shape[:2]
-        if txt_seq_lens is not None:
-            if len(txt_seq_lens) != batch_size:
-                raise ValueError(f"`txt_seq_lens` must have length {batch_size}, but got {len(txt_seq_lens)} instead.")
-            text_seq_len = max(text_seq_len, max(txt_seq_lens))
-        elif encoder_hidden_states_mask is not None:
-            text_seq_len = max(text_seq_len, int(encoder_hidden_states_mask.sum(dim=1).max().item()))
+        # Use the encoder_hidden_states sequence length for RoPE computation
+        # The mask is used for attention masking in the attention processor
+        _, text_seq_len = encoder_hidden_states.shape[:2]
 
         image_rotary_emb = self.pos_embed(img_shapes, text_seq_len, device=hidden_states.device)
 
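A quick sketch with made-up numbers of what this hunk changes: the RoPE text length now comes directly from the padded embedding length, so the txt_seq_lens bookkeeping (and its batch-size validation) disappears, and token validity is handled purely by the attention mask.

import torch

# Hypothetical padded batch: 3 prompts padded to 10 tokens, with 4, 7 and 10 valid tokens.
encoder_hidden_states = torch.randn(3, 10, 64)
encoder_hidden_states_mask = torch.zeros(3, 10)
for i, n in enumerate([4, 7, 10]):
    encoder_hidden_states_mask[i, :n] = 1.0

# Removed path: callers supplied per-sample lengths and the model took
# max(padded_length, max(txt_seq_lens)) as the RoPE text length.
txt_seq_lens = [4, 7, 10]
old_text_seq_len = max(encoder_hidden_states.shape[1], max(txt_seq_lens))  # 10

# New path: the padded length is used directly.
_, new_text_seq_len = encoder_hidden_states.shape[:2]                      # 10
print(old_text_seq_len, new_text_seq_len)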

@@ -332,7 +325,6 @@ def forward(
         encoder_hidden_states_mask: torch.Tensor = None,
         timestep: torch.LongTensor = None,
         img_shapes: Optional[List[Tuple[int, int, int]]] = None,
-        txt_seq_lens: Optional[List[int]] = None,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
     ) -> Union[QwenImageControlNetOutput, Tuple]:
@@ -350,7 +342,6 @@ def forward(
             encoder_hidden_states_mask=encoder_hidden_states_mask,
             timestep=timestep,
             img_shapes=img_shapes,
-            txt_seq_lens=txt_seq_lens,
             joint_attention_kwargs=joint_attention_kwargs,
             return_dict=return_dict,
         )

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 18 additions & 14 deletions
@@ -330,6 +330,8 @@ def __call__(
         joint_value = torch.cat([txt_value, img_value], dim=1)
 
         # If an encoder_hidden_states_mask is provided, turn it into a broadcastable attention mask.
+        # The encoder_hidden_states_mask is expected to have 1.0 for valid tokens and 0.0 for padding.
+        # We convert it to a boolean mask where True means "attend" and False means "mask out" (don't attend).
         if encoder_hidden_states_mask is not None and attention_mask is None:
             batch_size, image_seq_len = hidden_states.shape[:2]
             text_seq_len = encoder_hidden_states.shape[1]
@@ -345,7 +347,9 @@ def __call__(
                     f"must match encoder_hidden_states sequence length ({text_seq_len})."
                 )
 
-            text_attention_mask = encoder_hidden_states_mask.to(dtype=torch.bool, device=hidden_states.device)
+            # Convert mask to boolean: 1/1.0 -> True (attend), 0/0.0 -> False (don't attend)
+            # This is the correct semantics for PyTorch's scaled_dot_product_attention with boolean masks.
+            text_attention_mask = encoder_hidden_states_mask.bool()
             image_attention_mask = torch.ones(
                 (batch_size, image_seq_len), dtype=torch.bool, device=hidden_states.device
             )
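For readers who want to see how such a boolean mask behaves downstream, here is a self-contained sketch (hypothetical shapes, not code from this file) of building a joint text+image mask and handing it to PyTorch's scaled_dot_product_attention, where True means "attend" and False means "mask out":

import torch
import torch.nn.functional as F

batch_size, num_heads, head_dim = 2, 4, 8
text_seq_len, image_seq_len = 6, 10
joint_seq_len = text_seq_len + image_seq_len

# 1.0 = valid text token, 0.0 = padding (any pattern is allowed).
encoder_hidden_states_mask = torch.ones(batch_size, text_seq_len)
encoder_hidden_states_mask[0, 4:] = 0.0

# Text mask becomes boolean; image tokens are always attended.
text_attention_mask = encoder_hidden_states_mask.bool()
image_attention_mask = torch.ones(batch_size, image_seq_len, dtype=torch.bool)
joint_mask = torch.cat([text_attention_mask, image_attention_mask], dim=1)  # (B, S)

# Broadcast to (B, 1, 1, S): every query may attend to any key whose mask entry is True.
attn_mask = joint_mask[:, None, None, :]

q = torch.randn(batch_size, num_heads, joint_seq_len, head_dim)
k = torch.randn(batch_size, num_heads, joint_seq_len, head_dim)
v = torch.randn(batch_size, num_heads, joint_seq_len, head_dim)

out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
print(out.shape)  # torch.Size([2, 4, 16, 8])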
@@ -592,7 +596,6 @@ def forward(
         encoder_hidden_states_mask: torch.Tensor = None,
         timestep: torch.LongTensor = None,
         img_shapes: Optional[List[Tuple[int, int, int]]] = None,
-        txt_seq_lens: Optional[List[int]] = None,
         guidance: torch.Tensor = None,  # TODO: this should probably be removed
         attention_kwargs: Optional[Dict[str, Any]] = None,
         controlnet_block_samples=None,
@@ -606,17 +609,22 @@ def forward(
                 Input `hidden_states`.
             encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                 Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`):
-                Mask of the input conditions.
+            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`, *optional*):
+                Mask for the encoder hidden states. Expected to have 1.0 for valid tokens and 0.0 for padding tokens.
+                Used in the attention processor to prevent attending to padding tokens. The mask can have any pattern
+                (not just contiguous valid tokens followed by padding) since it's applied element-wise in attention.
             timestep ( `torch.LongTensor`):
                 Used to indicate denoising step.
-            txt_seq_lens (`List[int]`, *optional*):
-                Optional text sequence lengths. If not provided, or if any provided values are shorter than the encoder
-                hidden states length, the model falls back to the encoder hidden states length.
+            img_shapes (`List[Tuple[int, int, int]]`, *optional*):
+                Image shapes for RoPE computation.
+            guidance (`torch.Tensor`, *optional*):
+                Guidance tensor for conditional generation.
             attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_block_samples (*optional*):
+                ControlNet block samples to add to the transformer blocks.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                 tuple.
@@ -646,13 +654,9 @@ def forward(
         encoder_hidden_states = self.txt_norm(encoder_hidden_states)
         encoder_hidden_states = self.txt_in(encoder_hidden_states)
 
-        batch_size, text_seq_len = encoder_hidden_states.shape[:2]
-        if txt_seq_lens is not None:
-            if len(txt_seq_lens) != batch_size:
-                raise ValueError(f"`txt_seq_lens` must have length {batch_size}, but got {len(txt_seq_lens)} instead.")
-            text_seq_len = max(text_seq_len, max(txt_seq_lens))
-        elif encoder_hidden_states_mask is not None:
-            text_seq_len = max(text_seq_len, int(encoder_hidden_states_mask.sum(dim=1).max().item()))
+        # Use the encoder_hidden_states sequence length for RoPE computation
+        # The mask is used for attention masking in the attention processor
+        _, text_seq_len = encoder_hidden_states.shape[:2]
 
         if guidance is not None:
             guidance = guidance.to(hidden_states.dtype) * 1000

tests/models/transformers/test_models_transformer_qwenimage.py

Lines changed: 28 additions & 4 deletions
@@ -90,16 +90,20 @@ def test_gradient_checkpointing_is_applied(self):
         expected_set = {"QwenImageTransformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
 
-    def test_accepts_short_txt_seq_lens(self):
+    def test_infers_text_seq_len_from_mask(self):
         init_dict, inputs = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**init_dict).to(torch_device)
 
-        # Provide a deliberately short txt_seq_lens to ensure the model falls back to the embedding length.
-        inputs["txt_seq_lens"] = [2] * inputs["encoder_hidden_states"].shape[0]
+        # Create a mask with only 2 valid tokens (rest are padding)
+        encoder_hidden_states_mask = inputs["encoder_hidden_states_mask"].clone()
+        encoder_hidden_states_mask[:, 2:] = 0  # Only first 2 tokens are valid
+
+        inputs["encoder_hidden_states_mask"] = encoder_hidden_states_mask
 
         with torch.no_grad():
             output = model(**inputs)
 
+        # RoPE uses the full padded length; the mask restricts attention to the 2 valid tokens
         self.assertEqual(output.sample.shape[1], inputs["hidden_states"].shape[1])
 
     def test_builds_attention_mask_from_encoder_mask(self):
@@ -111,13 +115,33 @@ def test_builds_attention_mask_from_encoder_mask(self):
         encoder_hidden_states_mask[:, -2:] = 0
 
         inputs["encoder_hidden_states_mask"] = encoder_hidden_states_mask
-        inputs.pop("txt_seq_lens", None)
 
         with torch.no_grad():
             output = model(**inputs)
 
         self.assertEqual(output.sample.shape[1], inputs["hidden_states"].shape[1])
 
+    def test_non_contiguous_attention_mask(self):
+        """Test that non-contiguous masks work correctly (e.g., [1, 0, 1, 0, 1, 0, 0])"""
+        init_dict, inputs = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+
+        # Create a non-contiguous mask pattern: valid, padding, valid, padding, etc.
+        encoder_hidden_states_mask = inputs["encoder_hidden_states_mask"].clone()
+        # Pattern: [True, False, True, False, True, False, False]
+        encoder_hidden_states_mask[:, 1] = 0
+        encoder_hidden_states_mask[:, 3] = 0
+        encoder_hidden_states_mask[:, 5:] = 0
+
+        inputs["encoder_hidden_states_mask"] = encoder_hidden_states_mask
+
+        with torch.no_grad():
+            output = model(**inputs)
+
+        # The model should handle non-contiguous masks correctly
+        # RoPE uses the full sequence length, attention masking handles the pattern
+        self.assertEqual(output.sample.shape[1], inputs["hidden_states"].shape[1])
+
 
 class QwenImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
     model_class = QwenImageTransformer2DModel
