@@ -150,7 +150,7 @@ def compute_text_seq_len_from_mask(
     """
     batch_size, text_seq_len = encoder_hidden_states.shape[:2]
     if encoder_hidden_states_mask is None:
-        return text_seq_len, None, None
+        return text_seq_len, [text_seq_len] * batch_size, None

     if encoder_hidden_states_mask.shape[:2] != (batch_size, text_seq_len):
         raise ValueError(
@@ -165,7 +165,7 @@ def compute_text_seq_len_from_mask(
     active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
     has_active = encoder_hidden_states_mask.any(dim=1)
     per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
-    return text_seq_len, per_sample_len, encoder_hidden_states_mask
+    return text_seq_len, per_sample_len.tolist(), encoder_hidden_states_mask


class QwenTimestepProjEmbeddings(nn.Module):
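For context, a minimal standalone sketch of the new return contract (toy tensors only; `position_ids` is built by the surrounding helper outside this hunk, so it is reconstructed by hand here as a plain `arange` broadcast over the batch): a padded sample now reports its last valid position plus one, and the second return value is a plain Python list rather than `None` or a tensor.

```python
# Toy illustration of the per-sample length logic in the hunk above.
# Assumes position_ids is arange(text_seq_len) broadcast over the batch.
import torch

encoder_hidden_states = torch.randn(2, 5, 8)  # [batch, text_seq_len, dim]
encoder_hidden_states_mask = torch.tensor(
    [[1, 1, 1, 0, 0],   # sample 0: 3 valid text tokens
     [1, 1, 1, 1, 1]],  # sample 1: no padding
    dtype=torch.bool,
)

batch_size, text_seq_len = encoder_hidden_states.shape[:2]
position_ids = torch.arange(text_seq_len).unsqueeze(0).expand(batch_size, -1)
active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
has_active = encoder_hidden_states_mask.any(dim=1)
per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))

print(text_seq_len, per_sample_len.tolist())  # 5 [3, 5]
```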
@@ -492,6 +492,7 @@ def __call__(
         encoder_hidden_states_mask: torch.FloatTensor = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         image_rotary_emb: Optional[torch.Tensor] = None,
+        encoder_hidden_states_len: Optional[List[int]] = None,
     ) -> torch.FloatTensor:
         if encoder_hidden_states is None:
             raise ValueError("QwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)")
@@ -537,16 +538,17 @@ def __call__(

         # Concatenate for joint attention
-        # Order: [text, image]
-        joint_query = torch.cat([txt_query, img_query], dim=1)
-        joint_key = torch.cat([txt_key, img_key], dim=1)
-        joint_value = torch.cat([txt_value, img_value], dim=1)
+        # Order: [image, text]
+        joint_query = torch.cat([img_query, txt_query], dim=1)
+        joint_key = torch.cat([img_key, txt_key], dim=1)
+        joint_value = torch.cat([img_value, txt_value], dim=1)

         # If an encoder_hidden_states_mask is provided, create a joint attention mask.
         # The encoder_hidden_states_mask is expected to have 1.0 for valid tokens and 0.0 for padding.
         # We convert it to a boolean mask where True means "attend" and False means "mask out" (don't attend).
         # Only create the mask if there's actual padding, otherwise keep attention_mask=None for better SDPA performance.
+        batch_size, image_seq_len = hidden_states.shape[:2]
+        attention_kwargs = {}
         if encoder_hidden_states_mask is not None and attention_mask is None:
-            batch_size, image_seq_len = hidden_states.shape[:2]
             text_seq_len = encoder_hidden_states.shape[1]

            if encoder_hidden_states_mask.shape[0] != batch_size:
@@ -568,7 +570,8 @@ def __call__(
             )
             # Create 2D joint mask [batch_size, text_seq_len + image_seq_len]
             # The attention dispatch will normalize this and extract sequence lengths
-            attention_mask = torch.cat([text_attention_mask, image_attention_mask], dim=1)
+            attention_mask = torch.cat([image_attention_mask, text_attention_mask], dim=1)
+            attention_kwargs['seq_len'] = [text_sample_len + image_seq_len for text_sample_len in encoder_hidden_states_len]

         # Compute joint attention
         joint_hidden_states = dispatch_attention_fn(
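A small sketch (standalone toy values, not the processor itself) of what the reordered joint mask and the new `seq_len` entry look like: image tokens come first and are always valid, so each sample's total valid length is its text length from `compute_text_seq_len_from_mask` plus `image_seq_len`.

```python
# Toy illustration of the joint mask layout and the seq_len list built above.
import torch

batch_size, image_seq_len = 2, 4
encoder_hidden_states_len = [3, 5]  # per-sample valid text lengths (from the helper above)
text_seq_len = max(encoder_hidden_states_len)

image_attention_mask = torch.ones(batch_size, image_seq_len, dtype=torch.bool)  # image tokens are never padded
text_attention_mask = torch.tensor(
    [[True, True, True, False, False],
     [True, True, True, True, True]]
)

# Order matches the reordered streams: [image, text]
attention_mask = torch.cat([image_attention_mask, text_attention_mask], dim=1)
seq_len = [text_sample_len + image_seq_len for text_sample_len in encoder_hidden_states_len]

print(attention_mask.shape)  # torch.Size([2, 9])
print(seq_len)               # [7, 9]
```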
@@ -580,15 +583,16 @@ def __call__(
             is_causal=False,
             backend=self._attention_backend,
             parallel_config=self._parallel_config,
+            attention_kwargs=attention_kwargs,
         )

         # Reshape back
         joint_hidden_states = joint_hidden_states.flatten(2, 3)
         joint_hidden_states = joint_hidden_states.to(joint_query.dtype)

         # Split attention outputs back
-        txt_attn_output = joint_hidden_states[:, :seq_txt, :]  # Text part
-        img_attn_output = joint_hidden_states[:, seq_txt:, :]  # Image part
+        img_attn_output = joint_hidden_states[:, :image_seq_len, :]  # Image part
+        txt_attn_output = joint_hidden_states[:, image_seq_len:, :]  # Text part

         # Apply output projections
         img_attn_output = attn.to_out[0](img_attn_output)
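And the matching split, shown on dummy tensors: with the joint sequence ordered `[image, text]`, slicing by `image_seq_len` (known from `hidden_states`) recovers the two streams without relying on the text length.

```python
# Toy check that slicing the joint output by image_seq_len recovers both streams.
import torch

batch_size, image_seq_len, text_seq_len, dim = 2, 4, 3, 8
joint_hidden_states = torch.randn(batch_size, image_seq_len + text_seq_len, dim)

img_attn_output = joint_hidden_states[:, :image_seq_len, :]  # first: image part
txt_attn_output = joint_hidden_states[:, image_seq_len:, :]  # rest: text part

assert img_attn_output.shape == (batch_size, image_seq_len, dim)
assert txt_attn_output.shape == (batch_size, text_seq_len, dim)
```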
@@ -694,6 +698,7 @@ def forward(
         encoder_hidden_states_mask: torch.Tensor,
         temb: torch.Tensor,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        encoder_hidden_states_len: Optional[List[int]] = None,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         modulate_index: Optional[List[int]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -728,6 +733,7 @@ def forward(
             encoder_hidden_states=txt_modulated,  # Text stream (will be processed as "context")
             encoder_hidden_states_mask=encoder_hidden_states_mask,
             image_rotary_emb=image_rotary_emb,
+            encoder_hidden_states_len=encoder_hidden_states_len,
             **joint_attention_kwargs,
         )

@@ -947,7 +953,9 @@ def forward(
         encoder_hidden_states = self.txt_in(encoder_hidden_states)

         # Use the encoder_hidden_states sequence length for RoPE computation and normalize mask
-        text_seq_len, _, encoder_hidden_states_mask = compute_text_seq_len_from_mask(
+        if encoder_hidden_states_mask is not None and torch.all(encoder_hidden_states_mask):
+            encoder_hidden_states_mask = None
+        text_seq_len, text_seq_len_per_sample, encoder_hidden_states_mask = compute_text_seq_len_from_mask(
             encoder_hidden_states, encoder_hidden_states_mask
         )

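A quick sketch of the all-valid shortcut added here (toy mask only): when every text token is valid the mask is dropped, so the attention processor keeps `attention_mask=None` and stays on the fast mask-free SDPA path, while the helper still returns a per-sample length list.

```python
# Toy illustration of the all-valid shortcut above.
import torch

encoder_hidden_states_mask = torch.ones(2, 5, dtype=torch.bool)

if encoder_hidden_states_mask is not None and torch.all(encoder_hidden_states_mask):
    encoder_hidden_states_mask = None

print(encoder_hidden_states_mask)  # None -> downstream joint attention_mask also stays None
```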
@@ -971,6 +979,7 @@ def forward(
                    encoder_hidden_states_mask,
                    temb,
                    image_rotary_emb,
+                    text_seq_len_per_sample,
                    attention_kwargs,
                    modulate_index,
                )
@@ -982,6 +991,7 @@ def forward(
                    encoder_hidden_states_mask=encoder_hidden_states_mask,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
+                    encoder_hidden_states_len=text_seq_len_per_sample,
                    joint_attention_kwargs=attention_kwargs,
                    modulate_index=modulate_index,
                )