
Commit 86bef58

Modify the attention processor via set_attn_processor and rename SanaAttnProcessor3_0 to SanaVanillaAttnProcessor
1 parent 9cb050b commit 86bef58
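
For users of the training script, the change boils down to the following migration (a minimal sketch; the checkpoint path is a placeholder, and SanaVanillaAttnProcessor is the class this commit adds to train_sana_sprint_diffusers.py):

from diffusers import SanaTransformer2DModel

# Before this commit the processor was selected through a custom config flag on the model:
# transformer = SanaTransformer2DModel.from_pretrained(
#     "path/to/sana-sprint", subfolder="transformer", guidance_embeds=True,
#     cross_attention_type="vanilla",
# )

# After this commit the stock model is loaded and the processor is attached from the script:
transformer = SanaTransformer2DModel.from_pretrained(
    "path/to/sana-sprint", subfolder="transformer", guidance_embeds=True
)
transformer.set_attn_processor(SanaVanillaAttnProcessor())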

File tree: 2 files changed, +84 -115 lines


examples/research_projects/sana/train_sana_sprint_diffusers.py

Lines changed: 83 additions & 19 deletions
@@ -86,6 +86,86 @@
     "User Prompt: ",
 ]
 
+class SanaVanillaAttnProcessor:
+    r"""
+    Processor for implementing scaled dot-product attention to support JVP calculation during training.
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
+    ) -> torch.Tensor:
+        B, H, L, S = *query.size()[:-1], key.size(-2)
+        scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+        attn_bias = torch.zeros(B, H, L, S, dtype=query.dtype, device=query.device)
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+            else:
+                attn_bias += attn_mask
+        attn_weight = query @ key.transpose(-2, -1) * scale_factor
+        attn_weight += attn_bias
+        attn_weight = torch.softmax(attn_weight, dim=-1)
+        attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+        return attn_weight @ value
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        hidden_states = self.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
 
 
 class Text2ImageDataset(Dataset):
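
The docstring above states the motivation: the sCM part of the SANA-Sprint objective needs Jacobian-vector products (forward-mode derivatives) through the transformer, and the fused F.scaled_dot_product_attention backends generally do not support forward-mode AD, so the processor spells the attention math out explicitly. A standalone sketch of taking a JVP through the same formulation (the helper and shapes below are illustrative, not part of the commit):

import math

import torch
from torch.func import jvp

def vanilla_sdpa(query, key, value, scale=None):
    # Same explicit softmax attention as SanaVanillaAttnProcessor.scaled_dot_product_attention,
    # re-implemented here (without masking/dropout) so the snippet runs on its own.
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_weight = torch.softmax(query @ key.transpose(-2, -1) * scale_factor, dim=-1)
    return attn_weight @ value

q, k, v = (torch.randn(2, 8, 16, 64) for _ in range(3))  # (batch, heads, seq_len, head_dim)
tangent = torch.randn_like(q)

# Forward-mode AD: propagate a tangent on the query through the attention output in one pass.
out, out_tangent = jvp(lambda q_: vanilla_sdpa(q_, k, v), (q,), (tangent,))
print(out.shape, out_tangent.shape)  # torch.Size([2, 8, 16, 64]) for both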
@@ -109,7 +189,6 @@ def __init__(self, hf_dataset, resolution=1024):
             T.Lambda(lambda img: img.convert("RGB")),
             T.Resize(resolution),  # Image.BICUBIC
             T.CenterCrop(resolution),
-            # T.RandomHorizontalFlip(),
             T.ToTensor(),
             T.Normalize([0.5], [0.5]),
         ])
@@ -132,7 +211,7 @@ def __getitem__(self, idx):
             'image': image_tensor
         }
 
-# TODO here
+
 def save_model_card(
     repo_id: str,
     images=None,
@@ -807,7 +886,6 @@ def forward(self, hidden_states, encoder_hidden_states, timestep, guidance=None,
         return (trigflow_model_out,)
 
 
-
 def compute_density_for_timestep_sampling_scm(
     batch_size: int, logit_mean: float = None, logit_std: float = None
 ):
@@ -820,7 +898,6 @@ def compute_density_for_timestep_sampling_scm(
     return u
 
 
-
 def main(args):
     if args.report_to == "wandb" and args.hub_token is not None:
         raise ValueError(
@@ -872,7 +949,6 @@ def main(args):
     if args.seed is not None:
         set_seed(args.seed)
 
-
     # Handle the repository creation
     if accelerator.is_main_process:
         if args.output_dir is not None:
@@ -904,8 +980,9 @@ def main(args):
 
     ori_transformer = SanaTransformer2DModel.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant,
-        guidance_embeds=True, cross_attention_type='vanilla'
+        guidance_embeds=True,
     )
+    ori_transformer.set_attn_processor(SanaVanillaAttnProcessor())
 
     ori_transformer_no_guide = SanaTransformer2DModel.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="transformer", revision=args.revision, variant=args.variant,
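
The hunk above loads the unmodified transformer and swaps the processor in from the training script. Note that set_attn_processor with a single instance assigns that processor to every attention module in the model, not only the cross-attention layers that the old cross_attention_type flag controlled; if only cross-attention should take the vanilla path, the dict form can target those layers by name (a sketch assuming the usual "...attn2.processor" key naming of attn_processors):

# Hypothetical targeted variant: replace only the cross-attention (attn2) processors.
processors = {
    name: SanaVanillaAttnProcessor() if ".attn2." in name else proc
    for name, proc in ori_transformer.attn_processors.items()
}
ori_transformer.set_attn_processor(processors)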
@@ -929,7 +1006,6 @@ def main(args):
 
     zero_state_dict = {}
 
-
     target_device = accelerator.device
     param_w1 = guidance_embedder_module.linear_1.weight
     zero_state_dict['linear_1.weight'] = torch.zeros(param_w1.shape, device=target_device)
@@ -941,7 +1017,6 @@ def main(args):
     zero_state_dict['linear_2.bias'] = torch.zeros(param_b2.shape, device=target_device)
     guidance_embedder_module.load_state_dict(zero_state_dict, strict=False, assign=True)
 
-
     transformer = SanaTrigFlow(ori_transformer, guidance=True).train()
     pretrained_model = SanaTrigFlow(ori_transformer_no_guide, guidance=False).eval()
 
@@ -951,7 +1026,6 @@ def main(args):
         head_block_ids=args.head_block_ids,
     ).train()
 
-
    transformer.requires_grad_(True)
    pretrained_model.requires_grad_(False)
    disc.model.requires_grad_(False)
@@ -1005,7 +1079,6 @@ def main(args):
     if args.gradient_checkpointing:
         transformer.enable_gradient_checkpointing()
 
-
     def unwrap_model(model):
         model = accelerator.unwrap_model(model)
         model = model._orig_mod if is_compiled_module(model) else model
@@ -1063,7 +1136,6 @@ def load_model_hook(models, input_dir):
     accelerator.register_save_state_pre_hook(save_model_hook)
     accelerator.register_load_state_pre_hook(load_model_hook)
 
-
     # Enable TF32 for faster training on Ampere GPUs,
     # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
     if args.allow_tf32 and torch.cuda.is_available():
@@ -1087,7 +1159,6 @@ def load_model_hook(models, input_dir):
     else:
         optimizer_class = torch.optim.AdamW
 
-
     # Optimization parameters
     optimizer_G = optimizer_class(
         transformer.parameters(),
@@ -1391,12 +1462,10 @@ def model_wrapper(scaled_x_t, t):
                 z_D = torch.randn_like(model_input) * sigma_data
                 noised_predicted_x0 = torch.cos(t_D) * pred_x_0 + torch.sin(t_D) * z_D
 
-
                 # Calculate adversarial loss
                 pred_fake = disc(hidden_states=(noised_predicted_x0 / sigma_data), timestep=t_D.flatten(), encoder_hidden_states=prompt_embeds, encoder_attention_mask=prompt_attention_mask)
                 adv_loss = -torch.mean(pred_fake)
 
-
                 # Total loss = sCM loss + LADD loss
 
                 total_loss = args.scm_lambda * loss + adv_loss * args.adv_lambda
@@ -1405,8 +1474,6 @@ def model_wrapper(scaled_x_t, t):
 
                 accelerator.backward(total_loss)
 
-
-
                 if accelerator.sync_gradients:
                     grad_norm = accelerator.clip_grad_norm_(transformer.parameters(), args.gradient_clip)
                     if torch.logical_or(grad_norm.isnan(), grad_norm.isinf()):
@@ -1504,7 +1571,6 @@ def model_wrapper(scaled_x_t, t):
 
                 accelerator.backward(loss_D)
 
-
                 if accelerator.sync_gradients:
                     grad_norm = accelerator.clip_grad_norm_(disc.parameters(), args.gradient_clip)
                     if torch.logical_or(grad_norm.isnan(), grad_norm.isinf()):
@@ -1519,7 +1585,6 @@ def model_wrapper(scaled_x_t, t):
                 optimizer_D.step()
                 optimizer_D.zero_grad(set_to_none=True)
 
-
             # Checks if the accelerator has performed an optimization step behind the scenes
             if accelerator.sync_gradients:
                 progress_bar.update(1)
@@ -1584,7 +1649,6 @@ def model_wrapper(scaled_x_t, t):
                     images = None
                     del pipeline
 
-
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
         transformer = unwrap_model(transformer)

src/diffusers/models/transformers/sana_transformer.py

Lines changed: 1 addition & 96 deletions
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
@@ -186,91 +185,6 @@ def __call__(
         return hidden_states
 
 
-class SanaAttnProcessor3_0:
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
-    """
-
-    def __init__(self):
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
-
-    @staticmethod
-    def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
-    ) -> torch.Tensor:
-        B, H, L, S = *query.size()[:-1], key.size(-2)
-        scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
-        attn_bias = torch.zeros(B, H, L, S, dtype=query.dtype, device=query.device)
-
-        if attn_mask is not None:
-            if attn_mask.dtype == torch.bool:
-                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
-            else:
-                attn_bias += attn_mask
-        attn_weight = query @ key.transpose(-2, -1) * scale_factor
-        attn_weight += attn_bias
-        attn_weight = torch.softmax(attn_weight, dim=-1)
-        attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
-        return attn_weight @ value
-
-    # return x
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        hidden_states = self.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
 class SanaTransformerBlock(nn.Module):
     r"""
     Transformer block introduced in [Sana](https://huggingface.co/papers/2410.10629).
@@ -291,7 +205,6 @@ def __init__(
         attention_out_bias: bool = True,
         mlp_ratio: float = 2.5,
         qk_norm: Optional[str] = None,
-        cross_attention_type: str = "flash",
     ) -> None:
         super().__init__()
 
@@ -310,12 +223,6 @@ def __init__(
         )
 
         # 2. Cross Attention
-        if cross_attention_type == "flash":
-            cross_attention_processor = SanaAttnProcessor2_0()
-        elif cross_attention_type == "vanilla":
-            cross_attention_processor = SanaAttnProcessor3_0()
-        else:
-            raise ValueError(f"Cross attention type {cross_attention_type} is not defined.")
         if cross_attention_dim is not None:
             self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
             self.attn2 = Attention(
@@ -328,7 +235,7 @@ def __init__(
                 dropout=dropout,
                 bias=True,
                 out_bias=attention_out_bias,
-                processor=cross_attention_processor,
+                processor=SanaAttnProcessor2_0(),
             )
 
         # 3. Feed-forward
@@ -453,7 +360,6 @@ def __init__(
         guidance_embeds_scale: float = 0.1,
         qk_norm: Optional[str] = None,
         timestep_scale: float = 1.0,
-        cross_attention_type: str = "flash",
     ) -> None:
         super().__init__()
 
@@ -496,7 +402,6 @@ def __init__(
                     norm_eps=norm_eps,
                     mlp_ratio=mlp_ratio,
                     qk_norm=qk_norm,
-                    cross_attention_type=cross_attention_type,
                 )
                 for _ in range(num_layers)
             ]