NPU Adaption for Sana #10409
Changes from 1 commit
Training script changes:

```diff
@@ -63,6 +63,7 @@
     is_wandb_available,
 )
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_torch_npu_available
 from diffusers.utils.torch_utils import is_compiled_module


@@ -74,6 +75,9 @@

 logger = get_logger(__name__)

+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False
+

 def save_model_card(
     repo_id: str,
@@ -920,8 +924,7 @@ def main(args):
             image.save(image_filename)

         del pipeline
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+        free_memory()

     # Handle the repository creation
     if accelerator.is_main_process:
@@ -979,10 +982,10 @@
     )

     # VAE should always be kept in fp32 for SANA (?)
-    vae.to(dtype=torch.float32)
+    vae.to(accelerator.device, dtype=torch.float32)
     transformer.to(accelerator.device, dtype=weight_dtype)
     # because Gemma2 is particularly suited for bfloat16.
-    text_encoder.to(dtype=torch.bfloat16)
+    text_encoder.to(accelerator.device, dtype=torch.bfloat16)

     # Initialize a text encoding pipeline and keep it to CPU for now.
     text_encoding_pipeline = SanaPipeline.from_pretrained(
```
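Two things to note in this hunk: the new module-level guard sets `torch.npu.config.allow_internal_format = False`, which disables torch_npu's internal (private) memory formats so tensors stay in standard ND layout, and the CUDA-only cache clearing is replaced by `free_memory()`. Below is a minimal sketch of what such a backend-agnostic helper looks like; the real function ships in `diffusers.training_utils` and may cover more backends or differ in detail:

```python
# Minimal sketch of a backend-agnostic cache-clearing helper, in the spirit
# of diffusers' free_memory(); not the library's actual implementation.
import gc

import torch


def free_memory() -> None:
    """Collect Python garbage, then release cached allocator memory on
    whichever accelerator backend is present."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "npu") and torch.npu.is_available():
        # Ascend NPU backend, available once torch_npu is imported.
        torch.npu.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()
```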
Model changes (Sana transformer block):

```diff
@@ -19,11 +19,12 @@

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin
-from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers, is_torch_npu_available
 from ..attention_processor import (
     Attention,
     AttentionProcessor,
     AttnProcessor2_0,
+    AttnProcessorNPU,
     SanaLinearAttnProcessor2_0,
 )
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
@@ -119,6 +120,12 @@ def __init__(
         # 2. Cross Attention
         if cross_attention_dim is not None:
             self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+            if is_torch_npu_available():
+                attn_processor = AttnProcessorNPU()
+            else:
+                attn_processor = AttnProcessor2_0()
+
             self.attn2 = Attention(
                 query_dim=dim,
                 cross_attention_dim=cross_attention_dim,
@@ -127,7 +134,7 @@ def __init__(
                 dropout=dropout,
                 bias=True,
                 out_bias=attention_out_bias,
-                processor=AttnProcessor2_0(),
+                processor=attn_processor,
             )

     # 3. Feed-forward
```
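This selection logic is the whole NPU switch on the model side. A standalone sketch of the same pattern, assuming only that `AttnProcessorNPU` must not be constructed unless `torch_npu` is importable:

```python
# Standalone sketch of the processor choice made in the SanaTransformerBlock
# constructor above. AttnProcessorNPU requires torch_npu, so the availability
# check must guard its construction.
from diffusers.models.attention_processor import AttnProcessor2_0, AttnProcessorNPU
from diffusers.utils import is_torch_npu_available

if is_torch_npu_available():
    # Routes attention through torch_npu's fused attention kernel.
    attn_processor = AttnProcessorNPU()
else:
    # Default PyTorch 2.x scaled_dot_product_attention processor.
    attn_processor = AttnProcessor2_0()
```

Note that the switch only affects the cross-attention (`attn2`); the block's linear self-attention keeps `SanaLinearAttnProcessor2_0` on both backends.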
Review comment (on the `vae.to(accelerator.device, dtype=torch.float32)` change): This is not needed, as we conditionally put the VAE on and off the accelerator device.
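For context, a hedged sketch of the on/off pattern the reviewer refers to; `vae`, `accelerator`, `pixel_values`, and the `.latent` access are stand-ins based on the training script's surroundings, not verbatim code:

```python
# Sketch of conditionally putting the VAE on and off the accelerator device:
# move it on only for encoding, then back to CPU and free the cache.
# Names are placeholders for the training script's actual variables.
vae.to(accelerator.device, dtype=torch.float32)
with torch.no_grad():
    model_input = vae.encode(
        pixel_values.to(device=accelerator.device, dtype=vae.dtype)
    ).latent
vae.to("cpu")
free_memory()
```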