Commit 0b95ec0

Restore BasicTransformerBlock to original huggingface implementation
1 parent e7a3cfd commit 0b95ec0

File tree

5 files changed: +202, -120 lines

examples/research_projects/pytorch_xla/training/text_to_image/train_text_to_image_sdxl.py

Lines changed: 14 additions & 24 deletions
```diff
@@ -549,10 +549,6 @@ def encode_prompt(batch, text_encoders, tokenizers, proportion_empty_prompts, ca


     prompt_embeds = torch.concat(prompt_embeds_list, dim=-1).to(dtype=dtype)
-    print(prompt_embeds.shape)
-    p3d = (0,0, 0, 128-77)
-    prompt_embeds = F.pad(prompt_embeds, p3d, "constant", 0)
-    print(prompt_embeds.shape)
     pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1).to(dtype=dtype)
     return {"prompt_embeds": prompt_embeds, "pooled_prompt_embeds": pooled_prompt_embeds}

```
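Note on the hunk above: the deleted lines were a debugging hack that zero-padded `prompt_embeds` along the token dimension from 77 to 128 and printed the shapes. A minimal standalone sketch of what that padding did (the batch size and embedding width below are illustrative, not taken from the script):

```python
import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, tokens, embed_dim); SDXL text embeddings have 77 tokens.
prompt_embeds = torch.randn(2, 77, 2048)

# F.pad reads the pad tuple from the last dimension backwards:
# (last_left, last_right, second_to_last_left, second_to_last_right).
# So (0, 0, 0, 128 - 77) leaves the embedding dim alone and right-pads the token dim to 128.
p3d = (0, 0, 0, 128 - 77)
padded = F.pad(prompt_embeds, p3d, "constant", 0)
print(prompt_embeds.shape, padded.shape)  # torch.Size([2, 77, 2048]) torch.Size([2, 128, 2048])
```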

```diff
@@ -667,11 +663,7 @@ def main(args):
         revision=args.revision,
         use_fast=False
     )
-
-    # from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
-
-    # unet = apply_xla_patch_to_nn_linear(unet, xs.xla_patched_nn_linear_forward)
-    unet.enable_xla_flash_attention(partition_spec=("data", None, None, None))
+    unet.enable_xla_attention(partition_spec=("data", None, None, None))

     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
@@ -758,11 +750,14 @@ def preprocess_train(examples):
     from datasets.fingerprint import Hasher
     # import pdb; pdb.set_trace()
     old_batch_size = args.train_batch_size
-    args.train_batch_size=21
+    old_arg = args.output_dir
+    args.output_dir = '/tmp/trained-model/'
+    args.train_batch_size=22
     new_fingerprint = Hasher.hash(args)
     args.train_batch_size=64
     new_fingerprint_for_vae = Hasher.hash((args.pretrained_model_name_or_path, args))
     args.train_batch_size=old_batch_size
+    args.output_dir = old_arg
     train_dataset_with_embeddings = train_dataset.map(
         compute_embeddings_fn, batched=True, batch_size=50, new_fingerprint=new_fingerprint
     )
```
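For context on the hunk above: the script temporarily overrides fields of `args` (batch size, and now also `output_dir`) before hashing it, so that run-to-run differences in those fields do not change the cache fingerprint passed to `datasets.Dataset.map()`. A minimal sketch of that caching pattern, with hypothetical argument values, assuming the `datasets` library is installed:

```python
import argparse
from datasets import Dataset
from datasets.fingerprint import Hasher

# Hypothetical stand-ins for the script's parsed arguments and dataset.
args = argparse.Namespace(train_batch_size=64, output_dir="/path/that/varies/per/run")
ds = Dataset.from_dict({"text": ["a photo of a cat", "a photo of a dog"]})

# Normalize fields that differ between runs before hashing, so the fingerprint
# (and therefore the cached .map() result) is reused across runs.
old_output_dir, args.output_dir = args.output_dir, "/tmp/trained-model/"
old_batch_size, args.train_batch_size = args.train_batch_size, 22
new_fingerprint = Hasher.hash(args)
args.output_dir, args.train_batch_size = old_output_dir, old_batch_size

# Any map() call keyed on this fingerprint hits the same cache entry next run.
ds_with_lengths = ds.map(lambda ex: {"n_chars": len(ex["text"])}, new_fingerprint=new_fingerprint)
print(new_fingerprint, ds_with_lengths[0])
```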
```diff
@@ -829,7 +824,6 @@ def collate_fn(examples):
     print(f" Total optimization steps = {args.max_train_steps}")

     # unet = add_checkpoints(unet)
-    # import pdb; pdb.set_trace()
     trainer = TrainSD(
         weight_dtype=weight_dtype,
         device=device,
@@ -840,19 +834,15 @@ def collate_fn(examples):
         args=args,
     )
     trainer.start_training()
-    # unet = trainer.unet.to("cpu")
-    # vae = trainer.vae.to("cpu")
-    # text_encoder = trainer.text_encoder.to("cpu")
-
-    # pipeline = StableDiffusionXLPipeline.from_pretrained(
-    #     args.pretrained_model_name_or_path,
-    #     text_encoder=text_encoder,
-    #     vae=vae,
-    #     unet=unet,
-    #     revision=args.revision,
-    #     variant=args.variant,
-    # )
-    # pipeline.save_pretrained(args.output_dir)
+    unet = trainer.unet.to("cpu")
+
+    pipeline = StableDiffusionXLPipeline.from_pretrained(
+        args.pretrained_model_name_or_path,
+        unet=unet,
+        revision=args.revision,
+        variant=args.variant,
+    )
+    pipeline.save_pretrained(args.output_dir)

     # if xm.is_master_ordinal() and args.push_to_hub:
     #     save_model_card(args, repo_id, repo_folder=args.output_dir)
```
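With the pipeline export re-enabled above, the trained UNet is saved inside a full SDXL pipeline under `args.output_dir`. A minimal sketch of loading that output back for inference (the path, prompt, and dtype/device choices are illustrative):

```python
import torch
from diffusers import StableDiffusionXLPipeline

# Hypothetical path: wherever --output_dir pointed during training.
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "/tmp/trained-model/", torch_dtype=torch.float16
)
pipeline.to("cuda")

image = pipeline(
    prompt="a photo of a corgi wearing sunglasses",  # illustrative prompt
    num_inference_steps=30,
).images[0]
image.save("sample.png")
```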

src/diffusers/models/attention.py

Lines changed: 73 additions & 76 deletions
```diff
@@ -340,7 +340,7 @@ def __init__(
         self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
         self.use_layer_norm = norm_type == "layer_norm"
         self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
-        assert norm_type in ["layer_norm", "layer_norm_i2vgen"]
+
         if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
             raise ValueError(
                 f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
@@ -359,7 +359,6 @@ def __init__(
             self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
         else:
             self.pos_embed = None
-        assert self.pos_embed == None

         # Define 3 blocks. Each block has its own normalization layer.
         # 1. Self-Attn
@@ -468,7 +467,6 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
         self._chunk_size = chunk_size
         self._chunk_dim = dim

-    # @xp.trace_me("BasicTransformerBlock")
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -480,42 +478,39 @@ def forward(
         class_labels: Optional[torch.LongTensor] = None,
         added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
     ) -> torch.Tensor:
-        # import pdb; pdb.set_trace()
-        # if cross_attention_kwargs is not None:
-        #     if cross_attention_kwargs.get("scale", None) is not None:
-        #         logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

         # Notice that normalization is always applied before the real computation in the following blocks.
         # 0. Self-Attention
-        # batch_size = hidden_states.shape[0]
-
-        # if self.norm_type == "ada_norm":
-        #     norm_hidden_states = self.norm1(hidden_states, timestep)
-        # elif self.norm_type == "ada_norm_zero":
-        #     norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
-        #         hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
-        #     )
-        # elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
-        norm_hidden_states = self.norm1(hidden_states)
-        # elif self.norm_type == "ada_norm_continuous":
-        #     norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
-        # elif self.norm_type == "ada_norm_single":
-        #     shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
-        #         self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
-        #     ).chunk(6, dim=1)
-        #     norm_hidden_states = self.norm1(hidden_states)
-        #     norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
-        # else:
-        #     raise ValueError("Incorrect norm used")
-
-        # if self.pos_embed is not None:
-        #     norm_hidden_states = self.pos_embed(norm_hidden_states)
+        batch_size = hidden_states.shape[0]
+
+        if self.norm_type == "ada_norm":
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+        elif self.norm_type == "ada_norm_zero":
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
+            norm_hidden_states = self.norm1(hidden_states)
+        elif self.norm_type == "ada_norm_continuous":
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.norm_type == "ada_norm_single":
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)

         # 1. Prepare GLIGEN inputs
-        assert cross_attention_kwargs is None
-        cross_attention_kwargs = {}
-        # cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
-        # gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

         attn_output = self.attn1(
             norm_hidden_states,
@@ -524,33 +519,36 @@ def forward(
             **cross_attention_kwargs,
         )

-        # if self.norm_type == "ada_norm_zero":
-        #     attn_output = gate_msa.unsqueeze(1) * attn_output
-        # elif self.norm_type == "ada_norm_single":
-        #     attn_output = gate_msa * attn_output
+        if self.norm_type == "ada_norm_zero":
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.norm_type == "ada_norm_single":
+            attn_output = gate_msa * attn_output

         hidden_states = attn_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
         # 1.2 GLIGEN Control
-        # if gligen_kwargs is not None:
-        #     hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

         # 3. Cross-Attention
         if self.attn2 is not None:
-            # if self.norm_type == "ada_norm":
-            #     norm_hidden_states = self.norm2(hidden_states, timestep)
-            # elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
-            norm_hidden_states = self.norm2(hidden_states)
-            # elif self.norm_type == "ada_norm_single":
-            #     # For PixArt norm2 isn't applied here:
-            #     # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
-            #     norm_hidden_states = hidden_states
-            # elif self.norm_type == "ada_norm_continuous":
-            #     norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
-            # else:
-            #     raise ValueError("Incorrect norm")
-
-            # if self.pos_embed is not None and self.norm_type != "ada_norm_single":
-            #     norm_hidden_states = self.pos_embed(norm_hidden_states)
+            if self.norm_type == "ada_norm":
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
+                norm_hidden_states = self.norm2(hidden_states)
+            elif self.norm_type == "ada_norm_single":
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+            elif self.norm_type == "ada_norm_continuous":
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
+                norm_hidden_states = self.pos_embed(norm_hidden_states)

             attn_output = self.attn2(
                 norm_hidden_states,
@@ -562,33 +560,32 @@ def forward(

         # 4. Feed-forward
         # i2vgen doesn't have this norm 🤷‍♂️
-        # if self.norm_type == "ada_norm_continuous":
-        #     norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
-        # elif not self.norm_type == "ada_norm_single":
-        norm_hidden_states = self.norm3(hidden_states)
+        if self.norm_type == "ada_norm_continuous":
+            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif not self.norm_type == "ada_norm_single":
+            norm_hidden_states = self.norm3(hidden_states)

-        # if self.norm_type == "ada_norm_zero":
-        #     norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        if self.norm_type == "ada_norm_zero":
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

-        # if self.norm_type == "ada_norm_single":
-        #     norm_hidden_states = self.norm2(hidden_states)
-        #     norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+        if self.norm_type == "ada_norm_single":
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

-        assert self._chunk_size == None
-        # if self._chunk_size is not None:
-        #     # "feed_forward_chunk_size" can be used to save memory
-        #     ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
-        # else:
-        ff_output = self.ff(norm_hidden_states)
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)

-        # if self.norm_type == "ada_norm_zero":
-        #     ff_output = gate_mlp.unsqueeze(1) * ff_output
-        # elif self.norm_type == "ada_norm_single":
-        #     ff_output = gate_mlp * ff_output
+        if self.norm_type == "ada_norm_zero":
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.norm_type == "ada_norm_single":
+            ff_output = gate_mlp * ff_output

         hidden_states = ff_output + hidden_states
-        # if hidden_states.ndim == 4:
-        #     hidden_states = hidden_states.squeeze(1)
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)

         return hidden_states

```
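With the hard-coded shortcuts reverted, `BasicTransformerBlock.forward` again dispatches on `norm_type` instead of assuming the `layer_norm` path. A minimal standalone sketch exercising the restored default path (the dimensions and `cross_attention_dim` below are illustrative):

```python
import torch
from diffusers.models.attention import BasicTransformerBlock

# Illustrative configuration: default norm_type="layer_norm", with a cross-attention layer.
block = BasicTransformerBlock(
    dim=320,
    num_attention_heads=8,
    attention_head_dim=40,
    cross_attention_dim=768,
)

hidden_states = torch.randn(2, 64, 320)          # (batch, image tokens, dim)
encoder_hidden_states = torch.randn(2, 77, 768)  # (batch, text tokens, cross_attention_dim)

with torch.no_grad():
    out = block(hidden_states, encoder_hidden_states=encoder_hidden_states)

print(out.shape)  # torch.Size([2, 64, 320])
```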
