@@ -3510,7 +3510,7 @@ def __call__(
         # if attn.group_norm is not None:
         #     hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
 
-
+        # batch_size = hidden_states.shape[0]
 
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
@@ -3528,10 +3528,10 @@ def __call__(
 
         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
+        """
         assert attn.norm_q is None
         assert attn.norm_k is None
-        """
+
         # if attn.norm_q is not None:
         #     query = attn.norm_q(query)
         # if attn.norm_k is not None:
@@ -3557,26 +3557,27 @@ def __call__(
         #     # logger.warning(
         #     #     "Unable to use the flash attention pallas kernel API call due to QKV sequence length < 4096."
         #     # )
-        #     hidden_states = self.scaled_dot_product_attention_compiled(
-        #         query, key, value
-        #     )
+        #     hidden_states = self.scaled_dot_product_attention(
+        #         query, key, value
+        #     )
 
         #*hidden_states = JaxFun.apply(query, key, value)
+        import pdb; pdb.set_trace()
         hidden_states = JaxFun.apply(hidden_states, encoder_hidden_states, attn.to_q.weight, attn.to_k.weight, attn.to_v.weight, attn.heads)
         hidden_states = hidden_states.to(input_dtype)
 
         # linear proj
         hidden_states = attn.to_out[0](hidden_states)
         # dropout
-        hidden_states = attn.to_out[1](hidden_states)
+        # hidden_states = attn.to_out[1](hidden_states)
 
         # if input_ndim == 4:
         #     hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
 
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
+        # if attn.residual_connection:
+        #     hidden_states = hidden_states + residual
 
-        hidden_states = hidden_states / attn.rescale_output_factor
+        # hidden_states = hidden_states / attn.rescale_output_factor
 
         return hidden_states
 
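The commented-out branch above hints at a length-gated dispatch: the Pallas TPU flash-attention kernel is only usable once the QKV sequence length reaches 4096, with plain scaled dot-product attention as the fallback. A minimal sketch of that dispatch, assuming hypothetical names `attention_with_fallback`, `pallas_kernel`, and `MIN_PALLAS_SEQ_LEN` (none of these appear in the commit):

```python
import torch
import torch.nn.functional as F

# Threshold quoted in the commented-out warning above (assumption: it gates
# the Pallas kernel, which needs sequence length >= 4096).
MIN_PALLAS_SEQ_LEN = 4096

def attention_with_fallback(query, key, value, pallas_kernel=None):
    # query/key/value: (batch, heads, seq_len, head_dim)
    if pallas_kernel is not None and query.shape[-2] >= MIN_PALLAS_SEQ_LEN:
        # Long sequences: hand off to the TPU flash-attention kernel.
        return pallas_kernel(query, key, value)
    # Short sequences: eager scaled dot-product attention.
    return F.scaled_dot_product_attention(query, key, value)
```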
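The commit's core change routes attention through `JaxFun.apply(hidden_states, encoder_hidden_states, attn.to_q.weight, attn.to_k.weight, attn.to_v.weight, attn.heads)` (with a stray `pdb.set_trace()` breakpoint left in), but `JaxFun` itself is not defined in this hunk. Below is a minimal, forward-only sketch of what such a torch-to-JAX bridge might look like, assuming it applies the q/k/v projections and runs multi-head cross-attention in JAX, moving tensors across via DLPack. Everything except the call signature is an assumption.

```python
import jax
import jax.numpy as jnp
import torch
from torch.utils import dlpack as torch_dlpack

def _cross_attention(hidden, encoder, wq, wk, wv, heads):
    # torch Linear weights are stored as (out_features, in_features).
    q = hidden @ wq.T
    k = encoder @ wk.T
    v = encoder @ wv.T
    b, n, d = q.shape
    hd = d // heads
    # Split into (batch, heads, seq, head_dim).
    q = q.reshape(b, n, heads, hd).transpose(0, 2, 1, 3)
    k = k.reshape(b, k.shape[1], heads, hd).transpose(0, 2, 1, 3)
    v = v.reshape(b, v.shape[1], heads, hd).transpose(0, 2, 1, 3)
    scores = jnp.einsum("bhqd,bhkd->bhqk", q, k) / jnp.sqrt(hd)
    weights = jax.nn.softmax(scores, axis=-1)
    out = jnp.einsum("bhqk,bhkd->bhqd", weights, v)
    # Merge heads back to (batch, seq, dim).
    return out.transpose(0, 2, 1, 3).reshape(b, n, d)

class JaxFun(torch.autograd.Function):
    """Hypothetical forward-only bridge: torch tensors -> JAX attention -> torch."""

    @staticmethod
    def forward(ctx, hidden, encoder, wq, wk, wv, heads):
        to_jax = lambda t: jnp.from_dlpack(torch_dlpack.to_dlpack(t.detach().contiguous()))
        out = _cross_attention(
            to_jax(hidden), to_jax(encoder), to_jax(wq), to_jax(wk), to_jax(wv), heads
        )
        # DLPack round-trip avoids a host copy when both sides share a device.
        return torch.from_dlpack(jax.dlpack.to_dlpack(out))

    @staticmethod
    def backward(ctx, grad_out):
        # Inference-only sketch; no gradient flows back to torch.
        raise NotImplementedError
```

This would also explain the `hidden_states.to(input_dtype)` cast right after the call: the round-trip through JAX may not preserve the model's compute dtype.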