Commit c6ff80c
Add inference code
1 parent 0b95ec0 commit c6ff80c

5 files changed: +65 -11 lines changed
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+import torch
+import os
+import sys
+import numpy as np
+
+import torch_xla.core.xla_model as xm
+from time import time
+from diffusers import StableDiffusionXLPipeline
+import torch_xla.runtime as xr
+
+CACHE_DIR = os.environ.get("CACHE_DIR", '/mnt/bbahl/xla_cache/')
+if CACHE_DIR:
+    xr.initialize_cache(CACHE_DIR, readonly=False)
+
+
+device = xm.xla_device()
+model_path = "/mnt/bbahl/trained-model"
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16
+)
+pipe.to(device)
+prompt = ["A naruto with green eyes and red legs."]
+
+pipe.unet.enable_xla_attention()
+# pipe.vae.enable_xla_attention()
+start = time()
+print("compiling...")
+import pdb; pdb.set_trace()
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+print(f"compile time: {time() - start}")
+print("generate...")
+start = time()
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+print(f"generation time (after compile) : {time() - start}")
+image.save("naruto.png")
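The xr.initialize_cache call above enables torch_xla's persistent compilation cache, so the expensive first-run compile can be skipped on later launches of the script. A minimal standalone sketch of the same setup (the /tmp fallback path is an assumption, not from the commit):

import os
import torch_xla.runtime as xr

# Persist compiled XLA programs to disk; readonly=False lets this process
# write new cache entries. Later runs with identical graphs skip recompilation.
cache_dir = os.environ.get("CACHE_DIR", "/tmp/xla_cache")
os.makedirs(cache_dir, exist_ok=True)
xr.initialize_cache(cache_dir, readonly=False)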

src/diffusers/models/attention.py
Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-import torch_xla.debug.profiler as xp
 
 from ..utils import deprecate, logging
 from ..utils.torch_utils import maybe_allow_in_graph

src/diffusers/models/attention_processor.py
Lines changed: 24 additions & 5 deletions
@@ -14,11 +14,9 @@
 import inspect
 import math
 from typing import Callable, List, Optional, Tuple, Union
-import functools
 import torch
 import torch.nn.functional as F
 from torch import nn
-import torch_xla.debug.profiler as xp
 from ..image_processor import IPAdapterMaskProcessor
 from ..utils import deprecate, is_torch_xla_available, logging
 from ..utils.import_utils import is_torch_npu_available, is_torch_xla_version, is_xformers_available
@@ -3239,6 +3237,24 @@ def __call__(
 
         return hidden_states
 
+def scaled_dot_product_attention(q, k, v):
+    """
+    Compute the attention weights and output using scaled dot-product attention.
+
+    Args:
+        q (`torch.Tensor`):
+            Query tensor of shape (batch_size, num_heads, seq_length, head_dim).
+        k (`torch.Tensor`):
+            Key tensor of shape (batch_size, num_heads, seq_length, head_dim).
+        v (`torch.Tensor`):
+            Value tensor of shape (batch_size, num_heads, seq_length, head_dim).
+
+    Returns:
+        `torch.Tensor`: The output tensor after applying attention.
+    """
+    attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
+    attn_weights = F.softmax(attn_weights, dim=-1)
+    return torch.matmul(attn_weights, v)
 
 class AttnProcessor2_0:
     r"""
@@ -3268,7 +3284,6 @@ def __call__(
             hidden_states = attn.spatial_norm(hidden_states, temb)
 
         input_ndim = hidden_states.ndim
-
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
@@ -3311,8 +3326,8 @@ def __call__(
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        hidden_states = scaled_dot_product_attention(
+            query, key, value
         )
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
@@ -3464,6 +3479,10 @@ def __call__(
         *args,
         **kwargs,
     ) -> torch.Tensor:
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
         hidden_states = CrossAttention.apply(hidden_states, encoder_hidden_states, attn.to_q.weight, attn.to_k.weight, attn.to_v.weight, attn.heads)
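The new scaled_dot_product_attention replaces the fused F.scaled_dot_product_attention call with an explicit matmul/softmax/matmul chain that lowers cleanly through XLA; note that the swapped-in call also drops attn_mask, dropout_p, and is_causal, so this path now assumes mask-free, non-causal attention. A quick standalone sanity check (not part of the commit) that the two agree in that regime:

import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    # Explicit (Q K^T / sqrt(d)) softmax, then weighted sum over the values.
    attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
    attn_weights = F.softmax(attn_weights, dim=-1)
    return torch.matmul(attn_weights, v)

q, k, v = (torch.randn(2, 8, 64, 32) for _ in range(3))
expected = F.scaled_dot_product_attention(q, k, v)  # no mask, non-causal, no dropout
assert torch.allclose(scaled_dot_product_attention(q, k, v), expected, atol=1e-5)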

src/diffusers/models/transformers/transformer_2d.py
Lines changed: 0 additions & 2 deletions
@@ -24,7 +24,6 @@
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import LegacyModelMixin
 from ..normalization import AdaLayerNormSingle
-import torch_xla.debug.profiler as xp
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -321,7 +320,6 @@ def _init_patched_inputs(self, norm_type):
             in_features=self.caption_channels, hidden_size=self.inner_dim
         )
 
-    @xp.trace_me("Transformer2Dmodel")
     def forward(
         self,
         hidden_states: torch.Tensor,
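For later profiling runs, the removed annotation can be reinstated: torch_xla's xp.trace_me decorator simply names a scope in the captured profiler trace. A minimal sketch of the pattern (the simplified forward signature is illustrative, not the model's real one):

import torch
import torch_xla.debug.profiler as xp

class Block(torch.nn.Module):
    @xp.trace_me("Block.forward")  # label this call in the profiler timeline
    def forward(self, x):
        return x * 2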

src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
Lines changed: 5 additions & 3 deletions
@@ -1109,7 +1109,7 @@ def __call__(
 
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler, num_inference_steps, device, timesteps, sigmas
+            self.scheduler, num_inference_steps, 'cpu', timesteps, sigmas
         )
 
         # 5. Prepare latent variables
@@ -1209,8 +1209,9 @@ def __call__(
 
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
+                prompt_embeds = prompt_embeds.to(dtype=torch.bfloat16)
                 # predict the noise residual
-                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                added_cond_kwargs = {"text_embeds": add_text_embeds.to(dtype=torch.bfloat16), "time_ids": add_time_ids.to(dtype=torch.bfloat16)}
                 if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
                     added_cond_kwargs["image_embeds"] = image_embeds
                 noise_pred = self.unet(
@@ -1295,7 +1296,8 @@ def __call__(
             self.vae.to(dtype=torch.float16)
         else:
             image = latents
-
+        xm.mark_step()
+        image = image.to('cpu')
         if not output_type == "latent":
             # apply watermark if available
             if self.watermark is not None:
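The added xm.mark_step() is what actually cuts the lazily traced XLA graph: up to that point the denoising loop has only queued operations, and the barrier compiles and executes them so the following .to('cpu') returns materialized pixels rather than forcing execution mid-transfer. A standalone sketch of the pattern (illustrative, not from the commit):

import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
x = torch.randn(4, 4, device=device)
y = (x @ x).relu()    # lazily traced; nothing has executed on the device yet
xm.mark_step()        # barrier: compile and run the pending graph
y_host = y.to('cpu')  # copy the materialized result to the host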
