
Commit fad8020

optimizations - XLA_DISABLE_FUNCTIONALIZATION=1
1 parent 2001ed2 commit fad8020

4 files changed: +29 -34 lines changed

examples/research_projects/pytorch_xla/training/text_to_image/README_sdxl.md

Lines changed: 11 additions & 10 deletions
@@ -44,13 +44,15 @@ Install PyTorch and PyTorch/XLA nightly versions:
 gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
 --project=${PROJECT_ID} --zone=${ZONE} --worker=all \
 --command='
-pip install torch==2.6.0+cpu.cxx11.abi \
-https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0%2Bcxx11-cp310-cp310-manylinux_2_28_x86_64.whl \
-'torch_xla[tpu]' \
+pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
+pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev-cp310-cp310-linux_x86_64.whl' \
 -f https://storage.googleapis.com/libtpu-releases/index.html \
--f https://storage.googleapis.com/libtpu-wheels/index.html \
--f https://download.pytorch.org/whl/torch
-pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+-f https://storage.googleapis.com/libtpu-wheels/index.html
+
+# Optional: if you're using custom kernels, install pallas dependencies
+pip install 'torch_xla[pallas]' \
+-f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html \
+-f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 '
 ```
 
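For anyone reviewing this locally, a quick sanity check after the nightly install (a sketch of my own, not part of the README; it only assumes the wheels above installed cleanly):

```python
# Sanity-check sketch: confirm the nightly torch/torch_xla wheels import
# and that a TPU device is visible.
import torch
import torch_xla.core.xla_model as xm

print(torch.__version__)   # expect a nightly/dev version string
print(xm.xla_device())     # expect an XLA device such as xla:0
```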
@@ -72,7 +74,6 @@ cd diffusers
 git checkout main
 cd examples/research_projects/pytorch_xla/training/text_to_image/
 pip3 install -r requirements_sdxl.txt
-pip3 install pillow --upgrade
 cd ../../../../../
 pip3 install .'
 ```
@@ -94,14 +95,14 @@ are fixed.
 gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
 --project=${PROJECT_ID} --zone=${ZONE} --worker=all \
 --command='
-export XLA_DISABLE_FUNCTIONALIZATION=0
+export XLA_DISABLE_FUNCTIONALIZATION=1
 export PROFILE_DIR=/tmp/
 export CACHE_DIR=/tmp/
 export DATASET_NAME=lambdalabs/naruto-blip-captions
-export PER_HOST_BATCH_SIZE=32 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
+export GLOBAL_BATCH_SIZE=32
 export TRAIN_STEPS=50
 export OUTPUT_DIR=/tmp/trained-model/
-python diffusers/examples/research_projects/pytorch_xla/training/text_to_image/train_text_to_image_sdxl.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 --dataset_name=$DATASET_NAME --resolution=1024 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=8 --loader_prefetch_size=4 --device_prefetch_size=4'
+python examples/research_projects/pytorch_xla/training/text_to_image/train_text_to_image_sdxl.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 --dataset_name=$DATASET_NAME --resolution=1024 --center_crop --random_flip --train_batch_size=$GLOBAL_BATCH_SIZE --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=8 --loader_prefetch_size=16 --device_prefetch_size=16'
 ```
 
 Pass `--print_loss` if you would like to see the loss printed at every step. Be aware that printing the loss at every step disrupts the optimized flow execution, thus the step time will be longer.
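The `--print_loss` caveat exists because printing a lazy XLA tensor forces a host-device sync mid-step. The usual PyTorch/XLA workaround is a step closure; a minimal sketch of that pattern (the script's actual `print_loss_closure` may differ):

```python
import torch_xla.core.xla_model as xm

def print_loss_closure(step, loss):
    # Runs after the step's graph executes, so reading loss does not
    # interrupt tracing of the next step.
    print(f"step: {step}, loss: {loss}")

# Inside the training loop:
# xm.add_step_closure(print_loss_closure, args=(step, loss), run_async=True)
```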

examples/research_projects/pytorch_xla/training/text_to_image/requirements_sdxl.txt

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,4 @@
 accelerate>=0.16.0
-torch==2.5.1
-torchvision==0.20.1
 transformers>=4.25.1
 datasets>=2.19.1
 ftfy

examples/research_projects/pytorch_xla/training/text_to_image/train_text_to_image_sdxl.py

Lines changed: 13 additions & 15 deletions
@@ -31,6 +31,7 @@
 from diffusers.utils import is_wandb_available
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 
+# torch._dynamo.config.force_parameter_static_shapes = False
 
 if is_wandb_available():
     pass
@@ -148,16 +149,12 @@ def start_training(self):
         for step in range(0, self.args.max_train_steps):
             print("step: ", step)
             batch = next(self.dataloader)
-            if step == measure_start_step and PROFILE_DIR is not None:
-                xm.wait_device_ops()
-                xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
+            if step == measure_start_step:
+                if PROFILE_DIR is not None:
+                    xm.wait_device_ops()
+                    xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
                 last_time = time.time()
-            loss = self.step_fn(
-                batch["model_input"],
-                batch["prompt_embeds"],
-                batch["pooled_prompt_embeds"],
-                batch["original_sizes"],
-                batch["crop_top_lefts"])
+            loss = self.step_fn(batch)
             self.global_step += 1
 
         def print_loss_closure(step, loss):
@@ -182,15 +179,15 @@ def print_loss_closure(step, loss):
 
     def step_fn(
         self,
-        model_input,
-        prompt_embeds,
-        pooled_prompt_embeds,
-        original_sizes,
-        crop_top_lefts
+        batch
     ):
         with xp.Trace("model.forward"):
             self.optimizer.zero_grad()
-
+            model_input = batch["model_input"]
+            prompt_embeds = batch["prompt_embeds"]
+            pooled_prompt_embeds = batch["pooled_prompt_embeds"]
+            original_sizes = batch["original_sizes"]
+            crop_top_lefts = batch["crop_top_lefts"]
 
             noise = torch.randn_like(model_input).to(self.device, dtype=self.weight_dtype)
             bsz = model_input.shape[0]
@@ -638,6 +635,7 @@ def main(args):
     text_encoder_2 = text_encoder_2.to(device, dtype=weight_dtype)
     vae = vae.to(device, dtype=weight_dtype)
     unet = unet.to(device, dtype=weight_dtype)
+    #unet = torch.compile(unet, backend='openxla', dynamic=True)
     optimizer = setup_optimizer(unet, args)
     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
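The commented-out `torch.compile` line pairs with the commented-out dynamo flag at the top of the file. If enabled, the path would look roughly like this sketch (assumptions: the `openxla` dynamo backend is registered by torch_xla, and the model is already on the XLA device; the Linear module is a stand-in for the unet):

```python
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
model = torch.nn.Linear(8, 8).to(device)

# dynamic=True asks dynamo not to specialize on input shapes; the module-level
# force_parameter_static_shapes = False flag would relax parameter shapes too.
compiled = torch.compile(model, backend="openxla", dynamic=True)
out = compiled(torch.randn(4, 8, device=device))
xm.mark_step()  # cut the lazy graph so the compiled step actually executes
```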

src/diffusers/models/attention_processor.py

Lines changed: 5 additions & 7 deletions
@@ -3241,7 +3241,6 @@ def __call__(
 def xla_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
     L, S = query.size(-2), key.size(-2)
     scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
-    attn_bias = torch.zeros(L, S, dtype=query.dtype)
     if is_causal:
         assert attn_mask is None
         temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
@@ -3254,7 +3253,6 @@ def xla_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_
         else:
             attn_bias += attn_mask
     attn_weight = query @ key.transpose(-2, -1) * scale_factor
-    attn_weight += attn_bias
     attn_weight = torch.softmax(attn_weight, dim=-1)
     attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
     return attn_weight @ value
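Taken together, the two deletions mean `attn_bias` is never allocated or added on the mask-free path, which is the only path this training flow exercises (`attn_mask=None`, `is_causal=False`); note the causal/masked branches still reference `attn_bias` and would need the allocation restored if ever taken. A condensed sketch of the resulting fast path:

```python
import math
import torch

def sdpa_no_bias(query, key, value, dropout_p=0.0, scale=None):
    # Mask-free scaled dot-product attention: no bias tensor is created,
    # saving a lazy-tensor allocation and add per attention call.
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
    return attn_weight @ value
```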
@@ -3330,7 +3328,7 @@ def __call__(
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
-        hidden_states = self.xla_scaled_dot_product_attention(
+        hidden_states = F.scaled_dot_product_attention(
             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
         )
 
@@ -3428,7 +3426,7 @@ def __call__(
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         if all(tensor.shape[2] >= 4096 for tensor in [query, key, value]):
-            logger.warning("Using flash attention")
+            # logger.warning("Using flash attention")
             if attention_mask is not None:
                 attention_mask = attention_mask.view(batch_size, 1, 1, attention_mask.shape[-1])
                 # Convert mask to float and replace 0s with -inf and 1s with 0
@@ -3444,9 +3442,9 @@ def __call__(
             partition_spec = self.partition_spec if is_spmd() else None
             hidden_states = flash_attention(query, key, value, causal=False, partition_spec=partition_spec)
         else:
-            logger.warning(
-                "Unable to use the flash attention pallas kernel API call due to QKV sequence length < 4096."
-            )
+            # logger.warning(
+            #     "Unable to use the flash attention pallas kernel API call due to QKV sequence length < 4096."
+            # )
             hidden_states = xla_scaled_dot_product_attention(
                 query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
             )
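With the warnings silenced, the surviving control flow reduces to a length-gated dispatch. A sketch (assuming the usual `flash_attention` import from torch_xla's custom kernels, and `xla_scaled_dot_product_attention` as defined above; the mask-to-float conversion on the flash path is elided):

```python
from torch_xla.experimental.custom_kernel import flash_attention

def dispatch_attention(query, key, value, attention_mask, partition_spec):
    # The Pallas flash-attention kernel requires every QKV sequence length
    # to be at least 4096; otherwise fall back to the plain XLA SDPA.
    if all(t.shape[2] >= 4096 for t in (query, key, value)):
        return flash_attention(query, key, value, causal=False, partition_spec=partition_spec)
    return xla_scaled_dot_product_attention(
        query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
    )
```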
