
Commit 96af06e

update ptxla example based on Pei's comments.
Parent: f04ee1d

3 files changed (+14 −5 lines)


examples/research_projects/pytorch_xla/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -97,7 +97,7 @@ export DATASET_NAME=lambdalabs/naruto-blip-captions
 export PER_HOST_BATCH_SIZE=32 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
 export TRAIN_STEPS=50
 export OUTPUT_DIR=/tmp/trained-model/
-python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=4 --loader_prefetch_size=4 --device_prefetch_size=4'
+python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=8 --loader_prefetch_size=4 --device_prefetch_size=4'
 
 ```
````
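The flags in this command map directly onto the script's `torch.utils.data.DataLoader` setup, so raising `--dataloader_num_workers` from 4 to 8 doubles the number of CPU worker processes feeding the TPU. One way to check whether extra workers actually help is to time the host-side input pipeline in isolation. The sketch below is a hypothetical standalone benchmark, not part of the example script; the dataset shape and sizes are stand-ins:

```python
import time

import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in for the real image dataset; small enough to build in memory.
dataset = TensorDataset(torch.randn(256, 3, 224, 224))

def batches_per_second(num_workers: int, prefetch_factor: int, batch_size: int = 32) -> float:
    """Iterate the loader once and report host-side batch throughput."""
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        prefetch_factor=prefetch_factor,  # only valid when num_workers > 0
    )
    start = time.perf_counter()
    num_batches = sum(1 for _ in loader)
    return num_batches / (time.perf_counter() - start)

if __name__ == "__main__":  # guard needed for DataLoader workers on some platforms
    for workers in (4, 8):
        rate = batches_per_second(workers, prefetch_factor=4)
        print(f"num_workers={workers}: {rate:.1f} batches/s")
```

If throughput plateaus between 4 and 8 workers, the input pipeline is not the bottleneck and the extra processes only cost host memory.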

examples/research_projects/pytorch_xla/train_text_to_image_xla.py

Lines changed: 11 additions & 2 deletions
```diff
@@ -32,7 +32,7 @@
 if is_wandb_available():
     pass
 
-PROFILE_DIR=os.environ.get('PROFILE_DIR', None)
+PROFILE_DIR = os.environ.get('PROFILE_DIR', None)
 CACHE_DIR = os.environ.get('CACHE_DIR', None)
 if CACHE_DIR:
     xr.initialize_cache(CACHE_DIR, readonly=False)
@@ -363,6 +363,14 @@ def parse_args():
             "Number of subprocesses to use for data loading to cpu."
         ),
     )
+    parser.add_argument(
+        "--loader_prefetch_factor",
+        type=int,
+        default=2,
+        help=(
+            "Number of batches loaded in advance by each worker."
+        ),
+    )
     parser.add_argument(
         "--device_prefetch_size",
         type=int,
@@ -579,7 +587,7 @@ def preprocess_train(examples):
         return examples
 
     train_dataset = dataset["train"]
-    train_dataset.set_format('torch')
+    train_dataset.set_format("torch")
     train_dataset.set_transform(preprocess_train)
 
     def collate_fn(examples):
@@ -601,6 +609,7 @@ def collate_fn(examples):
         collate_fn=collate_fn,
         num_workers=args.dataloader_num_workers,
         batch_size=args.train_batch_size,
+        prefetch_factor=args.loader_prefetch_factor,
     )
 
     train_dataloader = pl.MpDeviceLoader(
```
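Taken together, the new `--loader_prefetch_factor` flag and the existing flags give two buffering stages: the `DataLoader` keeps up to `num_workers * prefetch_factor` batches ready on the host CPU, and `MpDeviceLoader` then stages batches onto the TPU. A minimal sketch of how the pieces fit together, assuming a machine with `torch_xla` installed; `train_dataset` here is a stand-in for the script's transformed dataset:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl

# Stand-in for the script's preprocessed dataset.
train_dataset = TensorDataset(torch.randn(128, 3, 64, 64))

cpu_loader = DataLoader(
    train_dataset,
    batch_size=32,      # --train_batch_size
    num_workers=8,      # --dataloader_num_workers
    prefetch_factor=2,  # --loader_prefetch_factor (default 2); up to
                        # num_workers * prefetch_factor batches buffered on CPU
)

device = xm.xla_device()
train_dataloader = pl.MpDeviceLoader(
    cpu_loader,
    device,
    loader_prefetch_size=4,  # --loader_prefetch_size
    device_prefetch_size=4,  # --device_prefetch_size
)

for (batch,) in train_dataloader:
    pass  # batch tensors arrive already placed on the XLA device
```

Wiring the flag through `prefetch_factor` (rather than hard-coding it) lets the host-side buffer be tuned independently of the device-side staging controlled by `--device_prefetch_size`.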

src/diffusers/models/attention_processor.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -20,7 +20,7 @@
 from torch import nn
 
 from ..image_processor import IPAdapterMaskProcessor
-from ..utils import deprecate, logging, is_torch_xla_available
+from ..utils import deprecate, is_torch_xla_available, logging
 from ..utils.import_utils import is_torch_npu_available, is_xformers_available
 from ..utils.torch_utils import is_torch_version, maybe_allow_in_graph
 
@@ -2484,7 +2484,7 @@ def __call__(
         attention_mask = attention_mask.view(batch_size, 1, 1, attention_mask.shape[-1])
         # Convert mask to float and replace 0s with -inf and 1s with 0
         attention_mask = attention_mask.float().masked_fill(attention_mask == 0, float('-inf')).masked_fill(attention_mask == 1, float(0.0))
-
+
         # Apply attention mask to key
         key = key + attention_mask
         query /= math.sqrt(query.shape[3])
```
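The hunk above (the change itself is only trailing-whitespace removal) sits in the standard additive-mask pattern: a binary attention mask (1 = attend, 0 = ignore) is converted to a float mask of 0.0 / -inf so it can simply be added to the attention computation before the softmax. A small self-contained illustration of that conversion:

```python
import torch

# Binary mask: 1 = attend, 0 = ignore (shape: batch x key_len).
attention_mask = torch.tensor([[1, 1, 0, 0]])

# Same conversion as in the hunk above: 0 -> -inf, 1 -> 0.0.
additive = (
    attention_mask.float()
    .masked_fill(attention_mask == 0, float("-inf"))
    .masked_fill(attention_mask == 1, 0.0)
)
print(additive)  # tensor([[0., 0., -inf, -inf]])

# Adding the float mask to raw attention scores drives masked positions
# to -inf, so softmax assigns them exactly zero weight.
scores = torch.randn(1, 4) + additive
print(scores.softmax(dim=-1))  # last two entries are 0
```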
