
Commit 3e0cd93

Add gradient checkpointing

1 parent 2c0b2a7

File tree

3 files changed: +55 -19 lines changed


examples/research_projects/pytorch_xla/training/text_to_image/train_text_to_image_sdxl.py

Lines changed: 46 additions & 9 deletions
@@ -21,8 +21,13 @@
 from torchvision import transforms
 from torchvision.transforms.functional import crop
 from transformers import CLIPTextModel, CLIPTextModelWithProjection, AutoTokenizer
+from transformers.trainer_pt_utils import get_module_class_from_name
 from viztracer import VizTracer
 
+from torch._dispatch.python import suspend_functionalization
+from torch._subclasses.functional_tensor import disable_functional_mode
+
+from torch_xla.distributed.fsdp import checkpoint_module
 from diffusers import (
     AutoencoderKL,
     DDPMScheduler,
@@ -118,6 +123,35 @@ def main(args):
     model_card.save(os.path.join(repo_folder, "README.md"))
 
 
+def wrap_module(
+    mod: torch.nn.Module, transform, prefix: tuple[str, ...] = tuple()
+) -> torch.nn.Module:
+    """
+    Recursively transforms the modules by calling `transform` on them.
+
+    You may use this to apply sharding, checkpointing, optimization barriers, etc.
+
+    Start from the leaf modules and work our way up, to handle cases where one
+    module is the child of another. The child modules will be transformed first,
+    and then the parent module will be transformed, possibly with transformed
+    children.
+    """
+    new_children = {}
+    for name, child in mod.named_children():
+        new_children[name] = wrap_module(child, transform, prefix + (name,))
+    for name, new_child in new_children.items():
+        mod.set_submodule(name, new_child)
+    return transform(mod)
+
+def add_checkpoints(model):
+    remat_classes = [get_module_class_from_name(model, "BasicTransformerBlock")]
+    import pdb; pdb.set_trace()
+    def maybe_checkpoint(mod):
+        if isinstance(mod, tuple(remat_classes)):
+            return checkpoint_module(mod)
+        return mod
+    return wrap_module(model, maybe_checkpoint)
+
 class TrainSD:
     def __init__(
         self,
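To see the leaf-first recursion described in the docstring above in isolation, here is a minimal sketch that runs without torch_xla. The Block class and the tag_blocks transform are hypothetical stand-ins for the commit's BasicTransformerBlock and maybe_checkpoint; the real transform would return checkpoint_module(mod) on an XLA device.

import torch
import torch.nn as nn


def wrap_module(mod, transform, prefix=()):
    # Children are rewritten first, then the (possibly rewritten) parent.
    new_children = {}
    for name, child in mod.named_children():
        new_children[name] = wrap_module(child, transform, prefix + (name,))
    for name, new_child in new_children.items():
        setattr(mod, name, new_child)  # the commit uses mod.set_submodule(name, new_child)
    return transform(mod)


class Block(nn.Module):  # hypothetical stand-in for BasicTransformerBlock
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    def forward(self, x):
        return torch.relu(self.proj(x))


model = nn.Sequential(Block(), nn.Sequential(Block(), Block()))
seen = []


def tag_blocks(mod):
    # Stand-in for maybe_checkpoint: record which modules would be wrapped.
    if isinstance(mod, Block):
        seen.append(mod)
    return mod


wrap_module(model, tag_blocks)
print(len(seen))  # 3: every Block is visited before its parent containers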
@@ -163,13 +197,14 @@ def start_training(self):
                 tracer = VizTracer()
             else:
                 tracer = None
-            loss = self.step_fn(
-                tracer,
-                batch["model_input"],
-                batch["prompt_embeds"],
-                batch["pooled_prompt_embeds"],
-                batch["original_sizes"],
-                batch["crop_top_lefts"])
+            with suspend_functionalization(), disable_functional_mode():
+                loss = self.step_fn(
+                    tracer,
+                    batch["model_input"],
+                    batch["prompt_embeds"],
+                    batch["pooled_prompt_embeds"],
+                    batch["original_sizes"],
+                    batch["crop_top_lefts"])
             self.global_step += 1
 
             def print_loss_closure(step, loss):
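The with-block introduced above relies on private torch internals (torch._dispatch.python and torch._subclasses.functional_tensor), so availability depends on the installed torch version. A stripped-down sketch of the same pattern, with a placeholder step_fn standing in for TrainSD.step_fn:

import torch
from torch._dispatch.python import suspend_functionalization
from torch._subclasses.functional_tensor import disable_functional_mode


def step_fn(batch):
    # Placeholder for TrainSD.step_fn: any forward/backward work goes here.
    return (batch * 2.0).mean()


batch = torch.ones(4)
# Both context managers are torch-internal and suspend functionalization
# for the ops executed inside the block.
with suspend_functionalization(), disable_functional_mode():
    loss = step_fn(batch)
print(loss.item())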
@@ -647,9 +682,9 @@ def main(args):
         use_fast=False
     )
 
-    from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
+    # from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
 
-    unet = apply_xla_patch_to_nn_linear(unet, xs.xla_patched_nn_linear_forward)
+    # unet = apply_xla_patch_to_nn_linear(unet, xs.xla_patched_nn_linear_forward)
     unet.enable_xla_flash_attention(partition_spec=("data", None, None, None))
 
     vae.requires_grad_(False)
@@ -810,6 +845,8 @@ def collate_fn(examples):
         f"Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * num_hosts}"
     )
     print(f" Total optimization steps = {args.max_train_steps}")
+
+    unet = add_checkpoints(unet)
 
     trainer = TrainSD(
         weight_dtype=weight_dtype,
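As background on what the wrapped blocks do once add_checkpoints(unet) runs: gradient checkpointing drops intermediate activations in the forward pass and recomputes them during backward, trading extra compute for lower peak memory. A generic, non-XLA illustration using torch.utils.checkpoint (not the checkpoint_module path this commit uses, just the same idea in upstream PyTorch):

import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())
x = torch.randn(2, 16, requires_grad=True)

# Activations inside `layer` are not kept; they are recomputed during backward.
y = checkpoint(layer, x, use_reentrant=False)
y.sum().backward()
print(x.grad.shape)  # torch.Size([2, 16])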

src/diffusers/models/attention_processor.py

Lines changed: 7 additions & 8 deletions
@@ -3390,12 +3390,12 @@ def scaled_dot_product_attention_jax(query, key, value):
     # x = wrapped_attention(query, key, value)
     # return x
 
-@functools.lru_cache(maxsize=16)
+@functools.lru_cache(maxsize=256)
 def _get_jax_forward_function():
     """Cached factory function to create JAX forward functions"""
     return scaled_dot_product_attention_jax
 
-@functools.lru_cache(maxsize=16)
+@functools.lru_cache(maxsize=256)
 def _get_jax_backward_function():
     """Cached factory function to create JAX backward functions"""
     jax_f = _get_jax_forward_function()
@@ -3419,14 +3419,12 @@ def scaled_dot_product_attention_jax_wrapper(query, key, value, grad_output=None
 class JaxFun(torch.autograd.Function):
     @staticmethod
     def forward(ctx, query, key, value):
-        # sample_inputs = [abstractify(query), abstractify(key), abstractify(value)]
         ctx.save_for_backward(query, key, value)
         out = scaled_dot_product_attention_jax_wrapper(query, key, value)
         return out
 
     @staticmethod
     def backward(ctx, grad_out):
-        # import pdb; pdb.set_trace()
         query, key, value = ctx.saved_tensors
         q_grad, k_grad, v_grad = scaled_dot_product_attention_jax_wrapper(query, key, value, grad_output=grad_out, is_forward=False)
         return q_grad, k_grad, v_grad
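For readers unfamiliar with the pattern, JaxFun follows the standard torch.autograd.Function protocol: stash inputs with save_for_backward in forward, read them back in backward, and return one gradient per input. A minimal self-contained example of that protocol (squaring instead of attention, so it runs without the JAX bridge):

import torch


class Square(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        return 2 * x * grad_out  # d(x^2)/dx = 2x, chained with grad_out


x = torch.randn(3, requires_grad=True)
Square.apply(x).sum().backward()
print(torch.allclose(x.grad, 2 * x))  # True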
@@ -3453,6 +3451,7 @@ def flash_attention(self, query, key, value):
         p = self.partition_spec if is_spmd() else None
         return flash_attention(query, key, value, causal=False, partition_spec=p)
 
+    @xp.trace_me("scaled_dot_product_attention")
     def scaled_dot_product_attention(self, query, key, value) -> torch.Tensor:
         scale_factor = 1 / math.sqrt(query.size(-1))
         attn_weight = query @ key.transpose(-2, -1) * scale_factor
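The hunk only shows the first two lines of the manual attention path; assuming the method continues with the usual softmax(QK^T * scale) @ V formulation, the quick check below (independent of the commit) confirms that this math matches torch's built-in SDPA kernel:

import math

import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 2, 5, 8) for _ in range(3))

scale_factor = 1 / math.sqrt(q.size(-1))
attn_weight = torch.softmax(q @ k.transpose(-2, -1) * scale_factor, dim=-1)
manual = attn_weight @ v

print(torch.allclose(manual, F.scaled_dot_product_attention(q, k, v), atol=1e-5))  # True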
@@ -3537,10 +3536,10 @@ def __call__(
             # logger.warning(
             #     "Unable to use the flash attention pallas kernel API call due to QKV sequence length < 4096."
             # )
-            # hidden_states = self.scaled_dot_product_attention(
-            #     query, key, value
-            # )
-            hidden_states = JaxFun.apply(query, key, value)
+            hidden_states = self.scaled_dot_product_attention(
+                query, key, value
+            )
+            # hidden_states = JaxFun.apply(query, key, value)
 
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)

train.sh

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 export XLA_DISABLE_FUNCTIONALIZATION=1
-export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
+# export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
 export PROFILE_DIR=/mnt/bbahl/xla_profile/
 export CACHE_DIR=/mnt/bbahl/xla_cache/
 export DATASET_NAME=lambdalabs/naruto-blip-captions
-export PER_HOST_BATCH_SIZE=32 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
+export PER_HOST_BATCH_SIZE=40 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
 export TRAIN_STEPS=50
 export PROFILE_START_STEP=10
 export OUTPUT_DIR=/tmp/trained-model/
