Commit 015cc78
1 parent c76dc5a

gradient checkpointing

File tree: 2 files changed (+80, -22 lines)

2 files changed

+80
-22
lines changed

src/diffusers/models/autoencoders/autoencoder_kl_allegro.py

Lines changed: 50 additions & 12 deletions
@@ -512,12 +512,26 @@ def forward(self, sample: torch.Tensor) -> torch.Tensor:
         sample = self.temp_conv_in(sample)
         sample = sample + residual

-        # Down blocks
-        for down_block in self.down_blocks:
-            sample = down_block(sample)
+        if self.gradient_checkpointing:
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)

-        # Mid block
-        sample = self.mid_block(sample)
+                return custom_forward
+
+            # Down blocks
+            for down_block in self.down_blocks:
+                sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
+
+            # Mid block
+            sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
+        else:
+            # Down blocks
+            for down_block in self.down_blocks:
+                sample = down_block(sample)
+
+            # Mid block
+            sample = self.mid_block(sample)

         # Post process
         sample = sample.permute(0, 2, 1, 3, 4).flatten(0, 1)
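
In the encoder, the down and mid blocks are now routed through torch.utils.checkpoint.checkpoint: activations inside each wrapped block are dropped during the forward pass and recomputed during backward, trading extra compute for lower peak memory. Below is a minimal, self-contained sketch of the same wrapper idiom using toy modules (the `blocks` list and layer sizes are illustrative, not part of the Allegro code):

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

# Toy stand-ins for the encoder's down/mid blocks (illustrative sizes, not Allegro's).
blocks = nn.ModuleList(nn.Sequential(nn.Linear(64, 64), nn.GELU()) for _ in range(4))


def create_custom_forward(module):
    # Hand checkpoint() a plain callable it can re-invoke during the backward pass.
    def custom_forward(*inputs):
        return module(*inputs)

    return custom_forward


sample = torch.randn(8, 64, requires_grad=True)
for block in blocks:
    # Activations inside `block` are not kept; they are recomputed when backward runs.
    # use_reentrant=False needs torch >= 1.11; the VAE hunks above rely on the default instead.
    sample = checkpoint(create_custom_forward(block), sample, use_reentrant=False)
sample.sum().backward()
```

The `create_custom_forward` indirection simply gives `checkpoint` a plain callable to replay, which is the same pattern the hunk above uses for the real down and mid blocks.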
@@ -625,7 +639,6 @@ def __init__(
         self.temp_conv_out = nn.Conv3d(block_out_channels[0], block_out_channels[0], (3, 1, 1), padding=(1, 0, 0))
         self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)

-        # TODO(aryan): implement gradient checkpointing
         self.gradient_checkpointing = False

     def forward(self, sample: torch.Tensor) -> torch.Tensor:
@@ -641,13 +654,34 @@ def forward(self, sample: torch.Tensor) -> torch.Tensor:

         upscale_dtype = next(iter(self.up_blocks.parameters())).dtype

-        # Mid block
-        sample = self.mid_block(sample)
-        sample = sample.to(upscale_dtype)
+        if self.gradient_checkpointing:
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            # Mid block
+            sample = torch.utils.checkpoint.checkpoint(
+                create_custom_forward(self.mid_block),
+                sample
+            )
+
+            # Up blocks
+            for up_block in self.up_blocks:
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(up_block),
+                    sample
+                )

-        # Up blocks
-        for up_block in self.up_blocks:
-            sample = up_block(sample)
+        else:
+            # Mid block
+            sample = self.mid_block(sample)
+            sample = sample.to(upscale_dtype)
+
+            # Up blocks
+            for up_block in self.up_blocks:
+                sample = up_block(sample)

         # Post process
         sample = sample.permute(0, 2, 1, 3, 4).flatten(0, 1)
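
The decoder applies the identical pattern to its mid and up blocks. Since checkpointing only changes when activations are materialized, not the math, outputs and gradients should match the plain path for deterministic blocks; a quick parity check with a toy module (illustrative, not the Allegro decoder) could look like this:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

torch.manual_seed(0)
block = nn.Sequential(nn.Conv3d(4, 4, 3, padding=1), nn.SiLU())  # toy stand-in for an up block
x = torch.randn(1, 4, 2, 8, 8)

# Plain forward/backward.
a = x.clone().requires_grad_(True)
block(a).sum().backward()

# Checkpointed forward/backward: same math, activations recomputed during backward.
b = x.clone().requires_grad_(True)
checkpoint(lambda inp: block(inp), b, use_reentrant=False).sum().backward()

print(torch.allclose(a.grad, b.grad))  # expected: True for deterministic blocks
```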
@@ -783,6 +817,10 @@ def __init__(
             self.sample_size - self.tile_overlap[1],
         )  # (16, 112, 192)

+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (AllegroEncoder3D, AllegroDecoder3D)):
+            module.gradient_checkpointing = value
+
     def encode(
         self, input_imgs: torch.Tensor, return_dict: bool = True, local_batch_size=1
     ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
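
The new `_set_gradient_checkpointing` hook gives the top-level `AutoencoderKLAllegro` a way to flip the `gradient_checkpointing` flag on its encoder and decoder. In diffusers, `ModelMixin.enable_gradient_checkpointing()` typically drives hooks like this by applying them across all submodules; the standalone sketch below only imitates that mechanism with hypothetical `TinyEncoder`/`TinyVAE` classes and is not the library code:

```python
from functools import partial

import torch.nn as nn


class TinyEncoder(nn.Module):  # hypothetical stand-in for AllegroEncoder3D
    def __init__(self):
        super().__init__()
        self.gradient_checkpointing = False


class TinyVAE(nn.Module):  # hypothetical stand-in for AutoencoderKLAllegro
    def __init__(self):
        super().__init__()
        self.encoder = TinyEncoder()

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, TinyEncoder):
            module.gradient_checkpointing = value

    def enable_gradient_checkpointing(self):
        # Apply the hook to every submodule, roughly what ModelMixin does.
        self.apply(partial(self._set_gradient_checkpointing, value=True))


vae = TinyVAE()
vae.enable_gradient_checkpointing()
print(vae.encoder.gradient_checkpointing)  # True
```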

src/diffusers/models/transformers/transformer_allegro.py

Lines changed: 30 additions & 10 deletions
@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional, Tuple
+from typing import Any, Dict, Optional, Tuple

 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import logging
+from ...utils import is_torch_version, logging
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import FeedForward
 from ..attention_processor import AllegroAttnProcessor2_0, Attention
@@ -335,14 +335,34 @@ def forward(

         for i, block in enumerate(self.transformer_blocks):
             # TODO(aryan): Implement gradient checkpointing
-            hidden_states = block(
-                hidden_states=hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                temb=timestep,
-                attention_mask=attention_mask,
-                encoder_attention_mask=encoder_attention_mask,
-                image_rotary_emb=image_rotary_emb,
-            )
+            if self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    timestep,
+                    attention_mask,
+                    encoder_attention_mask,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=timestep,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    image_rotary_emb=image_rotary_emb,
+                )

         # 3. Output
         shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
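
In the transformer, the checkpointed call additionally passes `use_reentrant=False`, but only when the installed PyTorch is at least 1.11, the release that introduced the non-reentrant checkpoint implementation; on older versions `ckpt_kwargs` stays empty and the default reentrant path is used. A standalone sketch of the same guard, using `packaging` in place of diffusers' `is_torch_version` helper, might look like this:

```python
from typing import Any, Dict

import torch
import torch.nn as nn
from packaging import version
from torch.utils.checkpoint import checkpoint

# Only pass use_reentrant on torch >= 1.11, where the non-reentrant path exists.
ckpt_kwargs: Dict[str, Any] = (
    {"use_reentrant": False} if version.parse(torch.__version__).release >= (1, 11) else {}
)

block = nn.Linear(16, 16)  # toy stand-in for an Allegro transformer block
hidden_states = torch.randn(2, 16, requires_grad=True)

# checkpoint() re-plays the wrapped callable with these positional inputs during backward.
hidden_states = checkpoint(lambda *inputs: block(*inputs), hidden_states, **ckpt_kwargs)
hidden_states.sum().backward()
```

Note that the reentrant checkpoint does not forward keyword arguments to the wrapped function, which is presumably why the checkpointed branch in the hunk passes the block's inputs positionally instead of by name.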
