update

DN6 · DN6 · commit 1fdae85f494a · 2024-08-14T14:19:20.000+02:00
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
@@ -449,7 +449,7 @@ def forward(
             norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
         elif self.norm_type == "ada_norm_single":
             shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
-                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+                self.scale_shift_table[None].to(timestep.dtype) + timestep.reshape(batch_size, 6, -1)
             ).chunk(6, dim=1)
             norm_hidden_states = self.norm1(hidden_states)
             norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py
@@ -60,6 +60,8 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
             Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
     """
 
+    _always_upcast_modules = ["MaskConditionDecoder"]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -70,6 +70,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
 
     _supports_gradient_checkpointing = True
     _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
+    _always_upcast_modules = ["Decoder"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -192,6 +192,7 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
     """
 
     _supports_gradient_checkpointing = True
+    _always_upcast_modules = ["TemporalDecoder"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/autoencoders/autoencoder_oobleck.py b/src/diffusers/models/autoencoders/autoencoder_oobleck.py
@@ -317,6 +317,7 @@ class AutoencoderOobleck(ModelMixin, ConfigMixin):
     """
 
     _supports_gradient_checkpointing = False
+    _always_upcast_modules = ["OobleckEncoder", "OobleckDecoder"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
@@ -330,7 +330,7 @@ def decode(
             Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output.
 
         """
-        z = (z * self.config.scaling_factor - self.means) / self.stds
+        z = (z * self.config.scaling_factor - self.means.to(z.dtype)) / self.stds.to(z.dtype)
 
         scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
         z = F.interpolate(z, mode="nearest", scale_factor=scale_factor)
diff --git a/src/diffusers/models/autoencoders/vq_model.py b/src/diffusers/models/autoencoders/vq_model.py
@@ -71,6 +71,8 @@ class VQModel(ModelMixin, ConfigMixin):
             Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
     """
 
+    _always_upcast_modules = ["Decoder", "VectorQuantizer"]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
@@ -264,28 +264,61 @@ def disable_xformers_memory_efficient_attention(self) -> None:
         self.set_use_memory_efficient_attention_xformers(False)
 
     def enable_layerwise_upcasting(self, upcast_dtype=None):
+        r"""
+        Enable layerwise dynamic upcasting. This allows models to be loaded into the GPU in a low memory dtype e.g.
+        torch.float8_e4m3fn, but perform inference using a dtype that is supported by the GPU, by upcasting the
+        individual modules in the model to the appropriate dtype right before the forward pass.
+
+        The module is then moved back to the low memory dtype after the forward pass.
+        """
+
         upcast_dtype = upcast_dtype or torch.float32
-        downcast_dtype = self.dtype
+        original_dtype = self.dtype
 
-        def upcast_hook_fn(module):
+        def upcast_dtype_hook_fn(module, *args, **kwargs):
             module = module.to(upcast_dtype)
 
-        def downcast_hook_fn(module):
-            module = module.to(downcast_dtype)
+        def cast_to_original_dtype_hook_fn(module, *args, **kwargs):
+            module = module.to(original_dtype)
 
         def fn_recursive_upcast(module):
+            """In certain cases modules will apply casting internally or reference the dtype of internal blocks.
+
+            e.g.
+
+            ```
+            class MyModel(nn.Module):
+                def forward(self, x):
+                    dtype = next(iter(self.blocks.parameters())).dtype
+                    x = self.blocks(x) + torch.ones(x.size()).to(dtype)
+            ```
+            Layerwise upcasting will not work here, since the internal blocks remain in the low memory dtype until
+            their `forward` method is called. We need to add the upcast hook on the entire module in order for the
+            operation to work.
+
+            The `_always_upcast_modules` class attribute is a list of modules within the model that we must upcast
+            entirely, rather than layerwise.
+
+            """
+            if hasattr(self, "_always_upcast_modules") and module.__class__.__name__ in self._always_upcast_modules:
+                # Upcast entire module and exit recursion
+                module.register_forward_pre_hook(upcast_dtype_hook_fn)
+                module.register_forward_hook(cast_to_original_dtype_hook_fn)
+
+                return
+
             has_children = list(module.children())
             if not has_children:
-                module.register_forward_pre_hook(upcast_hook_fn)
-                module.register_forward_hook(downcast_hook_fn)
+                module.register_forward_pre_hook(upcast_dtype_hook_fn)
+                module.register_forward_hook(cast_to_original_dtype_hook_fn)
 
             for child in module.children():
                 fn_recursive_upcast(child)
 
         for module in self.children():
             fn_recursive_upcast(module)
 
-    def disable_dynamic_upcasting(self):
+    def disable_layerwise_upcasting(self):
         def fn_recursive_upcast(module):
             has_children = list(module.children())
             if not has_children:
diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -259,6 +259,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
     """
 
     _supports_gradient_checkpointing = True
+    _always_upcast_modules = ["AuraFlowPatchEmbed"]
 
     @register_to_config
     def __init__(
@@ -440,11 +441,15 @@ def forward(
 
         # Apply patch embedding, timestep embedding, and project the caption embeddings.
         hidden_states = self.pos_embed(hidden_states)  # takes care of adding positional embeddings too.
-        temb = self.time_step_embed(timestep).to(dtype=next(self.parameters()).dtype)
+        temb = self.time_step_embed(timestep).to(dtype=hidden_states.dtype)
         temb = self.time_step_proj(temb)
         encoder_hidden_states = self.context_embedder(encoder_hidden_states)
         encoder_hidden_states = torch.cat(
-            [self.register_tokens.repeat(encoder_hidden_states.size(0), 1, 1), encoder_hidden_states], dim=1
+            [
+                self.register_tokens.to(encoder_hidden_states.dtype).repeat(encoder_hidden_states.size(0), 1, 1),
+                encoder_hidden_states,
+            ],
+            dim=1,
         )
 
         # MMDiT blocks.
diff --git a/src/diffusers/models/transformers/dit_transformer_2d.py b/src/diffusers/models/transformers/dit_transformer_2d.py
@@ -65,6 +65,7 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
     """
 
     _supports_gradient_checkpointing = True
+    _always_upcast_modules = ["PatchEmbed"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/transformers/hunyuan_transformer_2d.py b/src/diffusers/models/transformers/hunyuan_transformer_2d.py
@@ -244,6 +244,8 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
             Whether or not to use style condition and image meta size. True for version <=1.1, False for version >= 1.2
     """
 
+    _always_upcast_modules = ["HunyuanDiTAttentionPool"]
+
     @register_to_config
     def __init__(
         self,
@@ -484,7 +486,9 @@ def forward(
         text_embedding_mask = torch.cat([text_embedding_mask, text_embedding_mask_t5], dim=-1)
         text_embedding_mask = text_embedding_mask.unsqueeze(2).bool()
 
-        encoder_hidden_states = torch.where(text_embedding_mask, encoder_hidden_states, self.text_embedding_padding)
+        encoder_hidden_states = torch.where(
+            text_embedding_mask, encoder_hidden_states, self.text_embedding_padding.to(encoder_hidden_states.dtype)
+        )
 
         skips = []
         for layer, block in enumerate(self.blocks):
diff --git a/src/diffusers/models/transformers/latte_transformer_3d.py b/src/diffusers/models/transformers/latte_transformer_3d.py
@@ -64,6 +64,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
         video_length (`int`, *optional*):
             The number of frames in the video-like data.
     """
+    _always_upcast_modules = ["PatchEmbed"]
 
     @register_to_config
     def __init__(
@@ -301,7 +302,9 @@ def forward(
                 hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
 
         embedded_timestep = embedded_timestep.repeat_interleave(num_frame, dim=0).view(-1, embedded_timestep.shape[-1])
-        shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+        shift, scale = (self.scale_shift_table[None].to(embedded_timestep.dtype) + embedded_timestep[:, None]).chunk(
+            2, dim=1
+        )
         hidden_states = self.norm_out(hidden_states)
         # Modulation
         hidden_states = hidden_states * (1 + scale) + shift
diff --git a/src/diffusers/models/transformers/pixart_transformer_2d.py b/src/diffusers/models/transformers/pixart_transformer_2d.py
@@ -79,6 +79,7 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
 
     _supports_gradient_checkpointing = True
     _no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
+    _always_upcast_modules = ["PatchEmbed"]
 
     @register_to_config
     def __init__(
@@ -414,7 +415,8 @@ def custom_forward(*inputs):
 
         # 3. Output
         shift, scale = (
-            self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
+            self.scale_shift_table[None].to(embedded_timestep.dtype)
+            + embedded_timestep[:, None].to(self.scale_shift_table.device)
         ).chunk(2, dim=1)
         hidden_states = self.norm_out(hidden_states)
         # Modulation
diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py
@@ -289,7 +289,7 @@ def forward(
 
         # timesteps does not contain any weights and will always return f32 tensors
         # but time_embedding might be fp16, so we need to cast here.
-        timesteps_projected = timesteps_projected.to(dtype=self.dtype)
+        timesteps_projected = timesteps_projected.to(dtype=hidden_states.dtype)
         time_embeddings = self.time_embedding(timesteps_projected)
 
         if self.embedding_proj_norm is not None:
diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py
@@ -54,6 +54,7 @@ class SD3Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
     """
 
     _supports_gradient_checkpointing = True
+    _always_upcast_modules = ["PatchEmbed"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/unets/unet_2d.py b/src/diffusers/models/unets/unet_2d.py
@@ -283,7 +283,7 @@ def forward(
         # timesteps does not contain any weights and will always return f32 tensors
         # but time_embedding might actually be running in fp16. so we need to cast here.
         # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=self.dtype)
+        t_emb = t_emb.to(dtype=sample.dtype)
         emb = self.time_embedding(t_emb)
 
         if self.class_embedding is not None:
diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py
@@ -641,7 +641,7 @@ def forward(
         # timesteps does not contain any weights and will always return f32 tensors
         # but time_embedding might actually be running in fp16. so we need to cast here.
         # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=self.dtype)
+        t_emb = t_emb.to(dtype=sample.dtype)
 
         emb = self.time_embedding(t_emb, timestep_cond)
         emb = emb.repeat_interleave(repeats=num_frames, dim=0)
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py
@@ -590,7 +590,7 @@ def forward(
         # timesteps does not contain any weights and will always return f32 tensors
         # but time_embedding might actually be running in fp16. so we need to cast here.
         # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=self.dtype)
+        t_emb = t_emb.to(dtype=sample.dtype)
         t_emb = self.time_embedding(t_emb, timestep_cond)
 
         # 2. FPS
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
@@ -2152,7 +2152,7 @@ def forward(
         # timesteps does not contain any weights and will always return f32 tensors
         # but time_embedding might actually be running in fp16. so we need to cast here.
         # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=self.dtype)
+        t_emb = t_emb.to(dtype=sample.dtype)
 
         emb = self.time_embedding(t_emb, timestep_cond)
         aug_emb = None
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
@@ -19,7 +19,6 @@
 import os
 import re
 import sys
-from collections import OrderedDict
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin
@@ -1173,93 +1172,6 @@ def reset_device_map(self):
                     component.to("cpu")
             self.hf_device_map = None
 
-    def enable_dynamic_upcasting(
-        self,
-        components: Optional[List[str]] = None,
-        upcast_dtype: Optional[torch.dtype] = None,
-    ):
-        r"""
-        Enable module-wise dynamic upcasting. This allows models to be loaded into the GPU in a low memory dtype e.g.
-        torch.float8_e4m3fn, but perform inference using a dtype that is supported on the GPU, by casting the module to
-        the appropriate dtype right before the foward pass. The module is then moved back to the low memory dtype after
-        the foward pass.
-
-        """
-        if components is None:
-            raise ValueError("Please provide a list of pipeline component names to apply dynamic upcasting")
-
-        def fn_recursive_upcast(module, dtype, original_dtype, keep_in_fp32_modules):
-            has_children = list(module.children())
-            upcast_dtype = dtype
-            downcast_dtype = original_dtype
-
-            def upcast_hook_fn(module, inputs):
-                module = module.to(upcast_dtype)
-
-            def downcast_hook_fn(module, *args, **kwargs):
-                module = module.to(downcast_dtype)
-
-            if not has_children:
-                module.register_forward_pre_hook(upcast_hook_fn)
-                module.register_forward_hook(downcast_hook_fn)
-
-            for name, child in module.named_children():
-                if any(module_to_keep_in_fp32 in name.split(".") for module_to_keep_in_fp32 in keep_in_fp32_modules):
-                    dtype = torch.float32
-                else:
-                    dtype = upcast_dtype
-
-                fn_recursive_upcast(child, dtype, original_dtype, keep_in_fp32_modules)
-
-        for component in components:
-            if not hasattr(self, component):
-                raise ValueError(f"Pipeline has no component named: {component}")
-
-            component_module = getattr(self, component)
-            if not isinstance(component_module, torch.nn.Module):
-                raise ValueError(
-                    f"Pipeline component: {component} is not a torch.nn.Module. Cannot apply dynamic upcasting."
-                )
-
-            use_keep_in_fp32_modules = (
-                hasattr(component_module, "_keep_in_fp32_modules")
-                and (component_module._keep_in_fp32_modules is not None)
-                and (upcast_dtype != torch.float32)
-            )
-            if use_keep_in_fp32_modules:
-                keep_in_fp32_modules = component_module._keep_in_fp32_modules
-            else:
-                keep_in_fp32_modules = []
-
-            original_dtype = component_module.dtype
-            for name, module in component_module.named_children():
-                fn_recursive_upcast(module, upcast_dtype, original_dtype, keep_in_fp32_modules)
-
-    def disable_dynamic_upcasting(
-        self,
-    ):
-        def fn_recursive_upcast(module):
-            has_children = list(module.children())
-            if not has_children:
-                module._forward_pre_hooks = OrderedDict()
-                module._forward_hooks = OrderedDict()
-
-            for child in module.children():
-                fn_recursive_upcast(child)
-
-        for component in self.components:
-            if not hasattr(self, component):
-                raise ValueError(f"Pipeline has no component named: {component}")
-
-            component_module = getattr(self, component)
-            if not issubclass(component_module, torch.nn.Module):
-                raise ValueError(
-                    f"Pipeline component: {component} is not an torch.nn.Module. Cannot apply dynamic upcasting."
-                )
-
-            for module in component_module.children():
-                fn_recursive_upcast(module)
-
     @classmethod
     @validate_hf_hub_args
     def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py