
Commit 341fbfc: update mixin
1 parent 8975bbf

26 files changed (+126, -9 lines)

src/diffusers/models/autoencoders/autoencoder_asym_kl.py

Lines changed: 2 additions & 0 deletions
@@ -60,6 +60,8 @@ class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin):
     Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
     """

+    _always_upcast_modules = ["MaskConditionDecoder"]
+
     @register_to_config
     def __init__(
         self,

src/diffusers/models/autoencoders/vq_model.py

Lines changed: 2 additions & 0 deletions
@@ -71,6 +71,8 @@ class VQModel(ModelMixin, ConfigMixin):
             Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
     """

+    _always_upcast_modules = ["VectorQuantizer"]
+
     @register_to_config
     def __init__(
         self,

src/diffusers/models/layerwise_upcasting_utils.py

Lines changed: 30 additions & 9 deletions
@@ -45,6 +45,8 @@ class LayerwiseUpcastingHook(ModelHook):
     in the output, but can significantly reduce the memory footprint.
     """

+    _is_stateful = False
+
     def __init__(self, storage_dtype: torch.dtype, compute_dtype: torch.dtype) -> None:
         self.storage_dtype = storage_dtype
         self.compute_dtype = compute_dtype
@@ -56,8 +58,8 @@ def init_hook(self, module: torch.nn.Module):
     def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
         module.to(dtype=self.compute_dtype)
         # How do we account for LongTensor, BoolTensor, etc.?
-        # args = tuple(align_maybe_tensor_dtype(arg, self.compute_dtype) for arg in args)
-        # kwargs = {k: align_maybe_tensor_dtype(v, self.compute_dtype) for k, v in kwargs.items()}
+        # args = tuple(_align_maybe_tensor_dtype(arg, self.compute_dtype) for arg in args)
+        # kwargs = {k: _align_maybe_tensor_dtype(v, self.compute_dtype) for k, v in kwargs.items()}
         return args, kwargs

     def post_forward(self, module: torch.nn.Module, output):
@@ -105,7 +107,7 @@ class LayerwiseUpcastingGranularity(str, Enum):
     torch.nn.Linear,
 ]

-_DEFAULT_PYTORCH_LAYER_SKIP_MODULES_PATTERN = ["pos_embed", "patch_embed", "norm"]
+_DEFAULT_SKIP_MODULES_PATTERN = ["pos_embed", "patch_embed", "norm"]
 # fmt: on

@@ -114,9 +116,27 @@ def apply_layerwise_upcasting(
     storage_dtype: torch.dtype,
     compute_dtype: torch.dtype,
     granularity: LayerwiseUpcastingGranularity = LayerwiseUpcastingGranularity.PYTORCH_LAYER,
-    skip_modules_pattern: List[str] = [],
+    skip_modules_pattern: List[str] = _DEFAULT_SKIP_MODULES_PATTERN,
     skip_modules_classes: List[Type[torch.nn.Module]] = [],
 ) -> torch.nn.Module:
+    r"""
+    Applies layerwise upcasting to a given module. The module is expected to be a Diffusers `ModelMixin`, but it can
+    be any `nn.Module` that uses diffusers layers or PyTorch primitives.
+
+    Args:
+        module (`torch.nn.Module`):
+            The module to attach the hook to.
+        storage_dtype (`torch.dtype`):
+            The dtype to which the module is cast for storage, outside the forward pass.
+        compute_dtype (`torch.dtype`):
+            The dtype to which the module is cast during the forward pass for computation.
+        granularity (`LayerwiseUpcastingGranularity`, *optional*, defaults to `LayerwiseUpcastingGranularity.PYTORCH_LAYER`):
+            The granularity of the layerwise upcasting process.
+        skip_modules_pattern (`List[str]`, defaults to `["pos_embed", "patch_embed", "norm"]`):
+            A list of patterns matched against module names; matching modules are skipped during layerwise upcasting.
+        skip_modules_classes (`List[Type[torch.nn.Module]]`, defaults to `[]`):
+            A list of module classes to skip during layerwise upcasting.
+    """
     if granularity == LayerwiseUpcastingGranularity.DIFFUSERS_LAYER:
         return _apply_layerwise_upcasting_diffusers_layer(
             module, storage_dtype, compute_dtype, skip_modules_pattern, skip_modules_classes
@@ -153,7 +173,7 @@ def _apply_layerwise_upcasting_diffusers_layer(
     module: torch.nn.Module,
     storage_dtype: torch.dtype,
     compute_dtype: torch.dtype,
-    skip_modules_pattern: List[str] = _DEFAULT_PYTORCH_LAYER_SKIP_MODULES_PATTERN,
+    skip_modules_pattern: List[str] = _DEFAULT_SKIP_MODULES_PATTERN,
     skip_modules_classes: List[Type[torch.nn.Module]] = [],
 ) -> torch.nn.Module:
     for name, submodule in module.named_modules():
@@ -173,7 +193,7 @@ def _apply_layerwise_upcasting_pytorch_layer(
     module: torch.nn.Module,
     storage_dtype: torch.dtype,
     compute_dtype: torch.dtype,
-    skip_modules_pattern: List[str] = _DEFAULT_PYTORCH_LAYER_SKIP_MODULES_PATTERN,
+    skip_modules_pattern: List[str] = _DEFAULT_SKIP_MODULES_PATTERN,
     skip_modules_classes: List[Type[torch.nn.Module]] = [],
 ) -> torch.nn.Module:
     for name, submodule in module.named_modules():
@@ -189,7 +209,7 @@ def _apply_layerwise_upcasting_pytorch_layer(
     return module


-def align_maybe_tensor_dtype(input: Any, dtype: torch.dtype) -> Any:
+def _align_maybe_tensor_dtype(input: Any, dtype: torch.dtype) -> Any:
     r"""
     Aligns the dtype of a tensor or a list of tensors to a given dtype.
@@ -199,14 +219,15 @@ def align_maybe_tensor_dtype(input: Any, dtype: torch.dtype) -> Any:
             types, it will be returned as is.
         dtype (`torch.dtype`):
             The dtype to align the tensor(s) to.
+
     Returns:
         `Any`:
             The tensor or list of tensors aligned to the given dtype.
     """
     if isinstance(input, torch.Tensor):
         return input.to(dtype=dtype)
     if isinstance(input, (list, tuple)):
-        return [align_maybe_tensor_dtype(t, dtype) for t in input]
+        return [_align_maybe_tensor_dtype(t, dtype) for t in input]
     if isinstance(input, dict):
-        return {k: align_maybe_tensor_dtype(v, dtype) for k, v in input.items()}
+        return {k: _align_maybe_tensor_dtype(v, dtype) for k, v in input.items()}
     return input
src/diffusers/models/modeling_utils.py

Lines changed: 63 additions & 0 deletions
@@ -56,6 +56,7 @@
     load_or_create_model_card,
     populate_model_card,
 )
+from .layerwise_upcasting_utils import LayerwiseUpcastingGranularity, apply_layerwise_upcasting
 from .model_loading_utils import (
     _determine_device_map,
     _fetch_index_file,
@@ -150,6 +151,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
     _keys_to_ignore_on_load_unexpected = None
     _no_split_modules = None
     _keep_in_fp32_modules = None
+    _always_upcast_modules = None

     def __init__(self):
         super().__init__()
@@ -314,6 +316,67 @@ def disable_xformers_memory_efficient_attention(self) -> None:
         """
         self.set_use_memory_efficient_attention_xformers(False)

+    def enable_layerwise_upcasting(
+        self,
+        storage_dtype: torch.dtype = torch.float8_e4m3fn,
+        compute_dtype: Optional[torch.dtype] = None,
+        granularity: LayerwiseUpcastingGranularity = LayerwiseUpcastingGranularity.PYTORCH_LAYER,
+    ) -> None:
+        r"""
+        Activates layerwise upcasting for the current model.
+
+        Layerwise upcasting is a technique that casts the model weights to a lower-precision dtype for storage but
+        upcasts them on-the-fly to a higher-precision dtype for computation. This process can significantly reduce
+        the memory footprint of the model weights, but may lead to some quality degradation in the outputs. Most
+        degradations are negligible, mostly stemming from weight casting in normalization and modulation layers.
+
+        By default, most models in diffusers set the `_always_upcast_modules` attribute to ignore patch embedding,
+        positional embedding and normalization layers. This is because these layers are most likely
+        precision-critical for quality. If you wish to change this behavior, you can set the `_always_upcast_modules`
+        attribute to `None`, or call [`~apply_layerwise_upcasting`] with custom arguments.
+
+        Example:
+            Using [`~models.ModelMixin.enable_layerwise_upcasting`]:
+
+            ```python
+            >>> import torch
+            >>> from diffusers import CogVideoXTransformer3DModel, apply_layerwise_upcasting
+
+            >>> transformer = CogVideoXTransformer3DModel.from_pretrained(
+            ...     "THUDM/CogVideoX-5b", subfolder="transformer", torch_dtype=torch.bfloat16
+            ... )
+
+            >>> # Enable layerwise upcasting via the model, which ignores certain modules by default
+            >>> transformer.enable_layerwise_upcasting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)
+
+            >>> # Or, enable layerwise upcasting with custom arguments via the `apply_layerwise_upcasting` function
+            >>> apply_layerwise_upcasting(
+            ...     transformer, torch.float8_e4m3fn, torch.bfloat16, skip_modules_pattern=["patch_embed", "norm.*"]
+            ... )
+            ```
+
+        Args:
+            storage_dtype (`torch.dtype`):
+                The dtype to which the model should be cast for storage.
+            compute_dtype (`torch.dtype`, *optional*):
+                The dtype to which the model weights should be cast during the forward pass. If not provided, the
+                model's current dtype is used.
+            granularity (`LayerwiseUpcastingGranularity`, defaults to `"pytorch_layer"`):
+                The granularity of the layerwise upcasting process. Read the documentation of
+                [`~LayerwiseUpcastingGranularity`] for more information.
+        """
+
+        skip_modules_pattern = []
+        if self._keep_in_fp32_modules is not None:
+            skip_modules_pattern.extend(self._keep_in_fp32_modules)
+        if self._always_upcast_modules is not None:
+            skip_modules_pattern.extend(self._always_upcast_modules)
+        skip_modules_pattern = list(set(skip_modules_pattern))
+
+        if compute_dtype is None:
+            logger.info("`compute_dtype` not provided when enabling layerwise upcasting. Using the model's current dtype.")
+            compute_dtype = self.dtype
+
+        apply_layerwise_upcasting(self, storage_dtype, compute_dtype, granularity, skip_modules_pattern)
+
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
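
Note: a rough way to see the effect of the new method on weight memory. This is a minimal sketch that assumes the hooks cast eligible weights to `storage_dtype` as soon as they are attached (as the docstring above describes); the repo id and dtypes are the ones from the docstring example.

```python
import torch
from diffusers import CogVideoXTransformer3DModel

def weight_nbytes(module: torch.nn.Module) -> int:
    # Bytes occupied by parameters at rest.
    return sum(p.numel() * p.element_size() for p in module.parameters())

transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-5b", subfolder="transformer", torch_dtype=torch.bfloat16
)
print(f"bf16 storage: {weight_nbytes(transformer) / 1e9:.2f} GB")

transformer.enable_layerwise_upcasting(
    storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
)
# Skipped modules (patch_embed, norm.*) stay in bf16, so the saving is a bit less than 2x.
print(f"fp8 storage:  {weight_nbytes(transformer) / 1e9:.2f} GB")
```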

src/diffusers/models/transformers/auraflow_transformer_2d.py

Lines changed: 1 addition & 0 deletions
@@ -275,6 +275,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
     """

     _no_split_modules = ["AuraFlowJointTransformerBlock", "AuraFlowSingleTransformerBlock", "AuraFlowPatchEmbed"]
+    _always_upcast_modules = ["pos_embed", "norm.*"]
     _supports_gradient_checkpointing = True

     @register_to_config

src/diffusers/models/transformers/cogvideox_transformer_3d.py

Lines changed: 1 addition & 0 deletions
@@ -209,6 +209,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             Scaling factor to apply in 3D positional embeddings across temporal dimensions.
     """

+    _always_upcast_modules = ["patch_embed", "norm.*"]
     _supports_gradient_checkpointing = True

     @register_to_config

src/diffusers/models/transformers/dit_transformer_2d.py

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
             A small constant added to the denominator in normalization layers to prevent division by zero.
     """

+    _always_upcast_modules = ["pos_embed", "norm.*"]
     _supports_gradient_checkpointing = True

     @register_to_config

src/diffusers/models/transformers/hunyuan_transformer_2d.py

Lines changed: 2 additions & 0 deletions
@@ -244,6 +244,8 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
             Whether or not to use style condition and image meta size. True for version <=1.1, False for version >= 1.2
     """

+    _always_upcast_modules = ["pos_embed", "norm.*", "pooler"]
+
     @register_to_config
     def __init__(
         self,

src/diffusers/models/transformers/latte_transformer_3d.py

Lines changed: 2 additions & 0 deletions
@@ -65,6 +65,8 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
             The number of frames in the video-like data.
     """

+    _always_upcast_modules = ["pos_embed", "norm.*"]
+
     @register_to_config
     def __init__(
         self,

src/diffusers/models/transformers/lumina_nextdit2d.py

Lines changed: 2 additions & 0 deletions
@@ -221,6 +221,8 @@ class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
             overall scale of the model's operations.
     """

+    _always_upcast_modules = ["patch_embedder", "norm.*", "ffn_norm.*"]
+
     @register_to_config
     def __init__(
         self,
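
Note: the `_always_upcast_modules` patterns added across the models above (e.g. `"pos_embed"`, `"norm.*"`) are merged into the skip list by `enable_layerwise_upcasting`. Below is a hypothetical helper, not part of diffusers, for previewing which submodules such patterns would skip; the regex-based matching is an assumption for illustration, mirroring the `named_modules()` loops in the `_apply_*` functions.

```python
import re
from typing import List

import torch.nn as nn

def preview_skipped_modules(model: nn.Module, skip_modules_pattern: List[str]) -> List[str]:
    # List submodule names matching any skip pattern; these would be left
    # untouched (kept in higher precision) by layerwise upcasting.
    return [
        name
        for name, _ in model.named_modules()
        if any(re.search(pattern, name) for pattern in skip_modules_pattern)
    ]

# e.g. preview_skipped_modules(transformer, ["patch_embed", "norm.*"])
```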

0 commit comments
