diff --git a/scripts/convert_dcae_to_diffusers.py b/scripts/convert_dcae_to_diffusers.py new file mode 100644 index 000000000000..f43d7972dcd2 --- /dev/null +++ b/scripts/convert_dcae_to_diffusers.py @@ -0,0 +1,275 @@ +import argparse +from typing import Any, Dict + +import torch +from safetensors.torch import load_file + +from diffusers import AutoencoderDC + + +def remove_keys_(key: str, state_dict: Dict[str, Any]): + state_dict.pop(key) + + +def remap_qkv_(key: str, state_dict: Dict[str, Any]): + # qkv = state_dict.pop(key) + # q, k, v = torch.chunk(qkv, 3, dim=0) + # parent_module, _, _ = key.rpartition(".qkv.conv.weight") + # state_dict[f"{parent_module}.to_q.weight"] = q.squeeze() + # state_dict[f"{parent_module}.to_k.weight"] = k.squeeze() + # state_dict[f"{parent_module}.to_v.weight"] = v.squeeze() + state_dict[key.replace("qkv.conv", "to_qkv")] = state_dict.pop(key) + + +VAE_KEYS_RENAME_DICT = { + # common + "main.": "", + "op_list.": "", + "context_module": "attn", + "local_module": "conv_out", + # NOTE: The below two lines work because scales in the available configs only have a tuple length of 1 + # If there were more scales, there would be more layers, so a loop would be better to handle this + "aggreg.0.0": "to_qkv_multiscale.0.proj_in", + "aggreg.0.1": "to_qkv_multiscale.0.proj_out", + "norm.": "norm.norm.", + "depth_conv.conv": "conv_depth", + "inverted_conv.conv": "conv_inverted", + "point_conv.conv": "conv_point", + "point_conv.norm": "norm", + "conv.conv.": "conv.", + "conv1.conv": "conv1", + "conv2.conv": "conv2", + "conv2.norm": "norm", + "proj.conv": "proj_out", + "proj.norm": "norm_out", + # encoder + "encoder.project_in.conv": "encoder.conv_in", + "encoder.project_out.0.conv": "encoder.conv_out", + # decoder + "decoder.project_in.conv": "decoder.conv_in", + "decoder.project_out.0": "decoder.norm_out.norm", + "decoder.project_out.2.conv": "decoder.conv_out", +} + +VAE_SPECIAL_KEYS_REMAP = { + "qkv.conv.weight": remap_qkv_, +} + + +def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]: + state_dict = saved_dict + if "model" in saved_dict.keys(): + state_dict = state_dict["model"] + if "module" in saved_dict.keys(): + state_dict = state_dict["module"] + if "state_dict" in saved_dict.keys(): + state_dict = state_dict["state_dict"] + return state_dict + + +def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]: + state_dict[new_key] = state_dict.pop(old_key) + + +def convert_vae(ckpt_path: str, dtype: torch.dtype): + original_state_dict = get_state_dict(load_file(ckpt_path)) + vae = AutoencoderDC( + in_channels=3, + latent_channels=32, + encoder_block_types=( + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ), + decoder_block_types=( + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ), + block_out_channels=(128, 256, 512, 512, 1024, 1024), + encoder_layers_per_block=(2, 2, 2, 3, 3, 3), + decoder_layers_per_block=(3, 3, 3, 3, 3, 3), + encoder_qkv_multiscales=((), (), (), (5,), (5,), (5,)), + decoder_qkv_multiscales=((), (), (), (5,), (5,), (5,)), + downsample_block_type="Conv", + upsample_block_type="interpolate", + decoder_norm_types="rms_norm", + decoder_act_fns="silu", + scaling_factor=0.41407, + ).to(dtype=dtype) + + for key in list(original_state_dict.keys()): + new_key = key[:] + for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items(): + new_key = new_key.replace(replace_key, 
rename_key) + update_state_dict_(original_state_dict, key, new_key) + + for key in list(original_state_dict.keys()): + for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items(): + if special_key not in key: + continue + handler_fn_inplace(key, original_state_dict) + + vae.load_state_dict(original_state_dict, strict=True) + return vae + + +def get_vae_config(name: str): + if name in ["dc-ae-f32c32-sana-1.0"]: + config = { + "latent_channels": 32, + "encoder_block_types": ("ResBlock", "ResBlock", "ResBlock", "EViTS5_GLU", "EViTS5_GLU", "EViTS5_GLU"), + "decoder_block_types": ("ResBlock", "ResBlock", "ResBlock", "EViTS5_GLU", "EViTS5_GLU", "EViTS5_GLU"), + "block_out_channels": (128, 256, 512, 512, 1024, 1024), + "encoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)), + "decoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)), + "encoder_layers_per_block": (2, 2, 2, 3, 3, 3), + "decoder_layers_per_block": [3, 3, 3, 3, 3, 3], + "downsample_block_type": "Conv", + "upsample_block_type": "interpolate", + "scaling_factor": 0.41407, + } + elif name in ["dc-ae-f32c32-in-1.0", "dc-ae-f32c32-mix-1.0"]: + config = { + "latent_channels": 32, + "encoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ], + "decoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ], + "block_out_channels": [128, 256, 512, 512, 1024, 1024], + "encoder_layers_per_block": [0, 4, 8, 2, 2, 2], + "decoder_layers_per_block": [0, 5, 10, 2, 2, 2], + "encoder_qkv_multiscales": ((), (), (), (), (), ()), + "decoder_qkv_multiscales": ((), (), (), (), (), ()), + "decoder_norm_types": ["batch_norm", "batch_norm", "batch_norm", "rms_norm", "rms_norm", "rms_norm"], + "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu"], + } + elif name in ["dc-ae-f128c512-in-1.0", "dc-ae-f128c512-mix-1.0"]: + config = { + "latent_channels": 512, + "encoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ], + "decoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ], + "block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048], + "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2, 2], + "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2, 2], + "encoder_qkv_multiscales": ((), (), (), (), (), (), (), ()), + "decoder_qkv_multiscales": ((), (), (), (), (), (), (), ()), + "decoder_norm_types": [ + "batch_norm", + "batch_norm", + "batch_norm", + "rms_norm", + "rms_norm", + "rms_norm", + "rms_norm", + "rms_norm", + ], + "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu", "silu"], + } + elif name in ["dc-ae-f64c128-in-1.0", "dc-ae-f64c128-mix-1.0"]: + config = { + "latent_channels": 128, + "encoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ], + "decoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock", + ], + "block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048], + "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2], + "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2], + 
"encoder_qkv_multiscales": ((), (), (), (), (), (), ()), + "decoder_qkv_multiscales": ((), (), (), (), (), (), ()), + "decoder_norm_types": [ + "batch_norm", + "batch_norm", + "batch_norm", + "rms_norm", + "rms_norm", + "rms_norm", + "rms_norm", + ], + "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu"], + } + + return config + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint") + parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved") + parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.") + return parser.parse_args() + + +DTYPE_MAPPING = { + "fp32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, +} + +VARIANT_MAPPING = { + "fp32": None, + "fp16": "fp16", + "bf16": "bf16", +} + + +if __name__ == "__main__": + args = get_args() + + dtype = DTYPE_MAPPING[args.dtype] + variant = VARIANT_MAPPING[args.dtype] + + if args.vae_ckpt_path is not None: + vae = convert_vae(args.vae_ckpt_path, dtype) + vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", variant=variant) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a4749af5f61b..f393167796d1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -80,6 +80,7 @@ "AllegroTransformer3DModel", "AsymmetricAutoencoderKL", "AuraFlowTransformer2DModel", + "AutoencoderDC", "AutoencoderKL", "AutoencoderKLAllegro", "AutoencoderKLCogVideoX", @@ -571,6 +572,7 @@ AllegroTransformer3DModel, AsymmetricAutoencoderKL, AuraFlowTransformer2DModel, + AutoencoderDC, AutoencoderKL, AutoencoderKLAllegro, AutoencoderKLCogVideoX, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 65e2418ac794..7183d40b6f91 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -27,6 +27,7 @@ if is_torch_available(): _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"] _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"] + _import_structure["autoencoders.autoencoder_dc"] = ["AutoencoderDC"] _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"] _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"] _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"] @@ -88,6 +89,7 @@ from .adapter import MultiAdapter, T2IAdapter from .autoencoders import ( AsymmetricAutoencoderKL, + AutoencoderDC, AutoencoderKL, AutoencoderKLAllegro, AutoencoderKLCogVideoX, diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 02ed1f965abf..bef5dc73a96f 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -22,7 +22,13 @@ from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU from .attention_processor import Attention, JointAttnProcessor2_0 from .embeddings import SinusoidalPositionalEmbedding -from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm, SD35AdaLayerNormZeroX +from .normalization import ( + AdaLayerNorm, + AdaLayerNormContinuous, + AdaLayerNormZero, + RMSNorm, + SD35AdaLayerNormZeroX, +) logger = logging.get_logger(__name__) diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py index ba45d6671252..7a36e88f1a36 100644 --- 
a/src/diffusers/models/autoencoders/__init__.py +++ b/src/diffusers/models/autoencoders/__init__.py @@ -1,4 +1,5 @@ from .autoencoder_asym_kl import AsymmetricAutoencoderKL +from .autoencoder_dc import AutoencoderDC from .autoencoder_kl import AutoencoderKL from .autoencoder_kl_allegro import AutoencoderKLAllegro from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py new file mode 100644 index 000000000000..5eb090e83c54 --- /dev/null +++ b/src/diffusers/models/autoencoders/autoencoder_dc.py @@ -0,0 +1,626 @@ +# Copyright 2024 MIT, Tsinghua University, NVIDIA CORPORATION and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...configuration_utils import ConfigMixin, register_to_config +from ..activations import get_activation +from ..modeling_utils import ModelMixin +from ..normalization import RMSNormNd + + +def get_norm_layer(name: Optional[str] = "batch_norm", num_features: Optional[int] = None) -> Optional[nn.Module]: + if name is None: + norm = None + elif name == "rms_norm": + norm = RMSNormNd(num_features, eps=1e-5, elementwise_affine=True, bias=True, channel_dim=1) + elif name == "batch_norm": + norm = nn.BatchNorm2d(num_features=num_features) + else: + raise ValueError(f"norm {name} is not supported") + return norm + + +class GLUMBConv(nn.Module): + def __init__(self, in_channels: int, out_channels: int) -> None: + super().__init__() + + hidden_channels = 4 * in_channels + + self.nonlinearity = nn.SiLU() + + self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0) + self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2) + self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False) + self.norm = RMSNormNd(out_channels, eps=1e-5, elementwise_affine=True, bias=True, channel_dim=1) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.conv_inverted(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.conv_depth(hidden_states) + hidden_states, gate = torch.chunk(hidden_states, 2, dim=1) + hidden_states = hidden_states * self.nonlinearity(gate) + + hidden_states = self.conv_point(hidden_states) + hidden_states = self.norm(hidden_states) + + return hidden_states + residual + + +class ResBlock(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + norm_type: str = "batch_norm", + act_fn: str = "relu6", + ) -> None: + super().__init__() + + self.nonlinearity = get_activation(act_fn) if act_fn is not None else nn.Identity() + self.conv1 = nn.Conv2d(in_channels, in_channels, 3, 1, 1) + self.conv2 = nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False) + self.norm = get_norm_layer(norm_type, 
out_channels) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + hidden_states = self.conv1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.norm(hidden_states) + return hidden_states + residual + + +class MLAProjection(nn.Module): + def __init__( + self, + in_channels: int, + num_attention_heads: int, + kernel_size: int, + ) -> None: + super().__init__() + + self.proj_in = nn.Conv2d( + 3 * in_channels, + 3 * in_channels, + kernel_size, + padding=kernel_size // 2, + groups=3 * in_channels, + bias=False, + ) + self.proj_out = nn.Conv2d( + 3 * in_channels, 3 * in_channels, 1, 1, 0, groups=3 * num_attention_heads, bias=False + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.proj_in(hidden_states) + hidden_states = self.proj_out(hidden_states) + return hidden_states + + +class LiteMLA(nn.Module): + r"""Lightweight multi-scale linear attention""" + + def __init__( + self, + in_channels: int, + out_channels: int, + num_attention_heads: Optional[int] = None, + heads_ratio: float = 1.0, + attention_head_dim: int = 8, + norm_type: str = "batch_norm", + kernel_sizes: Tuple[int, ...] = (5,), + eps: float = 1e-15, + ): + super().__init__() + + self.eps = eps + self.attention_head_dim = attention_head_dim + + num_attention_heads = ( + int(in_channels // attention_head_dim * heads_ratio) + if num_attention_heads is None + else num_attention_heads + ) + inner_dim = num_attention_heads * attention_head_dim + + self.to_qkv = nn.Conv2d(in_channels, 3 * inner_dim, 1, 1, 0, bias=False) + + self.to_qkv_multiscale = nn.ModuleList() + for kernel_size in kernel_sizes: + self.to_qkv_multiscale.append(MLAProjection(inner_dim, num_attention_heads, kernel_size)) + + self.kernel_nonlinearity = nn.ReLU() + self.proj_out = nn.Conv2d(inner_dim * (1 + len(kernel_sizes)), out_channels, 1, 1, 0, bias=False) + self.norm_out = get_norm_layer(norm_type, num_features=out_channels) + + def linear_attention(self, qkv: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = qkv.shape + + qkv = qkv.float() + qkv = torch.reshape(qkv, (batch_size, -1, 3 * self.attention_head_dim, height * width)) + + query, key, value = ( + qkv[:, :, 0 : self.attention_head_dim], + qkv[:, :, self.attention_head_dim : 2 * self.attention_head_dim], + qkv[:, :, 2 * self.attention_head_dim :], + ) + + # lightweight linear attention + query = self.kernel_nonlinearity(query) + key = self.kernel_nonlinearity(key) + value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1) + + key_T = key.transpose(-1, -2) + scores = torch.matmul(value, key_T) + output = torch.matmul(scores, query) + + output = output.float() + output = output[:, :, :-1] / (output[:, :, -1:] + self.eps) + output = torch.reshape(output, (batch_size, -1, height, width)) + + return output + + def quadratic_attention(self, qkv: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = list(qkv.size()) + + qkv = torch.reshape(qkv, (batch_size, -1, 3 * self.attention_head_dim, height * width)) + query, key, value = ( + qkv[:, :, 0 : self.attention_head_dim], + qkv[:, :, self.attention_head_dim : 2 * self.attention_head_dim], + qkv[:, :, 2 * self.attention_head_dim :], + ) + + query = self.kernel_nonlinearity(query) + key = self.kernel_nonlinearity(key) + + scores = torch.matmul(key.transpose(-1, -2), query) + + original_dtype = scores.dtype + scores = scores.float() + scores = scores / (torch.sum(scores, 
dim=2, keepdim=True) + self.eps) + scores = scores.to(original_dtype) + + output = torch.matmul(value, scores) + output = torch.reshape(output, (batch_size, -1, height, width)) + + return output + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + qkv = self.to_qkv(hidden_states) + + multi_scale_qkv = [qkv] + for block in self.to_qkv_multiscale: + multi_scale_qkv.append(block(qkv)) + + qkv = torch.cat(multi_scale_qkv, dim=1) + + height, width = qkv.shape[-2:] + if height * width > self.attention_head_dim: + hidden_states = self.linear_attention(qkv).to(qkv.dtype) + else: + hidden_states = self.quadratic_attention(qkv) + + hidden_states = self.proj_out(hidden_states) + hidden_states = self.norm_out(hidden_states) + + return hidden_states + residual + + +class EfficientViTBlock(nn.Module): + def __init__( + self, + in_channels: int, + heads_ratio: float = 1.0, + dim: int = 32, + qkv_multiscales: Tuple[int, ...] = (5,), + norm_type: str = "batch_norm", + ): + super().__init__() + + self.attn = LiteMLA( + in_channels=in_channels, + out_channels=in_channels, + heads_ratio=heads_ratio, + attention_head_dim=dim, + norm_type=norm_type, + kernel_sizes=qkv_multiscales, + ) + + self.conv_out = GLUMBConv( + in_channels=in_channels, + out_channels=in_channels, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.attn(x) + x = self.conv_out(x) + return x + + +def get_block_from_block_type( + block_type: str, + in_channels: int, + out_channels: int, + norm_type: str, + act_fn: str, + qkv_mutliscales: Tuple[int] = (), +): + if block_type == "ResBlock": + block = ResBlock(in_channels, out_channels, norm_type, act_fn) + + elif block_type == "EfficientViTBlock": + block = EfficientViTBlock(in_channels, norm_type=norm_type, qkv_multiscales=qkv_mutliscales) + + else: + raise ValueError(f"Block with {block_type=} is not supported.") + + return block + + +class DCDownBlock2d(nn.Module): + def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None: + super().__init__() + + self.downsample = downsample + self.factor = 2 + self.stride = 1 if downsample else 2 + self.group_size = in_channels * self.factor**2 // out_channels + self.shortcut = shortcut + + out_ratio = self.factor**2 + if downsample: + assert out_channels % out_ratio == 0 + out_channels = out_channels // out_ratio + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=self.stride, + padding=1, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self.conv(hidden_states) + if self.downsample: + x = F.pixel_unshuffle(x, self.factor) + + if self.shortcut: + y = F.pixel_unshuffle(hidden_states, self.factor) + y = y.unflatten(1, (-1, self.group_size)) + y = y.mean(dim=2) + hidden_states = x + y + else: + hidden_states = x + + return hidden_states + + +class DCUpBlock2d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + interpolate: bool = False, + shortcut: bool = True, + interpolation_mode: str = "nearest", + ) -> None: + super().__init__() + + self.interpolate = interpolate + self.interpolation_mode = interpolation_mode + self.shortcut = shortcut + self.factor = 2 + self.repeats = out_channels * self.factor**2 // in_channels + + out_ratio = self.factor**2 + + if not interpolate: + out_channels = out_channels * out_ratio + + self.conv = nn.Conv2d(in_channels, out_channels, 3, 1, 1) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if 
self.interpolate: + x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode) + x = self.conv(x) + else: + x = self.conv(hidden_states) + x = F.pixel_shuffle(x, self.factor) + + if self.shortcut: + y = hidden_states.repeat_interleave(self.repeats, dim=1) + y = F.pixel_shuffle(y, self.factor) + hidden_states = x + y + else: + hidden_states = x + + return hidden_states + + +class Encoder(nn.Module): + def __init__( + self, + in_channels: int, + latent_channels: int, + block_type: Union[str, Tuple[str]] = "ResBlock", + block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024), + layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2), + qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)), + downsample_block_type: str = "pixel_unshuffle", + ): + super().__init__() + + num_stages = len(block_out_channels) + + if isinstance(block_type, str): + block_type = (block_type,) * num_stages + + if layers_per_block[0] > 0: + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1], + kernel_size=3, + stride=1, + padding=1, + ) + else: + self.conv_in = DCDownBlock2d( + in_channels=in_channels, + out_channels=block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1], + downsample=downsample_block_type == "pixel_unshuffle", + shortcut=False, + ) + + stages = [] + for stage_id, (width, depth) in enumerate(zip(block_out_channels, layers_per_block)): + stage = [] + + for _ in range(depth): + block = get_block_from_block_type( + block_type[stage_id], + width, + width, + norm_type="rms_norm", + act_fn="silu", + qkv_mutliscales=qkv_multiscales[stage_id], + ) + stage.append(block) + + if stage_id < num_stages - 1 and depth > 0: + downsample_block = DCDownBlock2d( + in_channels=width, + out_channels=block_out_channels[stage_id + 1], + downsample=downsample_block_type == "pixel_unshuffle", + shortcut=True, + ) + stage.append(downsample_block) + + stages.append(nn.Sequential(*stage)) + + self.stages = nn.ModuleList(stages) + + self.conv_out = nn.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1) + self.norm_factor = 1 + norm_in_channels = block_out_channels[-1] + norm_out_channels = latent_channels + self.norm_group_size = norm_in_channels * self.norm_factor**2 // norm_out_channels + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.conv_in(hidden_states) + for stage in self.stages: + hidden_states = stage(hidden_states) + + x = F.pixel_unshuffle(hidden_states, self.norm_factor) + x = x.unflatten(1, (-1, self.norm_group_size)) + x = x.mean(dim=2) + + hidden_states = self.conv_out(hidden_states) + x + return hidden_states + + +class Decoder(nn.Module): + def __init__( + self, + in_channels: int, + latent_channels: int, + block_type: Union[str, Tuple[str]] = "ResBlock", + block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024), + layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2), + qkv_multiscales: Tuple[Tuple[int, ...], ...] 
= ((), (), (), (5,), (5,), (5,)), + norm_type: Union[str, Tuple[str]] = "rms_norm", + act_fn: Union[str, Tuple[str]] = "silu", + upsample_block_type: str = "pixel_shuffle", + upsample_shortcut: str = "duplicating", + ): + super().__init__() + + num_stages = len(block_out_channels) + + if isinstance(block_type, str): + block_type = (block_type,) * num_stages + if isinstance(norm_type, str): + norm_type = (norm_type,) * num_stages + if isinstance(act_fn, str): + act_fn = (act_fn,) * num_stages + + self.conv_in = nn.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1) + + self.norm_factor = 1 + self.norm_repeats = block_out_channels[-1] * self.norm_factor**2 // latent_channels + + stages = [] + for stage_id, (width, depth) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))): + stage = [] + + if stage_id < num_stages - 1 and depth > 0: + upsample_block = DCUpBlock2d( + block_out_channels[stage_id + 1], + width, + interpolate=upsample_block_type == "interpolate", + shortcut=upsample_shortcut, + ) + stage.append(upsample_block) + + for _ in range(depth): + block = get_block_from_block_type( + block_type[stage_id], + width, + width, + norm_type=norm_type[stage_id], + act_fn=act_fn[stage_id], + qkv_mutliscales=qkv_multiscales[stage_id], + ) + stage.append(block) + + stages.insert(0, nn.Sequential(*stage)) + + self.stages = nn.ModuleList(stages) + + channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1] + + self.norm_out = RMSNormNd(channels, eps=1e-5, elementwise_affine=True, bias=True, channel_dim=1) + self.conv_act = nn.ReLU() + self.conv_out = None + + if layers_per_block[0] > 0: + self.conv_out = nn.Conv2d(channels, in_channels, 3, 1, 1) + else: + self.conv_out = DCUpBlock2d( + channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = hidden_states.repeat_interleave(self.norm_repeats, dim=1) + x = F.pixel_shuffle(x, self.norm_factor) + + hidden_states = self.conv_in(hidden_states) + x + + for stage in reversed(self.stages): + hidden_states = stage(hidden_states) + + hidden_states = self.norm_out(hidden_states) + hidden_states = self.conv_act(hidden_states) + hidden_states = self.conv_out(hidden_states) + return hidden_states + + +class AutoencoderDC(ModelMixin, ConfigMixin): + r""" + An Autoencoder model introduced in [DCAE](https://arxiv.org/abs/2410.10733) and used in + [SANA](https://arxiv.org/abs/2410.10629). + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Args: + in_channels (`int`, defaults to `3`): + The number of input channels in samples. + latent_channels (`int`, defaults to `32`): + The number of channels in the latent space representation. + encoder_block_types (`Union[str, Tuple[str]]`, defaults to `"ResBlock"`): + The type(s) of block to use in the encoder. + decoder_block_types (`Union[str, Tuple[str]]`, defaults to `"ResBlock"`): + The type(s) of block to use in the decoder. + block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512, 1024, 1024)`): + The number of output channels for each block in the encoder/decoder. + encoder_layers_per_block (`Tuple[int]`, defaults to `(2, 2, 2, 3, 3, 3)`): + The number of layers per block in the encoder. + decoder_layers_per_block (`Tuple[int]`, defaults to `(3, 3, 3, 3, 3, 3)`): + The number of layers per block in the decoder. 
+ encoder_qkv_multiscales (`Tuple[Tuple[int, ...], ...]`, defaults to `((), (), (), (5,), (5,), (5,))`): + Multi-scale configurations for the encoder's QKV (query-key-value) transformations. + decoder_qkv_multiscales (`Tuple[Tuple[int, ...], ...]`, defaults to `((), (), (), (5,), (5,), (5,))`): + Multi-scale configurations for the decoder's QKV (query-key-value) transformations. + upsample_block_type (`str`, defaults to `"pixel_shuffle"`): + The type of block to use for upsampling in the decoder. + downsample_block_type (`str`, defaults to `"pixel_unshuffle"`): + The type of block to use for downsampling in the encoder. + decoder_norm_types (`Union[str, Tuple[str]]`, defaults to `"rms_norm"`): + The normalization type(s) to use in the decoder. + decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`): + The activation function(s) to use in the decoder. + scaling_factor (`float`, defaults to `1.0`): + A scaling factor applied during model operations. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 3, + latent_channels: int = 32, + encoder_block_types: Union[str, Tuple[str]] = "ResBlock", + decoder_block_types: Union[str, Tuple[str]] = "ResBlock", + block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024), + encoder_layers_per_block: Tuple[int] = (2, 2, 2, 3, 3, 3), + decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3, 3, 3), + encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)), + decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)), + upsample_block_type: str = "pixel_shuffle", + downsample_block_type: str = "pixel_unshuffle", + decoder_norm_types: Union[str, Tuple[str]] = "rms_norm", + decoder_act_fns: Union[str, Tuple[str]] = "silu", + scaling_factor: float = 1.0, + ) -> None: + super().__init__() + + self.encoder = Encoder( + in_channels=in_channels, + latent_channels=latent_channels, + block_type=encoder_block_types, + block_out_channels=block_out_channels, + layers_per_block=encoder_layers_per_block, + qkv_multiscales=encoder_qkv_multiscales, + downsample_block_type=downsample_block_type, + ) + self.decoder = Decoder( + in_channels=in_channels, + latent_channels=latent_channels, + block_type=decoder_block_types, + block_out_channels=block_out_channels, + layers_per_block=decoder_layers_per_block, + qkv_multiscales=decoder_qkv_multiscales, + norm_type=decoder_norm_types, + act_fn=decoder_act_fns, + upsample_block_type=upsample_block_type, + ) + + self.spatial_compression_ratio = 2 ** (len(block_out_channels) - 1) + self.temporal_compression_ratio = 1 + + def encode(self, x: torch.Tensor) -> torch.Tensor: + x = self.encoder(x) + return x + + def decode(self, x: torch.Tensor) -> torch.Tensor: + x = self.decoder(x) + return x + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.encoder(x) + x = self.decoder(x) + return x diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py index 817b3fff2ea6..b94cda63906a 100644 --- a/src/diffusers/models/normalization.py +++ b/src/diffusers/models/normalization.py @@ -512,20 +512,24 @@ def forward(self, input): class RMSNorm(nn.Module): - def __init__(self, dim, eps: float, elementwise_affine: bool = True): + def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False): super().__init__() self.eps = eps + self.elementwise_affine = elementwise_affine if isinstance(dim, numbers.Integral): dim = (dim,) self.dim = 
torch.Size(dim) + self.weight = None + self.bias = None + if elementwise_affine: self.weight = nn.Parameter(torch.ones(dim)) - else: - self.weight = None + if bias: + self.bias = nn.Parameter(torch.zeros(dim)) def forward(self, hidden_states): input_dtype = hidden_states.dtype @@ -537,6 +541,8 @@ def forward(self, hidden_states): if self.weight.dtype in [torch.float16, torch.bfloat16]: hidden_states = hidden_states.to(self.weight.dtype) hidden_states = hidden_states * self.weight + if self.bias is not None: + hidden_states = hidden_states + self.bias else: hidden_states = hidden_states.to(input_dtype) @@ -566,3 +572,28 @@ def __init__(self, p: int = 2, dim: int = -1, eps: float = 1e-12): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return F.normalize(hidden_states, p=self.p, dim=self.dim, eps=self.eps) + + +class RMSNormNd(nn.Module): + def __init__( + self, + dim: int, + eps: float, + elementwise_affine: bool = True, + bias: bool = False, + channel_dim: int = -1, + ) -> None: + super().__init__() + + self.norm = RMSNorm(dim, eps=eps, elementwise_affine=elementwise_affine, bias=bias) + self.channel_dim = channel_dim + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if self.channel_dim != -1: + hidden_states = hidden_states.movedim(self.channel_dim, -1) + hidden_states = self.norm(hidden_states) + hidden_states = hidden_states.movedim(-1, self.channel_dim) + else: + hidden_states = self.norm(hidden_states) + + return hidden_states diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 5091ff318f1b..7b3c366ca8e2 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -47,6 +47,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class AutoencoderDC(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class AutoencoderKL(metaclass=DummyObject): _backends = ["torch"]
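
A minimal end-to-end sketch (not part of the diff) of how the pieces added here fit together: the conversion script is run on an original DC-AE safetensors checkpoint, and the resulting directory is loaded through the new `AutoencoderDC` class. The checkpoint and output paths below are placeholders; the CLI flags, the `diffusers` import, and the encode/decode API are the ones introduced in this diff.

    # Hypothetical paths; the flags match scripts/convert_dcae_to_diffusers.py above.
    #   python scripts/convert_dcae_to_diffusers.py \
    #       --vae_ckpt_path /path/to/dc-ae-f32c32-sana-1.0/model.safetensors \
    #       --output_path ./dc-ae-f32c32-sana-1.0-diffusers \
    #       --dtype fp32

    import torch

    from diffusers import AutoencoderDC

    # Load the converted checkpoint written by the script above.
    vae = AutoencoderDC.from_pretrained("./dc-ae-f32c32-sana-1.0-diffusers").eval()

    # With the default six-stage config, spatial_compression_ratio = 2 ** (6 - 1) = 32,
    # so a 512x512 image maps to a 16x16 latent with latent_channels (32) channels.
    image = torch.randn(1, 3, 512, 512)
    with torch.no_grad():
        latent = vae.encode(image)           # (1, 32, 16, 16)
        reconstruction = vae.decode(latent)  # (1, 3, 512, 512)

Unlike `AutoencoderKL`, `encode` and `decode` here return plain tensors rather than distribution/output wrappers, so no `.latent_dist.sample()` step is needed.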
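
The normalization changes can also be exercised on their own; a small illustrative sketch (shapes are made up) of the new `RMSNormNd` wrapper, which moves `channel_dim` to the last axis so the existing `RMSNorm` (now with optional bias) can normalize the NCHW tensors used throughout `AutoencoderDC`:

    import torch

    from diffusers.models.normalization import RMSNormNd

    # 64 channels, normalized along dim 1 of an NCHW tensor.
    norm = RMSNormNd(64, eps=1e-5, elementwise_affine=True, bias=True, channel_dim=1)

    x = torch.randn(2, 64, 8, 8)
    y = norm(x)  # shape preserved: (2, 64, 8, 8)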