diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index b3f5f6ec9d89..007b40d50e7f 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -157,6 +157,7 @@ "AutoencoderKLWan", "AutoencoderOobleck", "AutoencoderTiny", + "AutoencoderVidTok", "AutoModel", "CacheMixin", "ChromaTransformer2DModel", @@ -772,6 +773,7 @@ AutoencoderKLWan, AutoencoderOobleck, AutoencoderTiny, + AutoencoderVidTok, AutoModel, CacheMixin, ChromaTransformer2DModel, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 73903a627415..a7469128ec0b 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -41,6 +41,7 @@ _import_structure["autoencoders.autoencoder_kl_wan"] = ["AutoencoderKLWan"] _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"] _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"] + _import_structure["autoencoders.autoencoder_vidtok"] = ["AutoencoderVidTok"] _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] _import_structure["autoencoders.vq_model"] = ["VQModel"] _import_structure["cache_utils"] = ["CacheMixin"] @@ -127,6 +128,7 @@ AutoencoderKLWan, AutoencoderOobleck, AutoencoderTiny, + AutoencoderVidTok, ConsistencyDecoderVAE, VQModel, ) diff --git a/src/diffusers/models/autoencoders/autoencoder_vidtok.py b/src/diffusers/models/autoencoders/autoencoder_vidtok.py new file mode 100644 index 000000000000..b7b316b6b561 --- /dev/null +++ b/src/diffusers/models/autoencoders/autoencoder_vidtok.py @@ -0,0 +1,1502 @@ +# Copyright 2025 The VidTok team, MSRA & Shanghai Jiao Tong University and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import logging +from ...utils.accelerate_utils import apply_forward_hook +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin +from .vae import DecoderOutput, DiagonalGaussianDistribution + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class FSQRegularizer(nn.Module): + r""" + Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505 Code adapted from + https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/finite_scalar_quantization.py + + Args: + levels (`List[int]`): + A list of quantization levels. + dim (`int`, *optional*, defaults to `None`): + The dimension of latent codes. + num_codebooks (`int`, defaults to 1): + The number of codebooks. + keep_num_codebooks_dim (`bool`, *optional*, defaults to `None`): + Whether to keep the number of codebook dim. 
+ """ + + def __init__( + self, + levels: List[int], + dim: Optional[int] = None, + num_codebooks: int = 1, + keep_num_codebooks_dim: Optional[bool] = None, + ): + super().__init__() + + _levels = torch.tensor(levels, dtype=torch.int32) + self.register_buffer("_levels", _levels, persistent=False) + + _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=torch.int32) + self.register_buffer("_basis", _basis, persistent=False) + + codebook_dim = len(levels) + self.codebook_dim = codebook_dim + + effective_codebook_dim = codebook_dim * num_codebooks + self.num_codebooks = num_codebooks + self.effective_codebook_dim = effective_codebook_dim + + if keep_num_codebooks_dim is None: + keep_num_codebooks_dim = num_codebooks > 1 + self.keep_num_codebooks_dim = keep_num_codebooks_dim + self.dim = len(_levels) * num_codebooks if dim is None else dim + + has_projections = self.dim != effective_codebook_dim + self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity() + self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else nn.Identity() + self.has_projections = has_projections + + self.codebook_size = self._levels.prod().item() + + implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out=False) + self.register_buffer("implicit_codebook", implicit_codebook, persistent=False) + self.register_buffer("zero", torch.tensor(0.0), persistent=False) + + self.global_codebook_usage = torch.zeros([2**self.codebook_dim, self.num_codebooks], dtype=torch.long) + + def quantize(self, z: torch.Tensor, eps: float = 1e-3) -> torch.Tensor: + r"""Quantizes z, returns quantized zhat, same shape as z.""" + half_l = (self._levels - 1) * (1 + eps) / 2 + offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) + shift = (offset / half_l).atanh() + z = (z + shift).tanh() * half_l - offset + zhat = z.round() + quantized = z + (zhat - z).detach() + half_width = self._levels // 2 + return quantized / half_width + + def codes_to_indices(self, zhat: torch.Tensor) -> torch.Tensor: + r"""Converts a `code` to an index in the codebook.""" + assert zhat.shape[-1] == self.codebook_dim + half_width = self._levels // 2 + zhat = (zhat * half_width) + half_width + return (zhat * self._basis).sum(dim=-1).to(torch.int32) + + def indices_to_codes(self, indices: torch.Tensor, project_out: bool = True) -> torch.Tensor: + r"""Inverse of `codes_to_indices`.""" + is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim)) + indices = indices.unsqueeze(-1) + codes_non_centered = (indices // self._basis) % self._levels + half_width = self._levels // 2 + codes = (codes_non_centered - half_width) / half_width + if self.keep_num_codebooks_dim: + codes = codes.reshape(*codes.shape[:-2], -1) + if project_out: + codes = self.project_out(codes) + if is_img_or_video: + codes = codes.permute(0, -1, *range(1, codes.dim() - 1)) + return codes + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + einstein notation b - batch n - sequence (or flattened spatial dimensions) d - feature dimension c - number of + codebook dim + """ + is_img_or_video = z.ndim >= 4 + + if is_img_or_video: + if z.ndim == 5: + b, d, t, h, w = z.shape + is_video = True + else: + b, d, h, w = z.shape + is_video = False + z = z.reshape(b, d, -1).permute(0, 2, 1) + + z = self.project_in(z) + b, n, _ = z.shape + z = z.reshape(b, n, self.num_codebooks, -1) + + with torch.autocast("cuda", enabled=False): + orig_dtype = z.dtype + z = z.float() 
+ codes = self.quantize(z) + indices = self.codes_to_indices(codes) + codes = codes.type(orig_dtype) + + codes = codes.reshape(b, n, -1) + out = self.project_out(codes) + + # reconstitute image or video dimensions + if is_img_or_video: + if is_video: + out = out.reshape(b, t, h, w, d).permute(0, 4, 1, 2, 3) + indices = indices.reshape(b, t, h, w, 1) + else: + out = out.reshape(b, h, w, d).permute(0, 3, 1, 2) + indices = indices.reshape(b, h, w, 1) + + if not self.keep_num_codebooks_dim: + indices = indices.squeeze(-1) + + return out, indices + + +class VidTokDownsample2D(nn.Module): + r"""A 2D downsampling layer used in VidTok Model.""" + + def __init__(self, in_channels: int): + super().__init__() + + self.in_channels = in_channels + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + pad = (0, 1, 0, 1) + x = F.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + return x + + +class VidTokUpsample2D(nn.Module): + r"""A 2D upsampling layer used in VidTok Model.""" + + def __init__(self, in_channels: int): + super().__init__() + + self.in_channels = in_channels + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.interpolate(x.to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype) + x = self.conv(x) + return x + + +class VidTokLayerNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + + self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.dim() == 5: + x = x.permute(0, 2, 3, 4, 1) + x = self.norm(x) + x = x.permute(0, 4, 1, 2, 3) + elif x.dim() == 4: + x = x.permute(0, 2, 3, 1) + x = self.norm(x) + x = x.permute(0, 3, 1, 2) + else: + x = x.permute(0, 2, 1) + x = self.norm(x) + x = x.permute(0, 2, 1) + return x + + +class VidTokCausalConv1d(nn.Module): + r"""A 1D causal convolution layer that pads the input tensor to ensure causality in VidTok Model.""" + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + padding: int = 0, + ): + super().__init__() + + self.time_pad = dilation * (kernel_size - 1) + (1 - stride) + + self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x): + r"""The forward method of the `VidTokCausalConv1d` class.""" + if self.is_first_chunk: + first_frame_pad = x[:, :, :1].repeat((1, 1, self.time_pad)) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad :] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + x = torch.concatenate((first_frame_pad, x), dim=2) + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:, :, : -self.cache_offset].clone() + return self.conv(x) + + +class VidTokCausalConv3d(nn.Module): + r"""A 3D causal convolution layer that pads the input tensor to ensure causality in VidTok Model.""" + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int, int]], + stride: Union[int, Tuple[int, int, int]] = 1, + dilation: Union[int, Tuple[int, int, int]] = 1, + padding: Union[int, Tuple[int, int, int]] = 0, + pad_mode: str = "constant", + ): + super().__init__() + 
self.pad_mode = pad_mode + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) * 3 + if isinstance(dilation, int): + dilation = (dilation,) * 3 + if isinstance(stride, int): + stride = (stride,) * 3 + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + time_pad = dilation[0] * (time_kernel_size - 1) + (1 - stride[0]) + height_pad = dilation[1] * (height_kernel_size - 1) + (1 - stride[1]) + width_pad = dilation[2] * (width_kernel_size - 1) + (1 - stride[2]) + + self.time_pad = time_pad + self.spatial_padding = ( + width_pad // 2, + width_pad - width_pad // 2, + height_pad // 2, + height_pad - height_pad // 2, + 0, + 0, + ) + self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation) + + self.is_first_chunk = True + self.causal_cache = None + self.cache_offset = 0 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `VidTokCausalConv3d` class.""" + if self.is_first_chunk: + first_frame_pad = x[:, :, :1, :, :].repeat((1, 1, self.time_pad, 1, 1)) + else: + first_frame_pad = self.causal_cache + if self.time_pad != 0: + first_frame_pad = first_frame_pad[:, :, -self.time_pad :] + else: + first_frame_pad = first_frame_pad[:, :, 0:0] + x = torch.concatenate((first_frame_pad, x), dim=2) + if self.cache_offset == 0: + self.causal_cache = x.clone() + else: + self.causal_cache = x[:, :, : -self.cache_offset].clone() + x = F.pad(x, self.spatial_padding, mode=self.pad_mode) + return self.conv(x) + + +class VidTokDownsample3D(nn.Module): + r"""A 3D downsampling layer used in VidTok Model.""" + + def __init__(self, in_channels: int, out_channels: int, mix_factor: float = 2.0, is_causal: bool = True): + super().__init__() + self.is_causal = is_causal + self.kernel_size = (3, 3, 3) + self.avg_pool = nn.AvgPool3d((3, 1, 1), stride=(2, 1, 1)) + make_conv_cls = VidTokCausalConv3d if self.is_causal else nn.Conv3d + self.conv = make_conv_cls(in_channels, out_channels, 3, stride=(2, 1, 1), padding=(0, 1, 1)) + self.mix_factor = nn.Parameter(torch.Tensor([mix_factor])) + if self.is_causal: + self.is_first_chunk = True + self.causal_cache = None + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `VidTokDownsample3D` class.""" + alpha = torch.sigmoid(self.mix_factor) + if self.is_causal: + pad = (0, 0, 0, 0, 1, 0) + if self.is_first_chunk: + x_pad = torch.nn.functional.pad(x, pad, mode="replicate") + else: + x_pad = torch.concatenate((self.causal_cache, x), dim=2) + self.causal_cache = x_pad[:, :, -1:].clone() + x1 = self.avg_pool(x_pad) + else: + pad = (0, 0, 0, 0, 0, 1) + x = F.pad(x, pad, mode="constant", value=0) + x1 = self.avg_pool(x) + x2 = self.conv(x) + return alpha * x1 + (1 - alpha) * x2 + + +class VidTokUpsample3D(nn.Module): + r"""A 3D upsampling layer used in VidTok Model.""" + + def __init__( + self, + in_channels: int, + out_channels: int, + mix_factor: float = 2.0, + num_temp_upsample: int = 1, + is_causal: bool = True, + ): + super().__init__() + make_conv_cls = VidTokCausalConv3d if is_causal else nn.Conv3d + self.conv = make_conv_cls(in_channels, out_channels, 3, padding=1) + self.mix_factor = nn.Parameter(torch.Tensor([mix_factor])) + + self.is_causal = is_causal + if self.is_causal: + self.enable_cached = True + self.interpolation_mode = "trilinear" + self.is_first_chunk = True + self.causal_cache = None + self.num_temp_upsample = num_temp_upsample + else: + self.enable_cached = False + self.interpolation_mode = "nearest" + + def forward(self, x: 
torch.Tensor) -> torch.Tensor: + r"""The forward method of the `VidTokUpsample3D` class.""" + alpha = torch.sigmoid(self.mix_factor) + if not self.is_causal: + xlst = [ + F.interpolate( + sx.unsqueeze(0).to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode + ).to(x.dtype) + for sx in x + ] + x = torch.cat(xlst, dim=0) + else: + if not self.enable_cached: + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to( + x.dtype + ) + elif not self.is_first_chunk: + x = torch.cat([self.causal_cache, x], dim=2) + self.causal_cache = x[:, :, -2 * self.num_temp_upsample : -self.num_temp_upsample].clone() + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to( + x.dtype + ) + x = x[:, :, 2 * self.num_temp_upsample :] + else: + self.causal_cache = x[:, :, -self.num_temp_upsample :].clone() + x, _x = x[:, :, : self.num_temp_upsample], x[:, :, self.num_temp_upsample :] + x = F.interpolate(x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode).to( + x.dtype + ) + if _x.shape[-3] > 0: + _x = F.interpolate( + _x.to(torch.float32), scale_factor=[2.0, 1.0, 1.0], mode=self.interpolation_mode + ).to(_x.dtype) + x = torch.concat([x, _x], dim=2) + x_ = self.conv(x) + return alpha * x + (1 - alpha) * x_ + + +class VidTokAttnBlock(nn.Module): + r"""A 2D self-attention block used in VidTok Model.""" + + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + self.norm = VidTokLayerNorm(dim=in_channels, eps=1e-6) + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def attention(self, hidden_states: torch.Tensor) -> torch.Tensor: + r"""Implement self-attention.""" + hidden_states = self.norm(hidden_states) + q = self.q(hidden_states) + k = self.k(hidden_states) + v = self.v(hidden_states) + b, c, h, w = q.shape + q, k, v = [x.permute(0, 2, 3, 1).reshape(b, -1, c).unsqueeze(1).contiguous() for x in [q, k, v]] + hidden_states = F.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return hidden_states.squeeze(1).reshape(b, h, w, c).permute(0, 3, 1, 2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `VidTokAttnBlock` class.""" + hidden_states = x + hidden_states = self.attention(hidden_states) + hidden_states = self.proj_out(hidden_states) + return x + hidden_states + + +class VidTokAttnBlockWrapper(VidTokAttnBlock): + r"""A 3D self-attention block used in VidTok Model.""" + + def __init__(self, in_channels: int, is_causal: bool = True): + super().__init__(in_channels) + make_conv_cls = VidTokCausalConv3d if is_causal else nn.Conv3d + self.q = make_conv_cls(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = make_conv_cls(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = make_conv_cls(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = make_conv_cls(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def attention(self, hidden_states: torch.Tensor) -> torch.Tensor: + r"""Implement self-attention.""" + hidden_states = self.norm(hidden_states) + q = self.q(hidden_states) + k = self.k(hidden_states) + v = 
self.v(hidden_states) + b, c, t, h, w = q.shape + q, k, v = [x.permute(0, 2, 3, 4, 1).reshape(b, t, -1, c).contiguous() for x in [q, k, v]] + hidden_states = F.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + return hidden_states.reshape(b, t, h, w, c).permute(0, 4, 1, 2, 3) + + +class VidTokResnetBlock(nn.Module): + r"""A versatile ResNet block used in VidTok Model.""" + + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + conv_shortcut: bool = False, + dropout: float = 0.0, + temb_channels: int = 512, + btype: str = "3d", + is_causal: bool = True, + ): + super().__init__() + assert btype in ["1d", "2d", "3d"], f"Invalid btype: {btype}" + if btype == "2d": + make_conv_cls = nn.Conv2d + elif btype == "1d": + make_conv_cls = VidTokCausalConv1d if is_causal else nn.Conv1d + else: + make_conv_cls = VidTokCausalConv3d if is_causal else nn.Conv3d + + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.nonlinearity = nn.SiLU() + + self.norm1 = VidTokLayerNorm(dim=in_channels, eps=1e-6) + self.conv1 = make_conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = nn.Linear(temb_channels, out_channels) + self.norm2 = VidTokLayerNorm(dim=out_channels, eps=1e-6) + self.dropout = nn.Dropout(dropout) + self.conv2 = make_conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = make_conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = make_conv_cls(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x: torch.Tensor, temb: Optional[torch.Tensor]) -> torch.Tensor: + r"""The forward method of the `VidTokResnetBlock` class.""" + hidden_states = x + hidden_states = self.norm1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.conv1(hidden_states) + + if temb is not None: + hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None] + + hidden_states = self.norm2(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x + hidden_states + + +class VidTokEncoder3D(nn.Module): + r""" + The `VidTokEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation. + + Args: + in_channels (`int`): + The number of input channels. + ch (`int`): + The number of the basic channel. + ch_mult (`List[int]`, defaults to `[1, 2, 4, 8]`): + The multiple of the basic channel for each block. + num_res_blocks (`int`, defaults to 2): + The number of resblocks. + dropout (`float`, defaults to 0.0): + Dropout rate. + z_channels (`int`, defaults to 4): + The number of latent channels. + double_z (`bool`, defaults to `True`): + Whether or not to double the z_channels. + spatial_ds (`List`, *optional*, defaults to `None`): + Spatial downsample layers. + tempo_ds (`List`, *optional*, defaults to `None`): + Temporal downsample layers. + is_causal (`bool`, defaults to `True`): + Whether it is a causal module. 
+ """ + + def __init__( + self, + in_channels: int, + ch: int, + ch_mult: List[int] = [1, 2, 4, 8], + num_res_blocks: int = 2, + dropout: float = 0.0, + z_channels: int = 4, + double_z: bool = True, + spatial_ds: Optional[List] = None, + tempo_ds: Optional[List] = None, + is_causal: bool = True, + ): + super().__init__() + self.is_causal = is_causal + + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_channels + self.nonlinearity = nn.SiLU() + + make_conv_cls = VidTokCausalConv3d if self.is_causal else nn.Conv3d + + self.conv_in = make_conv_cls(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.spatial_ds = list(range(0, self.num_resolutions - 1)) if spatial_ds is None else spatial_ds + self.tempo_ds = [self.num_resolutions - 2, self.num_resolutions - 3] if tempo_ds is None else tempo_ds + self.down = nn.ModuleList() + self.down_temporal = nn.ModuleList() + for i_level in range(self.num_resolutions): + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + + block = nn.ModuleList() + attn = nn.ModuleList() + block_temporal = nn.ModuleList() + attn_temporal = nn.ModuleList() + + for i_block in range(self.num_res_blocks): + block.append( + VidTokResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + btype="2d", + ) + ) + block_temporal.append( + VidTokResnetBlock( + in_channels=block_out, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + btype="1d", + is_causal=self.is_causal, + ) + ) + block_in = block_out + + down = nn.Module() + down.block = block + down.attn = attn + + down_temporal = nn.Module() + down_temporal.block = block_temporal + down_temporal.attn = attn_temporal + + if i_level in self.spatial_ds: + down.downsample = VidTokDownsample2D(block_in) + if i_level in self.tempo_ds: + down_temporal.downsample = VidTokDownsample3D(block_in, block_in, is_causal=self.is_causal) + + self.down.append(down) + self.down_temporal.append(down_temporal) + + # middle + self.mid = nn.Module() + self.mid.block_1 = VidTokResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + btype="3d", + is_causal=self.is_causal, + ) + self.mid.attn_1 = VidTokAttnBlockWrapper(block_in, is_causal=self.is_causal) + self.mid.block_2 = VidTokResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + btype="3d", + is_causal=self.is_causal, + ) + + # end + self.norm_out = VidTokLayerNorm(dim=block_in, eps=1e-6) + self.conv_out = make_conv_cls( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + self.gradient_checkpointing = False + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `VidTokEncoder3D` class.""" + temb = None + B, _, T, H, W = x.shape + hs = [self.conv_in(x)] + + if torch.is_grad_enabled() and self.gradient_checkpointing: + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + hidden_states = hs[-1].permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W) + hidden_states = self._gradient_checkpointing_func( + self.down[i_level].block[i_block], hidden_states, temb + ) + hidden_states = ( + hidden_states.reshape(B, T, -1, H, W).permute(0, 3, 4, 2, 1).reshape(B * H * W, -1, T) + ) + hidden_states = 
self._gradient_checkpointing_func( + self.down_temporal[i_level].block[i_block], hidden_states, temb + ) + hidden_states = hidden_states.reshape(B, H, W, -1, T).permute(0, 3, 4, 1, 2) + hs.append(hidden_states) + + if i_level in self.spatial_ds: + # spatial downsample + hidden_states = hs[-1].permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W) + hidden_states = self._gradient_checkpointing_func(self.down[i_level].downsample, hidden_states) + hidden_states = hidden_states.reshape(B, T, -1, *hidden_states.shape[-2:]).permute(0, 2, 1, 3, 4) + if i_level in self.tempo_ds: + # temporal downsample + hidden_states = self._gradient_checkpointing_func( + self.down_temporal[i_level].downsample, hidden_states + ) + hs.append(hidden_states) + B, _, T, H, W = hidden_states.shape + # middle + hidden_states = hs[-1] + hidden_states = self._gradient_checkpointing_func(self.mid.block_1, hidden_states, temb) + hidden_states = self._gradient_checkpointing_func(self.mid.attn_1, hidden_states) + hidden_states = self._gradient_checkpointing_func(self.mid.block_2, hidden_states, temb) + + else: + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + hidden_states = hs[-1].permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W) + hidden_states = self.down[i_level].block[i_block](hidden_states, temb) + hidden_states = ( + hidden_states.reshape(B, T, -1, H, W).permute(0, 3, 4, 2, 1).reshape(B * H * W, -1, T) + ) + hidden_states = self.down_temporal[i_level].block[i_block](hidden_states, temb) + hidden_states = hidden_states.reshape(B, H, W, -1, T).permute(0, 3, 4, 1, 2) + hs.append(hidden_states) + + if i_level in self.spatial_ds: + # spatial downsample + hidden_states = hs[-1].permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W) + hidden_states = self.down[i_level].downsample(hidden_states) + hidden_states = hidden_states.reshape(B, T, -1, *hidden_states.shape[-2:]).permute(0, 2, 1, 3, 4) + if i_level in self.tempo_ds: + # temporal downsample + hidden_states = self.down_temporal[i_level].downsample(hidden_states) + hs.append(hidden_states) + B, _, T, H, W = hidden_states.shape + # middle + hidden_states = hs[-1] + hidden_states = self.mid.block_1(hidden_states, temb) + hidden_states = self.mid.attn_1(hidden_states) + hidden_states = self.mid.block_2(hidden_states, temb) + + # end + hidden_states = self.norm_out(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.conv_out(hidden_states) + return hidden_states + + +class VidTokDecoder3D(nn.Module): + r""" + The `VidTokDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output + video. + + Args: + ch (`int`): + The number of the basic channel. + ch_mult (`List[int]`, defaults to `[1, 2, 4, 8]`): + The multiple of the basic channel for each block. + num_res_blocks (`int`, defaults to 2): + The number of resblocks. + dropout (`float`, defaults to 0.0): + Dropout rate. + z_channels (`int`, defaults to 4): + The number of latent channels. + out_channels (`int`, defaults to 3): + The number of output channels. + spatial_us (`List`, *optional*, defaults to `None`): + Spatial upsample layers. + tempo_us (`List`, *optional*, defaults to `None`): + Temporal upsample layers. + is_causal (`bool`, defaults to `True`): + Whether it is a causal module. 
+ """ + + def __init__( + self, + ch: int, + ch_mult: List[int] = [1, 2, 4, 8], + num_res_blocks: int = 2, + dropout: float = 0.0, + z_channels: int = 4, + out_channels: int = 3, + spatial_us: Optional[List] = None, + tempo_us: Optional[List] = None, + is_causal: bool = True, + ): + super().__init__() + + self.is_causal = is_causal + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.nonlinearity = nn.SiLU() + + block_in = ch * ch_mult[self.num_resolutions - 1] + + make_conv_cls = VidTokCausalConv3d if self.is_causal else nn.Conv3d + + self.conv_in = make_conv_cls(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = VidTokResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + btype="3d", + is_causal=self.is_causal, + ) + self.mid.attn_1 = VidTokAttnBlockWrapper(block_in, is_causal=self.is_causal) + self.mid.block_2 = VidTokResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + btype="3d", + is_causal=self.is_causal, + ) + + # upsampling + self.spatial_us = list(range(1, self.num_resolutions)) if spatial_us is None else spatial_us + self.tempo_us = [1, 2] if tempo_us is None else tempo_us + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + VidTokResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + btype="2d", + ) + ) + block_in = block_out + + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.spatial_us: + up.upsample = VidTokUpsample2D(block_in) + self.up.insert(0, up) + + num_temp_upsample = 1 + self.up_temporal = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + VidTokResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + btype="1d", + is_causal=self.is_causal, + ) + ) + block_in = block_out + up_temporal = nn.Module() + up_temporal.block = block + up_temporal.attn = attn + if i_level in self.tempo_us: + up_temporal.upsample = VidTokUpsample3D( + block_in, block_in, num_temp_upsample=num_temp_upsample, is_causal=self.is_causal + ) + num_temp_upsample *= 2 + + self.up_temporal.insert(0, up_temporal) + + # end + self.norm_out = VidTokLayerNorm(dim=block_in, eps=1e-6) + self.conv_out = make_conv_cls(block_in, out_channels, kernel_size=3, stride=1, padding=1) + + self.gradient_checkpointing = False + + def forward(self, z: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `VidTokDecoder3D` class.""" + temb = None + B, _, T, H, W = z.shape + hidden_states = self.conv_in(z) + + if torch.is_grad_enabled() and self.gradient_checkpointing: + # middle + hidden_states = self._gradient_checkpointing_func(self.mid.block_1, hidden_states, temb) + hidden_states = self._gradient_checkpointing_func(self.mid.attn_1, hidden_states) + hidden_states = self._gradient_checkpointing_func(self.mid.block_2, hidden_states, temb) + + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 
1):
+                    hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W)
+                    hidden_states = self._gradient_checkpointing_func(
+                        self.up[i_level].block[i_block], hidden_states, temb
+                    )
+                    hidden_states = (
+                        hidden_states.reshape(B, T, -1, H, W).permute(0, 3, 4, 2, 1).reshape(B * H * W, -1, T)
+                    )
+                    hidden_states = self._gradient_checkpointing_func(
+                        self.up_temporal[i_level].block[i_block], hidden_states, temb
+                    )
+                    hidden_states = hidden_states.reshape(B, H, W, -1, T).permute(0, 3, 4, 1, 2)
+
+                if i_level in self.spatial_us:
+                    # spatial upsample
+                    hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W)
+                    hidden_states = self._gradient_checkpointing_func(self.up[i_level].upsample, hidden_states)
+                    hidden_states = hidden_states.reshape(B, T, -1, *hidden_states.shape[-2:]).permute(0, 2, 1, 3, 4)
+                if i_level in self.tempo_us:
+                    # temporal upsample
+                    hidden_states = self._gradient_checkpointing_func(
+                        self.up_temporal[i_level].upsample, hidden_states
+                    )
+                B, _, T, H, W = hidden_states.shape
+
+        else:
+            # middle
+            hidden_states = self.mid.block_1(hidden_states, temb)
+            hidden_states = self.mid.attn_1(hidden_states)
+            hidden_states = self.mid.block_2(hidden_states, temb)
+
+            for i_level in reversed(range(self.num_resolutions)):
+                for i_block in range(self.num_res_blocks + 1):
+                    hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W)
+                    hidden_states = self.up[i_level].block[i_block](hidden_states, temb)
+                    hidden_states = (
+                        hidden_states.reshape(B, T, -1, H, W).permute(0, 3, 4, 2, 1).reshape(B * H * W, -1, T)
+                    )
+                    hidden_states = self.up_temporal[i_level].block[i_block](hidden_states, temb)
+                    hidden_states = hidden_states.reshape(B, H, W, -1, T).permute(0, 3, 4, 1, 2)
+
+                if i_level in self.spatial_us:
+                    # spatial upsample
+                    hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(B * T, -1, H, W)
+                    hidden_states = self.up[i_level].upsample(hidden_states)
+                    hidden_states = hidden_states.reshape(B, T, -1, *hidden_states.shape[-2:]).permute(0, 2, 1, 3, 4)
+                if i_level in self.tempo_us:
+                    # temporal upsample
+                    hidden_states = self.up_temporal[i_level].upsample(hidden_states)
+                B, _, T, H, W = hidden_states.shape
+
+        # end
+        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        out = self.conv_out(hidden_states)
+        return out
+
+
+class AutoencoderVidTok(ModelMixin, ConfigMixin):
+    r"""
+    A VAE model for encoding videos into latents and decoding latent representations into videos, supporting both
+    continuous and discrete latent representations. Used in [VidTok](https://github.com/microsoft/VidTok).
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+
+    Args:
+        in_channels (`int`, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, defaults to 3):
+            The number of output channels.
+        ch (`int`, defaults to 128):
+            The number of base channels.
+        ch_mult (`List[int]`, defaults to `[1, 2, 4, 4]`):
+            The channel multiplier for each block.
+        z_channels (`int`, defaults to 4):
+            The number of latent channels.
+        double_z (`bool`, defaults to `True`):
+            Whether or not to double the z_channels.
+        num_res_blocks (`int`, defaults to 2):
+            The number of ResNet blocks.
+        spatial_ds (`List`, *optional*, defaults to `None`):
+            Indices of the blocks that apply spatial downsampling.
+        spatial_us (`List`, *optional*, defaults to `None`):
+            Indices of the blocks that apply spatial upsampling.
+        tempo_ds (`List`, *optional*, defaults to `None`):
+            Indices of the blocks that apply temporal downsampling.
+        tempo_us (`List`, *optional*, defaults to `None`):
+            Indices of the blocks that apply temporal upsampling.
+        dropout (`float`, defaults to 0.0):
+            Dropout rate.
+        regularizer (`str`, defaults to `"kl"`):
+            The regularizer type - "kl" for continuous latents and "fsq" for discrete latents.
+        codebook_size (`int`, defaults to 262144):
+            The codebook size, used only in the discrete case.
+        is_causal (`bool`, defaults to `True`):
+            Whether the model is causal.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        ch: int = 128,
+        ch_mult: List[int] = [1, 2, 4, 4],
+        z_channels: int = 4,
+        double_z: bool = True,
+        num_res_blocks: int = 2,
+        spatial_ds: Optional[List] = None,
+        spatial_us: Optional[List] = None,
+        tempo_ds: Optional[List] = None,
+        tempo_us: Optional[List] = None,
+        dropout: float = 0.0,
+        regularizer: str = "kl",
+        codebook_size: int = 262144,
+        is_causal: bool = True,
+    ):
+        super().__init__()
+        self.is_causal = is_causal
+
+        self.encoder = VidTokEncoder3D(
+            in_channels=in_channels,
+            ch=ch,
+            ch_mult=ch_mult,
+            num_res_blocks=num_res_blocks,
+            dropout=dropout,
+            z_channels=z_channels,
+            double_z=double_z,
+            spatial_ds=spatial_ds,
+            tempo_ds=tempo_ds,
+            is_causal=self.is_causal,
+        )
+        self.decoder = VidTokDecoder3D(
+            ch=ch,
+            ch_mult=ch_mult,
+            num_res_blocks=num_res_blocks,
+            dropout=dropout,
+            z_channels=z_channels,
+            out_channels=out_channels,
+            spatial_us=spatial_us,
+            tempo_us=tempo_us,
+            is_causal=self.is_causal,
+        )
+        self.temporal_compression_ratio = 2 ** len(self.encoder.tempo_ds)
+
+        self.regularizer = regularizer
+        assert self.regularizer in ["kl", "fsq"], f"Invalid regularizer: {self.regularizer}. Only 'kl' and 'fsq' are supported."
+
+        if self.regularizer == "fsq":
+            assert z_channels == int(math.log(codebook_size, 8)) and double_z is False
+            self.regularization = FSQRegularizer(levels=[8] * z_channels)
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        # Number of frames processed per temporal chunk, in sample space and in latent space
+        self.num_sample_frames_batch_size = 16
+        self.num_latent_frames_batch_size = self.num_sample_frames_batch_size // self.temporal_compression_ratio
+
+        # The minimum tile height and width are set to half of the generally supported sample size
+        self.tile_sample_min_height = 256
+        self.tile_sample_min_width = 256
+        self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds)))
+        self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds)))
+        self.tile_overlap_factor_height = 0.0  # 1 / 8
+        self.tile_overlap_factor_width = 0.0  # 1 / 8
+
+    @staticmethod
+    def _pad_at_dim(
+        t: torch.Tensor, pad: Tuple[int], dim: int = -1, pad_mode: str = "constant", value: float = 0.0
+    ) -> torch.Tensor:
+        r"""Pad function. Supported pad_mode: `constant`, `replicate`, `reflect`."""
+        dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1)
+        zeros = (0, 0) * dims_from_right
+        if pad_mode == "constant":
+            return F.pad(t, (*zeros, *pad), value=value)
+        return F.pad(t, (*zeros, *pad), mode=pad_mode)
+
+    def enable_tiling(
+        self,
+        tile_sample_min_height: Optional[int] = None,
+        tile_sample_min_width: Optional[int] = None,
+        tile_overlap_factor_height: Optional[float] = None,
+        tile_overlap_factor_width: Optional[float] = None,
+    ) -> None:
+        r"""
+        Enable tiled VAE decoding.
When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + + Args: + tile_sample_min_height (`int`, *optional*, defaults to `None`): + The minimum height required for a sample to be separated into tiles across the height dimension. + tile_sample_min_width (`int`, *optional*, defaults to `None`): + The minimum width required for a sample to be separated into tiles across the width dimension. + tile_overlap_factor_height (`float`, *optional*, defaults to `None`): + The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are + no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher + value might cause more tiles to be processed leading to slow down of the decoding process. + tile_overlap_factor_width (`float`, *optional*, defaults to `None`): + The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there + are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher + value might cause more tiles to be processed leading to slow down of the decoding process. + """ + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int(self.tile_sample_min_height / (2 ** len(self.encoder.spatial_ds))) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** len(self.encoder.spatial_ds))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_tiling = False + + def enable_slicing(self) -> None: + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self) -> None: + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + def _encode(self, x: torch.Tensor) -> torch.Tensor: + self._empty_causal_cached(self.encoder) + self._set_first_chunk(True) + + if self.use_tiling: + return self.tiled_encode(x) + return self.encoder(x) + + @apply_forward_hook + def encode(self, x: torch.Tensor) -> Union[AutoencoderKLOutput, Tuple[torch.Tensor]]: + r""" + Encode a batch of images into latents. + + Args: + x (`torch.Tensor`): Input batch of images. + + Returns: + `AutoencoderKLOutput` or `Tuple[torch.Tensor]`: + The latent representations of the encoded videos. If the regularizer is `kl`, an `AutoencoderKLOutput` + is returned, otherwise a tuple of `torch.Tensor` is returned. 
+ """ + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] + z = torch.cat(encoded_slices) + else: + z = self._encode(x) + + if self.regularizer == "kl": + posterior = DiagonalGaussianDistribution(z) + return AutoencoderKLOutput(latent_dist=posterior) + else: + quant_z, indices = self.regularization(z) + return quant_z, indices + + def _decode(self, z: torch.Tensor, decode_from_indices: bool = False) -> torch.Tensor: + self._empty_causal_cached(self.decoder) + self._set_first_chunk(True) + if not self.is_causal and z.shape[-3] % self.num_latent_frames_batch_size != 0: + assert ( + z.shape[-3] >= self.num_latent_frames_batch_size + ), f"Too short latent frames. At least {self.num_latent_frames_batch_size} frames." + z = z[..., : (z.shape[-3] // self.num_latent_frames_batch_size * self.num_latent_frames_batch_size), :, :] + if decode_from_indices: + z = self.tile_indices_to_latent(z) if self.use_tiling else self.indices_to_latent(z) + dec = self.tiled_decode(z) if self.use_tiling else self.decoder(z) + return dec + + @apply_forward_hook + def decode(self, z: torch.Tensor, decode_from_indices: bool = False) -> torch.Tensor: + r""" + Decode a batch of images from latents. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + decode_from_indices (`bool`): If decode from indices or decode from latent code. + Returns: + `torch.Tensor`: The decoded images. + """ + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice, decode_from_indices=decode_from_indices) for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z, decode_from_indices=decode_from_indices) + if self.is_causal: + decoded = decoded[:, :, self.temporal_compression_ratio - 1 :, :, :] + return decoded + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def build_chunk_start_end(self, t, decoder_mode=False): + if self.is_causal: + start_end = [[0, self.temporal_compression_ratio]] if not decoder_mode else [[0, 1]] + start = start_end[0][-1] + else: + start_end, start = [], 0 + end = start + while True: + if start >= t: + break + end = min( + t, end + (self.num_latent_frames_batch_size if decoder_mode else self.num_sample_frames_batch_size) + ) + start_end.append([start, end]) + start = end + if len(start_end) > (2 if self.is_causal else 1): + if start_end[-1][1] - start_end[-1][0] < ( + self.num_latent_frames_batch_size if decoder_mode else self.num_sample_frames_batch_size + ): + start_end[-2] = [start_end[-2][0], start_end[-1][1]] + start_end = start_end[:-1] + return start_end + + def _set_first_chunk(self, is_first_chunk=True): + for module in self.modules(): + if hasattr(module, "is_first_chunk"): + module.is_first_chunk = is_first_chunk + + def _empty_causal_cached(self, parent): + for name, module in parent.named_modules(): + if hasattr(module, "causal_cache"): + module.causal_cache = None + + def 
_set_cache_offset(self, modules, cache_offset=0): + for module in modules: + for submodule in module.modules(): + if hasattr(submodule, "cache_offset"): + submodule.cache_offset = cache_offset + + def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + r""" + Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of videos. + + Returns: + `torch.Tensor`: The latent representation of the encoded videos. + """ + num_frames, height, width = x.shape[-3:] + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + + # Split x into overlapping tiles and encode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + start_end = self.build_chunk_start_end(num_frames) + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile = self.encoder(tile) + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + enc = torch.cat(result_rows, dim=3) + return enc + + def indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + r""" + Transform indices to latent code. + + Args: + token_indices (`torch.Tensor`): Token indices. + + Returns: + `torch.Tensor`: Latent code corresponding to the input token indices. + """ + b, t, h, w = token_indices.shape + token_indices = token_indices.unsqueeze(-1).reshape(b, -1, 1) + codes = self.regularization.indices_to_codes(token_indices) + codes = codes.permute(0, 2, 3, 1).reshape(b, codes.shape[2], -1) + z = self.regularization.project_out(codes) + return z.reshape(b, t, h, w, -1).permute(0, 4, 1, 2, 3) + + def tile_indices_to_latent(self, token_indices: torch.Tensor) -> torch.Tensor: + r""" + Transform indices to latent code with tiling inference. + + Args: + token_indices (`torch.Tensor`): Token indices. + + Returns: + `torch.Tensor`: Latent code corresponding to the input token indices. 
+ """ + num_frames = token_indices.shape[1] + start_end = self.build_chunk_start_end(num_frames, decoder_mode=True) + result_z = [] + for start, end in start_end: + chunk_z = self.indices_to_latent(token_indices[:, start:end, :, :]) + result_z.append(chunk_z.clone()) + return torch.cat(result_z, dim=2) + + def tiled_decode(self, z: torch.Tensor) -> torch.Tensor: + r""" + Decode a batch of images using a tiled decoder. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + + Returns: + `torch.Tensor`: Reconstructed batch of videos. + """ + num_frames, height, width = z.shape[-3:] + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + if self.is_causal: + assert self.temporal_compression_ratio in [ + 2, + 4, + 8, + ], "Only support 2x, 4x or 8x temporal downsampling now." + if self.temporal_compression_ratio == 4: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 2) + self._set_cache_offset( + [self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], + 4, + ) + elif self.temporal_compression_ratio == 2: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset( + [ + self.decoder.up_temporal[2].upsample, + self.decoder.up_temporal[1], + self.decoder.up_temporal[0], + self.decoder.conv_out, + ], + 2, + ) + else: + self._set_cache_offset([self.decoder], 1) + self._set_cache_offset([self.decoder.up_temporal[3].upsample, self.decoder.up_temporal[2]], 2) + self._set_cache_offset([self.decoder.up_temporal[2].upsample, self.decoder.up_temporal[1]], 4) + self._set_cache_offset( + [self.decoder.up_temporal[1].upsample, self.decoder.up_temporal[0], self.decoder.conv_out], + 8, + ) + + start_end = self.build_chunk_start_end(num_frames, decoder_mode=True) + time = [] + for idx, (start_frame, end_frame) in enumerate(start_end): + self._set_first_chunk(idx == 0) + tile = z[ + :, + :, + start_frame : (end_frame + 1 if self.is_causal and end_frame + 1 <= num_frames else end_frame), + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + tile = self.decoder(tile) + if self.is_causal and end_frame + 1 <= num_frames: + tile = tile[:, :, : -self.temporal_compression_ratio] + time.append(tile) + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + return 
dec
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        sample_posterior: bool = True,
+        encoder_mode: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[torch.Tensor, DecoderOutput]:
+        r"""The forward method of the `AutoencoderVidTok` class."""
+        x = sample
+        res = 1 if self.is_causal else 0
+        if self.is_causal:
+            if x.shape[2] % self.temporal_compression_ratio != res:
+                time_padding = self.temporal_compression_ratio - x.shape[2] % self.temporal_compression_ratio + res
+                x = self._pad_at_dim(x, (0, time_padding), dim=2, pad_mode="replicate")
+            else:
+                time_padding = 0
+        else:
+            if x.shape[2] % self.num_sample_frames_batch_size != res:
+                if not encoder_mode:
+                    time_padding = (
+                        self.num_sample_frames_batch_size - x.shape[2] % self.num_sample_frames_batch_size + res
+                    )
+                    x = self._pad_at_dim(x, (0, time_padding), dim=2, pad_mode="replicate")
+                else:
+                    assert (
+                        x.shape[2] >= self.num_sample_frames_batch_size
+                    ), f"Video is too short: at least {self.num_sample_frames_batch_size} frames are required."
+                    x = x[:, :, : x.shape[2] // self.num_sample_frames_batch_size * self.num_sample_frames_batch_size]
+            else:
+                time_padding = 0
+
+        if self.is_causal:
+            x = self._pad_at_dim(x, (self.temporal_compression_ratio - 1, 0), dim=2, pad_mode="replicate")
+
+        if self.regularizer == "kl":
+            posterior = self.encode(x).latent_dist
+            if sample_posterior:
+                z = posterior.sample(generator=generator)
+            else:
+                z = posterior.mode()
+            if encoder_mode:
+                return z
+        else:
+            z, indices = self.encode(x)
+            if encoder_mode:
+                return z, indices
+
+        dec = self.decode(z)
+        if time_padding != 0:
+            dec = dec[:, :, :-time_padding, :, :]
+
+        if not return_dict:
+            return dec
+        return DecoderOutput(sample=dec)
diff --git a/tests/models/autoencoders/test_models_autoencoder_vidtok.py b/tests/models/autoencoders/test_models_autoencoder_vidtok.py
new file mode 100644
index 000000000000..cca950ab312c
--- /dev/null
+++ b/tests/models/autoencoders/test_models_autoencoder_vidtok.py
@@ -0,0 +1,158 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +import torch + +from diffusers import AutoencoderVidTok +from diffusers.utils.testing_utils import ( + floats_tensor, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +class AutoencoderVidTokTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderVidTok + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_vidtok_config(self): + return { + "is_causal": False, + "in_channels": 3, + "out_channels": 3, + "ch": 128, + "ch_mult": [1, 2, 4, 4, 4], + "z_channels": 6, + "double_z": False, + "num_res_blocks": 2, + "regularizer": "fsq", + "codebook_size": 262144, + } + + @property + def dummy_input(self): + batch_size = 4 + num_frames = 16 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 16, 32, 32) + + @property + def output_shape(self): + return (3, 16, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_vidtok_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + def test_gradient_checkpointing_is_applied(self): + expected_set = { + "VidTokEncoder3D", + "VidTokDecoder3D", + } + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_forward_with_norm_groups(self): + r"""VidTok uses layernorm instead of groupnorm.""" + init_dict, 
inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.to_tuple()[0] + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + @unittest.skip("Unsupported test.") + def test_outputs_equivalence(self): + pass
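
A note on the quantizer math: `FSQRegularizer.quantize` bounds each latent channel with a shifted `tanh`, rounds it onto an integer grid of `levels[i]` points, and routes gradients around the non-differentiable rounding with a straight-through estimator. A minimal standalone sketch of just that step, assuming `levels = [8, 8, 8]` (the model above uses `[8] * z_channels`):

```python
import torch

levels = torch.tensor([8, 8, 8], dtype=torch.int32)

def fsq_quantize(z: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
    # Bound each channel into its level range; the shift recenters even level counts.
    half_l = (levels - 1) * (1 + eps) / 2
    offset = torch.where(levels % 2 == 0, 0.5, 0.0)
    shift = (offset / half_l).atanh()
    z = (z + shift).tanh() * half_l - offset
    # Round to the integer grid; the straight-through trick keeps gradients flowing.
    zhat = z.round()
    quantized = z + (zhat - z).detach()
    # Normalize so the codes land on a fixed grid inside [-1, 1].
    return quantized / (levels // 2)

codes = fsq_quantize(torch.randn(2, 5, 3))  # (batch, tokens, code dim)
grid = torch.unique((codes * (levels // 2)).round())
print(grid.tolist())  # at most 8 integer grid positions per channel
```

Each point of the resulting `8 * 8 * 8` grid corresponds to one codebook entry, which is what `codes_to_indices` and `indices_to_codes` convert between.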
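An end-to-end round trip for the continuous (`"kl"`) causal variant, a sketch assuming this branch is installed. The `ch=64` width below is an arbitrary smoke-test choice rather than a released configuration; with the default `spatial_ds`/`tempo_ds`, this config compresses 8x spatially and 4x temporally:

```python
import torch

from diffusers import AutoencoderVidTok

vae = AutoencoderVidTok(
    in_channels=3,
    out_channels=3,
    ch=64,  # scaled down from the default 128 for speed
    ch_mult=[1, 2, 4, 4],
    z_channels=4,
    double_z=True,
    regularizer="kl",
    is_causal=True,
).eval()

# Causal models want T % temporal_compression_ratio == 1, so 17 frames need no padding.
video = torch.randn(1, 3, 17, 64, 64)  # (B, C, T, H, W)

with torch.no_grad():
    z = vae(video, sample_posterior=False, encoder_mode=True)  # latents only
    recon = vae(video, sample_posterior=False).sample          # full round trip

print(z.shape)      # expected (1, 4, 5, 8, 8) under this config
print(recon.shape)  # expected (1, 3, 17, 64, 64)
```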
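The discrete (`"fsq"`) path instead returns quantized latents plus token indices from `encode`, and `decode(..., decode_from_indices=True)` reconstructs straight from the indices. Again a sketch with an arbitrary small width; `codebook_size = 262144 = 8**6` requires `z_channels=6` and `double_z=False`:

```python
import torch

from diffusers import AutoencoderVidTok

fsq_vae = AutoencoderVidTok(
    ch=64,
    ch_mult=[1, 2, 4, 4],
    z_channels=6,
    double_z=False,
    regularizer="fsq",
    codebook_size=262144,
    is_causal=False,
).eval()

# Non-causal models consume frames in multiples of num_sample_frames_batch_size (16).
video = torch.randn(1, 3, 16, 64, 64)

with torch.no_grad():
    z, indices = fsq_vae(video, encoder_mode=True)             # quantized latents + token ids
    recon = fsq_vae.decode(indices, decode_from_indices=True)  # decode from tokens alone

print(z.shape, indices.shape)  # expected (1, 6, 4, 8, 8) and (1, 4, 8, 8)
print(recon.shape)             # expected (1, 3, 16, 64, 64)
```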
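The temporal chunking used by the tiled paths can be inspected through `build_chunk_start_end`. For the causal `vae` sketched above (4x temporal compression, `num_sample_frames_batch_size=16`, hence `num_latent_frames_batch_size=4`), the first chunk covers exactly one latent frame's worth of input, later chunks reuse the causal convolution caches, and a trailing stub is merged into the previous chunk:

```python
# Sample-frame chunks used by tiled_encode for a 21-frame input:
print(vae.build_chunk_start_end(21))                     # expected [[0, 4], [4, 21]]

# Latent-frame chunks used by tiled_decode for 10 latent frames:
print(vae.build_chunk_start_end(10, decoder_mode=True))  # expected [[0, 1], [1, 5], [5, 10]]
```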
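Finally, tiling and slicing trade throughput for memory. A sketch on the same causal `vae`, lowering the minimum tile size from its 256-pixel default so that tiling actually engages on a 256-pixel input:

```python
import torch

vae.enable_tiling(tile_sample_min_height=128, tile_sample_min_width=128)
vae.enable_slicing()  # process the batch one sample at a time

big_video = torch.randn(2, 3, 17, 256, 256)
with torch.no_grad():
    recon = vae(big_video, sample_posterior=False).sample
print(recon.shape)  # expected (2, 3, 17, 256, 256)

vae.disable_tiling()
vae.disable_slicing()
```

With the default overlap factors of 0.0, adjacent tiles abut without blending; raising `tile_overlap_factor_height`/`tile_overlap_factor_width` (e.g. to 1/8, as the constructor comments hint) trades extra compute for smoother seams.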