huggingface
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
Lines changed: 1 addition & 1 deletion
diff --git a/docs/source/en/api/models/autoencoderkl_ltx_video.md b/docs/source/en/api/models/autoencoderkl_ltx_video.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/source/en/api/models/ltx_video_transformer3d.md b/docs/source/en/api/models/ltx_video_transformer3d.md
Lines changed: 1 addition & 1 deletion
diff --git a/src/diffusers/loaders/transformer_flux.py b/src/diffusers/loaders/transformer_flux.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
Lines changed: 38 additions & 37 deletions
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
Lines changed: 1 addition & 1 deletion
@@ -429,7 +429,7 @@
     - local: api/pipelines/ledits_pp
       title: LEDITS++
     - local: api/pipelines/ltx_video
-      title: LTX
+      title: LTXVideo
     - local: api/pipelines/lumina
       title: Lumina-T2X
     - local: api/pipelines/marigold
 
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
 ```python
 from diffusers import AutoencoderKLLTXVideo
 
-vae = AutoencoderKLLTXVideo.from_pretrained("TODO/TODO", subfolder="vae", torch_dtype=torch.float32).to("cuda")
+vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to("cuda")
 ```
 
 ## AutoencoderKLLTXVideo
 
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
 ```python
 from diffusers import LTXVideoTransformer3DModel
 
-transformer = LTXVideoTransformer3DModel.from_pretrained("TODO/TODO", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
+transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
 ```
 
 ## LTXVideoTransformer3DModel
 
@@ -177,3 +177,5 @@ def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False):
 
         self.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
         self.config.encoder_hid_dim_type = "ip_image_proj"
+
+        self.to(dtype=self.dtype, device=self.device)
@@ -29,7 +29,7 @@
 from .vae import DecoderOutput, DiagonalGaussianDistribution
 
 
-class LTXCausalConv3d(nn.Module):
+class LTXVideoCausalConv3d(nn.Module):
     def __init__(
         self,
         in_channels: int,
@@ -80,9 +80,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-class LTXResnetBlock3d(nn.Module):
+class LTXVideoResnetBlock3d(nn.Module):
     r"""
-    A 3D ResNet block used in the LTX model.
+    A 3D ResNet block used in the LTXVideo model.
 
     Args:
         in_channels (`int`):
@@ -120,21 +120,21 @@ def __init__(
         self.nonlinearity = get_activation(non_linearity)
 
         self.norm1 = RMSNorm(in_channels, eps=1e-8, elementwise_affine=elementwise_affine)
-        self.conv1 = LTXCausalConv3d(
+        self.conv1 = LTXVideoCausalConv3d(
             in_channels=in_channels, out_channels=out_channels, kernel_size=3, is_causal=is_causal
         )
 
         self.norm2 = RMSNorm(out_channels, eps=1e-8, elementwise_affine=elementwise_affine)
         self.dropout = nn.Dropout(dropout)
-        self.conv2 = LTXCausalConv3d(
+        self.conv2 = LTXVideoCausalConv3d(
             in_channels=out_channels, out_channels=out_channels, kernel_size=3, is_causal=is_causal
         )
 
         self.norm3 = None
         self.conv_shortcut = None
         if in_channels != out_channels:
             self.norm3 = nn.LayerNorm(in_channels, eps=eps, elementwise_affine=True, bias=True)
-            self.conv_shortcut = LTXCausalConv3d(
+            self.conv_shortcut = LTXVideoCausalConv3d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, is_causal=is_causal
             )
 
@@ -196,7 +196,7 @@ def forward(
         return hidden_states
 
 
-class LTXUpsampler3d(nn.Module):
+class LTXVideoUpsampler3d(nn.Module):
     def __init__(
         self,
         in_channels: int,
@@ -213,7 +213,7 @@ def __init__(
 
         out_channels = (in_channels * stride[0] * stride[1] * stride[2]) // upscale_factor
 
-        self.conv = LTXCausalConv3d(
+        self.conv = LTXVideoCausalConv3d(
             in_channels=in_channels,
             out_channels=out_channels,
             kernel_size=3,
@@ -246,9 +246,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-class LTXDownBlock3D(nn.Module):
+class LTXVideoDownBlock3D(nn.Module):
     r"""
-    Down block used in the LTX model.
+    Down block used in the LTXVideo model.
 
     Args:
         in_channels (`int`):
@@ -290,7 +290,7 @@ def __init__(
         resnets = []
         for _ in range(num_layers):
             resnets.append(
-                LTXResnetBlock3d(
+                LTXVideoResnetBlock3d(
                     in_channels=in_channels,
                     out_channels=in_channels,
                     dropout=dropout,
@@ -305,7 +305,7 @@ def __init__(
         if spatio_temporal_scale:
             self.downsamplers = nn.ModuleList(
                 [
-                    LTXCausalConv3d(
+                    LTXVideoCausalConv3d(
                         in_channels=in_channels,
                         out_channels=in_channels,
                         kernel_size=3,
@@ -317,7 +317,7 @@ def __init__(
 
         self.conv_out = None
         if in_channels != out_channels:
-            self.conv_out = LTXResnetBlock3d(
+            self.conv_out = LTXVideoResnetBlock3d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 dropout=dropout,
@@ -362,9 +362,9 @@ def create_forward(*inputs):
 
 
 # Adapted from diffusers.models.autoencoders.autoencoder_kl_cogvideox.CogVideoMidBlock3d
-class LTXMidBlock3d(nn.Module):
+class LTXVideoMidBlock3d(nn.Module):
     r"""
-    A middle block used in the LTX model.
+    A middle block used in the LTXVideo model.
 
     Args:
         in_channels (`int`):
@@ -403,7 +403,7 @@ def __init__(
         resnets = []
         for _ in range(num_layers):
             resnets.append(
-                LTXResnetBlock3d(
+                LTXVideoResnetBlock3d(
                     in_channels=in_channels,
                     out_channels=in_channels,
                     dropout=dropout,
@@ -454,9 +454,9 @@ def create_forward(*inputs):
         return hidden_states
 
 
-class LTXUpBlock3d(nn.Module):
+class LTXVideoUpBlock3d(nn.Module):
     r"""
-    Up block used in the LTX model.
+    Up block used in the LTXVideo model.
 
     Args:
         in_channels (`int`):
@@ -505,7 +505,7 @@ def __init__(
 
         self.conv_in = None
         if in_channels != out_channels:
-            self.conv_in = LTXResnetBlock3d(
+            self.conv_in = LTXVideoResnetBlock3d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 dropout=dropout,
@@ -520,7 +520,7 @@ def __init__(
         if spatio_temporal_scale:
             self.upsamplers = nn.ModuleList(
                 [
-                    LTXUpsampler3d(
+                    LTXVideoUpsampler3d(
                         out_channels * upscale_factor,
                         stride=(2, 2, 2),
                         is_causal=is_causal,
@@ -533,7 +533,7 @@ def __init__(
         resnets = []
         for _ in range(num_layers):
             resnets.append(
-                LTXResnetBlock3d(
+                LTXVideoResnetBlock3d(
                     in_channels=out_channels,
                     out_channels=out_channels,
                     dropout=dropout,
@@ -589,9 +589,9 @@ def create_forward(*inputs):
         return hidden_states
 
 
-class LTXEncoder3d(nn.Module):
+class LTXVideoEncoder3d(nn.Module):
     r"""
-    The `LTXEncoder3D` layer of a variational autoencoder that encodes input video samples to its latent
+    The `LTXVideoEncoder3d` layer of a variational autoencoder that encodes input video samples to its latent
     representation.
 
     Args:
@@ -635,7 +635,7 @@ def __init__(
 
         output_channel = block_out_channels[0]
 
-        self.conv_in = LTXCausalConv3d(
+        self.conv_in = LTXVideoCausalConv3d(
             in_channels=self.in_channels,
             out_channels=output_channel,
             kernel_size=3,
@@ -650,7 +650,7 @@ def __init__(
             input_channel = output_channel
             output_channel = block_out_channels[i + 1] if i + 1 < num_block_out_channels else block_out_channels[i]
 
-            down_block = LTXDownBlock3D(
+            down_block = LTXVideoDownBlock3D(
                 in_channels=input_channel,
                 out_channels=output_channel,
                 num_layers=layers_per_block[i],
@@ -662,7 +662,7 @@ def __init__(
             self.down_blocks.append(down_block)
 
         # mid block
-        self.mid_block = LTXMidBlock3d(
+        self.mid_block = LTXVideoMidBlock3d(
             in_channels=output_channel,
             num_layers=layers_per_block[-1],
             resnet_eps=resnet_norm_eps,
@@ -672,14 +672,14 @@ def __init__(
         # out
         self.norm_out = RMSNorm(out_channels, eps=1e-8, elementwise_affine=False)
         self.conv_act = nn.SiLU()
-        self.conv_out = LTXCausalConv3d(
+        self.conv_out = LTXVideoCausalConv3d(
             in_channels=output_channel, out_channels=out_channels + 1, kernel_size=3, stride=1, is_causal=is_causal
         )
 
         self.gradient_checkpointing = False
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        r"""The forward method of the `LTXEncoder3D` class."""
+        r"""The forward method of the `LTXVideoEncoder3d` class."""
 
         p = self.patch_size
         p_t = self.patch_size_t
@@ -725,9 +725,10 @@ def create_forward(*inputs):
         return hidden_states
 
 
-class LTXDecoder3d(nn.Module):
+class LTXVideoDecoder3d(nn.Module):
     r"""
-    The `LTXDecoder3d` layer of a variational autoencoder that decodes its latent representation into an output sample.
+    The `LTXVideoDecoder3d` layer of a variational autoencoder that decodes its latent representation into an output
+    sample.
 
     Args:
         in_channels (`int`, defaults to 128):
@@ -782,11 +783,11 @@ def __init__(
         upsample_factor = tuple(reversed(upsample_factor))
         output_channel = block_out_channels[0]
 
-        self.conv_in = LTXCausalConv3d(
+        self.conv_in = LTXVideoCausalConv3d(
             in_channels=in_channels, out_channels=output_channel, kernel_size=3, stride=1, is_causal=is_causal
         )
 
-        self.mid_block = LTXMidBlock3d(
+        self.mid_block = LTXVideoMidBlock3d(
             in_channels=output_channel,
             num_layers=layers_per_block[0],
             resnet_eps=resnet_norm_eps,
@@ -802,7 +803,7 @@ def __init__(
             input_channel = output_channel // upsample_factor[i]
             output_channel = block_out_channels[i] // upsample_factor[i]
 
-            up_block = LTXUpBlock3d(
+            up_block = LTXVideoUpBlock3d(
                 in_channels=input_channel,
                 out_channels=output_channel,
                 num_layers=layers_per_block[i + 1],
@@ -820,7 +821,7 @@ def __init__(
         # out
         self.norm_out = RMSNorm(out_channels, eps=1e-8, elementwise_affine=False)
         self.conv_act = nn.SiLU()
-        self.conv_out = LTXCausalConv3d(
+        self.conv_out = LTXVideoCausalConv3d(
             in_channels=output_channel, out_channels=self.out_channels, kernel_size=3, stride=1, is_causal=is_causal
         )
 
@@ -951,7 +952,7 @@ def __init__(
     ) -> None:
         super().__init__()
 
-        self.encoder = LTXEncoder3d(
+        self.encoder = LTXVideoEncoder3d(
             in_channels=in_channels,
             out_channels=latent_channels,
             block_out_channels=block_out_channels,
@@ -962,7 +963,7 @@ def __init__(
             resnet_norm_eps=resnet_norm_eps,
             is_causal=encoder_causal,
         )
-        self.decoder = LTXDecoder3d(
+        self.decoder = LTXVideoDecoder3d(
             in_channels=latent_channels,
             out_channels=out_channels,
             block_out_channels=decoder_block_out_channels,
@@ -1015,7 +1016,7 @@ def __init__(
         self.tile_sample_stride_width = 448
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (LTXEncoder3d, LTXDecoder3d)):
+        if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)):
             module.gradient_checkpointing = value
 
     def enable_tiling(
 
@@ -748,10 +748,10 @@ def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
                 pos_embedding = self._get_positional_embeddings(
                     height, width, pre_time_compression_frames, device=embeds.device
                 )
-                pos_embedding = pos_embedding.to(dtype=embeds.dtype)
             else:
                 pos_embedding = self.pos_embedding
 
+            pos_embedding = pos_embedding.to(dtype=embeds.dtype)
             embeds = embeds + pos_embedding
 
         return embeds