 # limitations under the License.

 import math
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union

 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
-from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import FeedForward
 from ..attention_processor import Attention
@@ -102,6 +102,7 @@ def __init__(
         patch_size: int = 1,
         patch_size_t: int = 1,
         theta: float = 10000.0,
+        _causal_rope_fix: bool = False,
     ) -> None:
         super().__init__()

@@ -112,13 +113,15 @@ def __init__(
         self.patch_size = patch_size
         self.patch_size_t = patch_size_t
         self.theta = theta
+        self._causal_rope_fix = _causal_rope_fix

     def forward(
         self,
         hidden_states: torch.Tensor,
         num_frames: int,
         height: int,
         width: int,
+        frame_rate: Optional[int] = None,
         rope_interpolation_scale: Optional[Tuple[torch.Tensor, float, float]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         batch_size = hidden_states.size(0)
@@ -132,9 +135,24 @@ def forward(
         grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)

         if rope_interpolation_scale is not None:
-            grid[:, 0:1] = grid[:, 0:1] * rope_interpolation_scale[0] * self.patch_size_t / self.base_num_frames
-            grid[:, 1:2] = grid[:, 1:2] * rope_interpolation_scale[1] * self.patch_size / self.base_height
-            grid[:, 2:3] = grid[:, 2:3] * rope_interpolation_scale[2] * self.patch_size / self.base_width
+            if isinstance(rope_interpolation_scale, tuple):
+                # This will be deprecated in v0.34.0
+                grid[:, 0:1] = grid[:, 0:1] * rope_interpolation_scale[0] * self.patch_size_t / self.base_num_frames
+                grid[:, 1:2] = grid[:, 1:2] * rope_interpolation_scale[1] * self.patch_size / self.base_height
+                grid[:, 2:3] = grid[:, 2:3] * rope_interpolation_scale[2] * self.patch_size / self.base_width
+            else:
+                if not self._causal_rope_fix:
+                    grid[:, 0:1] = (
+                        grid[:, 0:1] * rope_interpolation_scale[0:1] * self.patch_size_t / self.base_num_frames
+                    )
+                else:
+                    grid[:, 0:1] = (
+                        ((grid[:, 0:1] - 1) * rope_interpolation_scale[0:1] + 1 / frame_rate).clamp(min=0)
+                        * self.patch_size_t
+                        / self.base_num_frames
+                    )
+                grid[:, 1:2] = grid[:, 1:2] * rope_interpolation_scale[1:2] * self.patch_size / self.base_height
+                grid[:, 2:3] = grid[:, 2:3] * rope_interpolation_scale[2:3] * self.patch_size / self.base_width

         grid = grid.flatten(2, 4).transpose(1, 2)

@@ -315,6 +333,7 @@ def __init__(
         caption_channels: int = 4096,
         attention_bias: bool = True,
         attention_out_bias: bool = True,
+        _causal_rope_fix: bool = False,
     ) -> None:
         super().__init__()

@@ -336,6 +355,7 @@ def __init__(
             patch_size=patch_size,
             patch_size_t=patch_size_t,
             theta=10000.0,
+            _causal_rope_fix=_causal_rope_fix,
         )

         self.transformer_blocks = nn.ModuleList(
@@ -370,7 +390,8 @@ def forward(
         num_frames: int,
         height: int,
         width: int,
-        rope_interpolation_scale: Optional[Tuple[float, float, float]] = None,
+        frame_rate: int,
+        rope_interpolation_scale: Optional[Union[Tuple[float, float, float], torch.Tensor]] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
     ) -> torch.Tensor:
@@ -389,7 +410,11 @@ def forward(
                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                 )

-        image_rotary_emb = self.rope(hidden_states, num_frames, height, width, rope_interpolation_scale)
+        if not isinstance(rope_interpolation_scale, torch.Tensor):
+            msg = "Passing a tuple for `rope_interpolation_scale` is deprecated and will be removed in v0.34.0."
+            deprecate("rope_interpolation_scale", "0.34.0", msg)
+
+        image_rotary_emb = self.rope(hidden_states, num_frames, height, width, frame_rate, rope_interpolation_scale)

         # convert encoder_attention_mask to a bias the same way we do for attention_mask
         if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
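
The heart of the change is the temporal RoPE coordinate in the tensor code path. Below is a minimal, self-contained sketch (not the library code) of how the old and the causal-fixed temporal positions differ; the constant names mirror the attributes used in the diff, but all values here are illustrative assumptions.

import torch

# Illustrative values only; real values come from the model config and pipeline.
patch_size_t = 1
base_num_frames = 20
frame_rate = 25
scale_t = torch.tensor(8 / 25)  # assumed temporal interpolation scale

t = torch.arange(5, dtype=torch.float32)  # latent frame indices 0, 1, 2, ...

# Pre-fix path: the frame index is scaled directly, so frame 0 always maps to 0.
pos_old = t * scale_t * patch_size_t / base_num_frames

# Causal-fix path: shift the index by one latent frame, add a 1 / frame_rate
# offset, and clamp at zero before normalizing.
pos_new = ((t - 1) * scale_t + 1 / frame_rate).clamp(min=0) * patch_size_t / base_num_frames

print(pos_old.tolist())
print(pos_new.tolist())

Note that the fixed behavior is opt-in: `_causal_rope_fix` defaults to False and only affects the tensor branch, so checkpoints registered without the flag keep their original positional embeddings, while the legacy tuple branch is preserved but scheduled for removal in v0.34.0.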
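
The call-site contract also changes: `frame_rate` is now an explicit argument of the transformer's `forward`, and `rope_interpolation_scale` accepts either the legacy 3-tuple or a tensor, with the non-tensor path emitting a deprecation warning. The standalone sketch below emulates that dispatch using `warnings` in place of diffusers' `deprecate` helper; the function name is hypothetical and only illustrates the structure of the new check.

import warnings
from typing import Optional, Tuple, Union

import torch


def resolve_rope_scale(
    rope_interpolation_scale: Optional[Union[Tuple[float, float, float], torch.Tensor]],
) -> Optional[Union[Tuple[float, float, float], torch.Tensor]]:
    # Mirrors the new check: anything that is not already a tensor
    # (including the legacy tuple) goes through the deprecation branch.
    if not isinstance(rope_interpolation_scale, torch.Tensor):
        warnings.warn(
            "Passing a tuple for `rope_interpolation_scale` is deprecated and will be "
            "removed in v0.34.0.",
            FutureWarning,
        )
    return rope_interpolation_scale


resolve_rope_scale((0.32, 32.0, 32.0))                # warns: legacy tuple path
resolve_rope_scale(torch.tensor([0.32, 32.0, 32.0]))  # no warning: new tensor path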