Commit d76232d

address review comments
1 parent 5196b2a commit d76232d

6 files changed: +30 −69 lines


scripts/convert_ltx_to_diffusers.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ def remove_keys_(key: str, state_dict: Dict[str, Any]):
     # common
     "conv_shortcut": "conv_shortcut.conv",
     "res_blocks": "resnets",
+    "norm3.norm": "norm3",
     "per_channel_statistics.mean-of-means": "latents_mean",
     "per_channel_statistics.std-of-means": "latents_std",
 }
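
The new `"norm3.norm": "norm3"` entry remaps the original checkpoint's wrapped norm weights (`norm3.norm.*`) onto the plain `norm3` module, matching the autoencoder change below. As a rough illustration of how a substring-rename table like this is typically applied during conversion (the helper and dict names here are hypothetical, not the script's actual code):

    from typing import Any, Dict

    # Hypothetical rename table in the style of the one edited above.
    KEYS_RENAME_DICT = {
        "res_blocks": "resnets",
        "norm3.norm": "norm3",
    }

    def rename_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]:
        # Replace every matching substring in each key; tensors are left untouched.
        converted = {}
        for key, value in state_dict.items():
            new_key = key
            for old, new in KEYS_RENAME_DICT.items():
                new_key = new_key.replace(old, new)
            converted[new_key] = value
        return converted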

src/diffusers/models/autoencoders/autoencoder_kl_ltx.py

Lines changed: 11 additions & 11 deletions
@@ -23,7 +23,7 @@
 from ..activations import get_activation
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
-from ..normalization import LayerNormNd, RMSNormNd
+from ..normalization import RMSNorm
 from .vae import DecoderOutput, DiagonalGaussianDistribution


@@ -117,12 +117,12 @@ def __init__(

         self.nonlinearity = get_activation(non_linearity)

-        self.norm1 = RMSNormNd(dim=in_channels, eps=1e-8, elementwise_affine=elementwise_affine, channel_dim=1)
+        self.norm1 = RMSNorm(in_channels, eps=1e-8, elementwise_affine=elementwise_affine)
         self.conv1 = LTXCausalConv3d(
             in_channels=in_channels, out_channels=out_channels, kernel_size=3, is_causal=is_causal
         )

-        self.norm2 = RMSNormNd(dim=out_channels, eps=1e-8, elementwise_affine=elementwise_affine, channel_dim=1)
+        self.norm2 = RMSNorm(out_channels, eps=1e-8, elementwise_affine=elementwise_affine)
         self.dropout = nn.Dropout(dropout)
         self.conv2 = LTXCausalConv3d(
             in_channels=out_channels, out_channels=out_channels, kernel_size=3, is_causal=is_causal

@@ -131,25 +131,25 @@ def __init__(
         self.norm3 = None
         self.conv_shortcut = None
         if in_channels != out_channels:
-            self.norm3 = LayerNormNd(in_channels, eps=eps, elementwise_affine=True, bias=True, channel_dim=1)
+            self.norm3 = nn.LayerNorm(in_channels, eps=eps, elementwise_affine=True, bias=True)
             self.conv_shortcut = LTXCausalConv3d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, is_causal=is_causal
             )

     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
         hidden_states = inputs

-        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.norm1(hidden_states.movedim(1, -1)).movedim(-1, 1)
         hidden_states = self.nonlinearity(hidden_states)
         hidden_states = self.conv1(hidden_states)

-        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.norm2(hidden_states.movedim(1, -1)).movedim(-1, 1)
         hidden_states = self.nonlinearity(hidden_states)
         hidden_states = self.dropout(hidden_states)
         hidden_states = self.conv2(hidden_states)

         if self.norm3 is not None:
-            inputs = self.norm3(inputs)
+            inputs = self.norm3(inputs.movedim(1, -1)).movedim(-1, 1)

         if self.conv_shortcut is not None:
             inputs = self.conv_shortcut(inputs)

@@ -545,7 +545,7 @@ def __init__(
         )

         # out
-        self.norm_out = RMSNormNd(dim=out_channels, eps=1e-8, elementwise_affine=False, channel_dim=1)
+        self.norm_out = RMSNorm(out_channels, eps=1e-8, elementwise_affine=False)
         self.conv_act = nn.SiLU()
         self.conv_out = LTXCausalConv3d(
             in_channels=output_channel, out_channels=out_channels + 1, kernel_size=3, stride=1, is_causal=is_causal

@@ -589,7 +589,7 @@ def create_forward(*inputs):

         hidden_states = self.mid_block(hidden_states)

-        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
         hidden_states = self.conv_act(hidden_states)
         hidden_states = self.conv_out(hidden_states)

@@ -675,7 +675,7 @@ def __init__(
             self.up_blocks.append(up_block)

         # out
-        self.norm_out = RMSNormNd(dim=out_channels, eps=1e-8, elementwise_affine=False, channel_dim=1)
+        self.norm_out = RMSNorm(out_channels, eps=1e-8, elementwise_affine=False)
         self.conv_act = nn.SiLU()
         self.conv_out = LTXCausalConv3d(
             in_channels=output_channel, out_channels=self.out_channels, kernel_size=3, stride=1, is_causal=is_causal

@@ -704,7 +704,7 @@ def create_forward(*inputs):
         for up_block in self.up_blocks:
             hidden_states = up_block(hidden_states)

-        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
         hidden_states = self.conv_act(hidden_states)
         hidden_states = self.conv_out(hidden_states)
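
The wrappers are replaced by plain `RMSNorm` / `nn.LayerNorm` plus explicit `movedim` calls: the VAE activations are channels-first 5D tensors (batch, channels, frames, height, width), while both norm layers normalize over the trailing dimension, so channels are moved last just for the normalization and moved back afterwards. A minimal sketch of that pattern, shown here with `nn.LayerNorm` (the model applies diffusers' `RMSNorm` the same way):

    import torch
    import torch.nn as nn

    # Channels-first video activations: (batch, channels, frames, height, width).
    x = torch.randn(1, 128, 4, 8, 8)

    norm = nn.LayerNorm(128, eps=1e-6)

    # LayerNorm/RMSNorm normalize the last dimension, so move channels there and back.
    out = norm(x.movedim(1, -1)).movedim(-1, 1)

    assert out.shape == x.shape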

src/diffusers/models/normalization.py

Lines changed: 1 addition & 52 deletions
@@ -14,7 +14,7 @@
 # limitations under the License.

 import numbers
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Optional, Tuple

 import torch
 import torch.nn as nn

@@ -567,54 +567,3 @@ def __init__(self, p: int = 2, dim: int = -1, eps: float = 1e-12):

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return F.normalize(hidden_states, p=self.p, dim=self.dim, eps=self.eps)
-
-
-class LayerNormNd(nn.Module):
-    def __init__(
-        self,
-        normalized_shape: Union[int, List[int], Tuple[int], torch.Size],
-        eps: float = 1e-5,
-        elementwise_affine: bool = True,
-        bias: bool = True,
-        device=None,
-        dtype=None,
-        channel_dim: int = -1,
-    ) -> None:
-        super().__init__()
-
-        self.norm = nn.LayerNorm(normalized_shape, eps, elementwise_affine, bias, device, dtype)
-        self.channel_dim = channel_dim
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.channel_dim != -1:
-            hidden_states = hidden_states.movedim(self.channel_dim, -1)
-            hidden_states = self.norm(hidden_states)
-            hidden_states = hidden_states.movedim(-1, self.channel_dim)
-        else:
-            hidden_states = self.norm(hidden_states)
-
-        return hidden_states
-
-
-class RMSNormNd(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        eps: float,
-        elementwise_affine: bool = True,
-        channel_dim: int = -1,
-    ) -> None:
-        super().__init__()
-
-        self.norm = RMSNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
-        self.channel_dim = channel_dim
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.channel_dim != -1:
-            hidden_states = hidden_states.movedim(self.channel_dim, -1)
-            hidden_states = self.norm(hidden_states)
-            hidden_states = hidden_states.movedim(-1, self.channel_dim)
-        else:
-            hidden_states = self.norm(hidden_states)
-
-        return hidden_states

src/diffusers/models/transformers/transformer_ltx.py

Lines changed: 2 additions & 4 deletions
@@ -62,10 +62,8 @@ def __call__(
         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
         attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

-        use_rotary_emb = False
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
-            use_rotary_emb = True

         query = attn.to_q(hidden_states)
         key = attn.to_k(encoder_hidden_states)

@@ -74,7 +72,7 @@ def __call__(
         query = attn.norm_q(query)
         key = attn.norm_k(key)

-        if image_rotary_emb is not None and use_rotary_emb:
+        if image_rotary_emb is not None:
             query = apply_rotary_emb(query, image_rotary_emb)
             key = apply_rotary_emb(key, image_rotary_emb)

@@ -255,7 +253,7 @@ def forward(
         attn_hidden_states = self.attn2(
             hidden_states,
             encoder_hidden_states=encoder_hidden_states,
-            image_rotary_emb=image_rotary_emb,
+            image_rotary_emb=None,
             attention_mask=encoder_attention_mask,
         )
         hidden_states = hidden_states + attn_hidden_states
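
With the `use_rotary_emb` flag removed from the processor, the caller now decides where RoPE applies: the block still passes `image_rotary_emb` to self-attention but passes `image_rotary_emb=None` to cross-attention (`attn2`), so the simpler `if image_rotary_emb is not None` check preserves the previous behavior. A rough sketch of that calling convention (function and argument names are illustrative, not the module's exact code):

    def run_attention_layers(hidden_states, encoder_hidden_states, image_rotary_emb, attn1, attn2):
        # Self-attention over video tokens: rotary embeddings carry their positions.
        hidden_states = hidden_states + attn1(hidden_states, image_rotary_emb=image_rotary_emb)
        # Cross-attention against text tokens: no positional rotation, so pass None.
        hidden_states = hidden_states + attn2(
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            image_rotary_emb=None,
        )
        return hidden_states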

src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py

Lines changed: 0 additions & 2 deletions
@@ -774,7 +774,6 @@ def __call__(
                     timestep, _ = timestep.chunk(2)

                 # compute the previous noisy sample x_t -> x_t-1
-                # ============= TODO(aryan): needs a look by YiYi
                 noise_pred = self._unpack_latents(
                     noise_pred,
                     latent_num_frames,

@@ -800,7 +799,6 @@ def __call__(
                 latents = self._pack_latents(
                     latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
                 )
-                # =============

                 if callback_on_step_end is not None:
                     callback_kwargs = {}

src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py

Lines changed: 15 additions & 0 deletions
@@ -183,6 +183,21 @@ def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
         return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

     def stretch_shift_to_terminal(self, t: torch.Tensor) -> torch.Tensor:
+        r"""
+        Stretches and shifts the timestep schedule to ensure it terminates at the configured `shift_terminal` config
+        value.
+
+        Reference:
+        https://github.com/Lightricks/LTX-Video/blob/a01a171f8fe3d99dce2728d60a73fecf4d4238ae/ltx_video/schedulers/rf.py#L51
+
+        Args:
+            t (`torch.Tensor`):
+                A tensor of timesteps to be stretched and shifted.
+
+        Returns:
+            `torch.Tensor`:
+                A tensor of adjusted timesteps such that the final value equals `self.config.shift_terminal`.
+        """
         one_minus_z = 1 - t
         scale_factor = one_minus_z[-1] / (1 - self.config.shift_terminal)
         stretched_t = 1 - (one_minus_z / scale_factor)