Commit cf3a77e

revert the timestep_scale_multiplier change
1 parent 21502d9 commit cf3a77e

3 files changed: 34 additions, 18 deletions

scripts/convert_ltx_to_diffusers.py

Lines changed: 0 additions & 2 deletions
@@ -105,7 +105,6 @@ def remove_keys_(key: str, state_dict: Dict[str, Any]):
     "per_channel_statistics.mean-of-means": remove_keys_,
     "per_channel_statistics.mean-of-stds": remove_keys_,
     "model.diffusion_model": remove_keys_,
-    "decoder.timestep_scale_multiplier": remove_keys_,
 }


@@ -271,7 +270,6 @@ def get_vae_config(version: str) -> Dict[str, Any]:
         "decoder_causal": False,
         "spatial_compression_ratio": 32,
         "temporal_compression_ratio": 8,
-        "timestep_scale_multiplier": 1000.0,
     }
     VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
     return config
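
For context, the mapping touched in the first hunk drives a prefix-based cleanup pass when converting an original LTX checkpoint: any state-dict key matching one of the listed prefixes is dropped rather than renamed. Below is a minimal sketch of that pattern; only the `remove_keys_` signature comes from the hunk header, while the `SPECIAL_KEYS_REMAP` name and the dispatch loop are assumptions about the script, not verified code.

```py
from typing import Any, Dict


# Handler signature taken from the hunk header above.
def remove_keys_(key: str, state_dict: Dict[str, Any]) -> None:
    state_dict.pop(key, None)


# Assumed shape of the mapping: checkpoint-key prefix -> cleanup handler.
# After this revert, "decoder.timestep_scale_multiplier" is no longer listed,
# since it now corresponds to a real decoder parameter instead of being dropped.
SPECIAL_KEYS_REMAP = {
    "per_channel_statistics.mean-of-means": remove_keys_,
    "per_channel_statistics.mean-of-stds": remove_keys_,
    "model.diffusion_model": remove_keys_,
}


def apply_special_keys(state_dict: Dict[str, Any]) -> None:
    # Iterate over a snapshot of the keys so handlers can pop entries safely.
    for key in list(state_dict.keys()):
        for pattern, handler in SPECIAL_KEYS_REMAP.items():
            if pattern in key:
                handler(key, state_dict)


sd = {"model.diffusion_model.block.0.weight": 0, "decoder.conv_in.weight": 1}
apply_special_keys(sd)
print(sd)  # {'decoder.conv_in.weight': 1}
```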

src/diffusers/models/autoencoders/autoencoder_kl_ltx.py

Lines changed: 3 additions & 5 deletions
@@ -921,14 +921,12 @@ def __init__(
         timestep_conditioning: bool = False,
         upsample_residual: Tuple[bool, ...] = (False, False, False, False),
         upsample_factor: Tuple[bool, ...] = (1, 1, 1, 1),
-        timestep_scale_multiplier: float = 1.0,
     ) -> None:
         super().__init__()

         self.patch_size = patch_size
         self.patch_size_t = patch_size_t
         self.out_channels = out_channels * patch_size**2
-        self.timestep_scale_multiplier = timestep_scale_multiplier

         block_out_channels = tuple(reversed(block_out_channels))
         spatio_temporal_scaling = tuple(reversed(spatio_temporal_scaling))

@@ -983,7 +981,9 @@ def __init__(
         # timestep embedding
         self.time_embedder = None
         self.scale_shift_table = None
+        self.timestep_scale_multiplier = None
         if timestep_conditioning:
+            self.timestep_scale_multiplier = nn.Parameter(torch.tensor(1000.0, dtype=torch.float32))
             self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(output_channel * 2, 0)
             self.scale_shift_table = nn.Parameter(torch.randn(2, output_channel) / output_channel**0.5)

@@ -992,7 +992,7 @@ def __init__(
     def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = self.conv_in(hidden_states)

-        if temb is not None:
+        if self.timestep_scale_multiplier is not None:
             temb = temb * self.timestep_scale_multiplier

         if torch.is_grad_enabled() and self.gradient_checkpointing:

@@ -1107,7 +1107,6 @@ def __init__(
         decoder_causal: bool = False,
         spatial_compression_ratio: int = None,
         temporal_compression_ratio: int = None,
-        timestep_scale_multiplier: float = 1.0,
     ) -> None:
         super().__init__()

@@ -1138,7 +1137,6 @@ def __init__(
             inject_noise=decoder_inject_noise,
             upsample_residual=upsample_residual,
             upsample_factor=upsample_factor,
-            timestep_scale_multiplier=timestep_scale_multiplier,
         )

         latents_mean = torch.zeros((latent_channels,), requires_grad=False)
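
The net effect of these hunks: `timestep_scale_multiplier` is no longer a config float plumbed through the constructors; it is a learnable scalar created only when `timestep_conditioning` is enabled (initialized to 1000.0, so the matching checkpoint tensor can load into it), and the scaling in `forward` is gated on the parameter rather than on `temb`. A minimal, self-contained sketch of that pattern follows; `TinyDecoder` is an illustrative stand-in, not the diffusers module.

```py
from typing import Optional

import torch
import torch.nn as nn


class TinyDecoder(nn.Module):
    # Stand-in for the decoder's timestep handling after the revert.
    def __init__(self, timestep_conditioning: bool = False) -> None:
        super().__init__()
        # Learnable scalar, created only when conditioning is enabled.
        self.timestep_scale_multiplier = None
        if timestep_conditioning:
            self.timestep_scale_multiplier = nn.Parameter(torch.tensor(1000.0, dtype=torch.float32))

    def forward(self, temb: Optional[torch.Tensor] = None) -> Optional[torch.Tensor]:
        # Gate on the parameter, not on `temb`: with conditioning disabled,
        # a provided `temb` passes through unscaled.
        if self.timestep_scale_multiplier is not None:
            temb = temb * self.timestep_scale_multiplier
        return temb


print(TinyDecoder(timestep_conditioning=True)(torch.tensor([0.5])))   # tensor([500.], grad_fn=<MulBackward0>)
print(TinyDecoder(timestep_conditioning=False)(torch.tensor([0.5])))  # tensor([0.5000])
```

One consequence of gating on the parameter instead of on `temb`: when `timestep_conditioning` is enabled, the decoder assumes `temb` is actually provided, which matches how the conditioned path was already used.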

src/diffusers/pipelines/ltx/pipeline_ltx_condition.py

Lines changed: 31 additions & 11 deletions
@@ -46,26 +46,46 @@
     Examples:
         ```py
         >>> import torch
-        >>> from diffusers import LTXConditionPipeline
-        >>> from diffusers.utils import export_to_video, load_image
-
-        >>> pipe = LTXConditionPipeline.from_pretrained("YiYiXu/ltx-95", torch_dtype=torch.bfloat16)
+        >>> from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXConditionPipeline, LTXVideoCondition
+        >>> from diffusers.utils import export_to_video, load_video, load_image
+        >>>
+        >>> pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.1", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
+        >>>
+        >>> # Load input image and video
+        >>> video = load_video(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
+        ... )
         >>> image = load_image(
-        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
+        ... )
+        >>>
+        >>> # Create conditioning objects
+        >>> condition1 = LTXVideoCondition(
+        ...     image=image,
+        ...     frame_index=0,
         ... )
-        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
+        >>> condition2 = LTXVideoCondition(
+        ...     video=video,
+        ...     frame_index=80,
+        ... )
+        >>>
+        >>> prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day. And then the camera switch to a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
         >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-
+        >>>
+        >>> # Generate video
+        >>> generator = torch.Generator("cuda").manual_seed(0)
         >>> video = pipe(
-        ...     image=image,
+        ...     conditions=[condition1, condition2],
         ...     prompt=prompt,
         ...     negative_prompt=negative_prompt,
-        ...     width=704,
-        ...     height=480,
+        ...     width=768,
+        ...     height=512,
         ...     num_frames=161,
-        ...     num_inference_steps=50,
+        ...     num_inference_steps=40,
+        ...     generator=generator,
         ... ).frames[0]
+        >>>
         >>> export_to_video(video, "output.mp4", fps=24)
         ```
 """
