Commit 8fc5cfc

single file related changes
1 parent 734fb71 commit 8fc5cfc

File tree

4 files changed (+61, -10 lines)

- docs/source/en/api/pipelines/ltx_video.md
- src/diffusers/loaders/single_file_utils.py
- src/diffusers/pipelines/ltx/pipeline_ltx.py
- src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py

docs/source/en/api/pipelines/ltx_video.md

Lines changed: 28 additions & 1 deletion

````diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License. -->
 
-# LTX
+# LTX Video
 
 [LTX Video](https://huggingface.co/Lightricks/LTX-Video) is the first DiT-based video generation model capable of generating high-quality videos in real-time. It produces 24 FPS videos at a 768x512 resolution faster than they can be watched. Trained on a large-scale dataset of diverse videos, the model generates high-resolution videos with realistic and varied content. We provide a model for both text-to-video as well as image + text-to-video usecases.
 
@@ -30,6 +30,7 @@ Loading the original LTX Video checkpoints is also possible with [`~ModelMixin.f
 import torch
 from diffusers import AutoencoderKLLTXVideo, LTXImageToVideoPipeline, LTXVideoTransformer3DModel
 
+# `single_file_url` could also be https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.1.safetensors
 single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
 transformer = LTXVideoTransformer3DModel.from_single_file(
     single_file_url, torch_dtype=torch.bfloat16
@@ -99,6 +100,32 @@ export_to_video(video, "output_gguf_ltx.mp4", fps=24)
 
 Make sure to read the [documentation on GGUF](../../quantization/gguf) to learn more about our GGUF support.
 
+<!-- TODO(aryan): Update this when official weights are supported -->
+
+Loading and running inference with [LTX Video 0.9.1](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) weights.
+
+```python
+import torch
+from diffusers import LTXPipeline
+from diffusers.utils import export_to_video
+
+pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.1-diffusers", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+
+video = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=768,
+    height=512,
+    num_frames=161,
+    num_inference_steps=50,
+).frames[0]
+export_to_video(video, "output.mp4", fps=24)
+```
+
 Refer to [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox#memory-optimization) to learn more about optimizing memory consumption.
 
 ## LTXPipeline
````
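The single-file snippet in the hunk above is cut off at the hunk boundary. As a rough sketch of how the remaining pieces typically fit together (the VAE load and the pipeline assembly below are assumptions based on the imports shown in the diff, not lines from this commit):

```python
import torch
from diffusers import AutoencoderKLLTXVideo, LTXImageToVideoPipeline, LTXVideoTransformer3DModel

# Per the comment added in the diff, this could also be the v0.9.1 checkpoint URL.
single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
transformer = LTXVideoTransformer3DModel.from_single_file(single_file_url, torch_dtype=torch.bfloat16)

# Assumption: the VAE can be loaded from the same single-file checkpoint, while the
# remaining components (text encoder, tokenizer, scheduler) come from the hosted repo.
vae = AutoencoderKLLTXVideo.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
pipe = LTXImageToVideoPipeline.from_pretrained(
    "Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16
)
```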

src/diffusers/loaders/single_file_utils.py

Lines changed: 25 additions & 1 deletion

```diff
@@ -157,6 +157,7 @@
     "flux-depth": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Depth-dev"},
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
     "ltx-video": {"pretrained_model_name_or_path": "Lightricks/LTX-Video"},
+    "ltx-video-0.9.1": {"pretrained_model_name_or_path": "a-r-r-o-w/LTX-Video-0.9.1-diffusers"},
     "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
     "autoencoder-dc-f64c128": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"},
     "autoencoder-dc-f32c32": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers"},
@@ -603,7 +604,10 @@ def infer_diffusers_model_type(checkpoint):
         model_type = "flux-schnell"
 
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["ltx-video"]):
-        model_type = "ltx-video"
+        if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
+            model_type = "ltx-video-0.9.1"
+        else:
+            model_type = "ltx-video"
 
     elif CHECKPOINT_KEY_NAMES["autoencoder-dc"] in checkpoint:
         encoder_key = "encoder.project_in.conv.conv.bias"
```
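In effect, version detection keys off a decoder time-embedding weight that only the 0.9.1 VAE carries. A minimal standalone sketch of the idea (the key name is taken from the diff; the toy checkpoints are made up for illustration):

```python
# 0.9.1 checkpoints are distinguished by a VAE decoder key that older
# LTX Video checkpoints lack.
V091_KEY = "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight"

def infer_ltx_variant(checkpoint: dict) -> str:
    return "ltx-video-0.9.1" if V091_KEY in checkpoint else "ltx-video"

print(infer_ltx_variant({V091_KEY: "weight"}))                      # ltx-video-0.9.1
print(infer_ltx_variant({"vae.decoder.conv_in.weight": "weight"}))  # ltx-video
```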
```diff
@@ -2333,12 +2337,32 @@ def remove_keys_(key: str, state_dict):
         "per_channel_statistics.std-of-means": "latents_std",
     }
 
+    VAE_091_RENAME_DICT = {
+        # decoder
+        "up_blocks.0": "mid_block",
+        "up_blocks.1": "up_blocks.0.upsamplers.0",
+        "up_blocks.2": "up_blocks.0",
+        "up_blocks.3": "up_blocks.1.upsamplers.0",
+        "up_blocks.4": "up_blocks.1",
+        "up_blocks.5": "up_blocks.2.upsamplers.0",
+        "up_blocks.6": "up_blocks.2",
+        "up_blocks.7": "up_blocks.3.upsamplers.0",
+        "up_blocks.8": "up_blocks.3",
+        # common
+        "last_time_embedder": "time_embedder",
+        "last_scale_shift_table": "scale_shift_table",
+    }
+
     VAE_SPECIAL_KEYS_REMAP = {
         "per_channel_statistics.channel": remove_keys_,
         "per_channel_statistics.mean-of-means": remove_keys_,
         "per_channel_statistics.mean-of-stds": remove_keys_,
+        "timestep_scale_multiplier": remove_keys_,
     }
 
+    if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
+        VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
+
     for key in list(converted_state_dict.keys()):
         new_key = key
         for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
```
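To make the remapping concrete, here is a small standalone sketch of the substring-rename pass performed by the loop at the end of the hunk, using a toy state dict (the mappings are a subset of `VAE_091_RENAME_DICT` above; the keys are invented for illustration):

```python
# Subset of the rename dict from the diff, applied as ordered substring replacements.
RENAME = {
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0.upsamplers.0",
    "last_time_embedder": "time_embedder",
}

# Toy 0.9.1-style VAE keys (invented for illustration).
state_dict = {
    "decoder.up_blocks.0.resnets.0.conv1.weight": "...",
    "decoder.last_time_embedder.timestep_embedder.linear_1.weight": "...",
}

for key in list(state_dict.keys()):
    new_key = key
    for replace_key, rename_key in RENAME.items():
        new_key = new_key.replace(replace_key, rename_key)
    state_dict[new_key] = state_dict.pop(key)

print(sorted(state_dict))
# ['decoder.mid_block.resnets.0.conv1.weight',
#  'decoder.time_embedder.timestep_embedder.linear_1.weight']
```

Note that ordering matters here: "up_blocks.0" is renamed to "mid_block" before the "up_blocks.1" rule produces a new key containing "up_blocks.0", which is why the decoder entries are listed in this sequence.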

src/diffusers/pipelines/ltx/pipeline_ltx.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -511,8 +511,8 @@ def __call__(
         prompt_attention_mask: Optional[torch.Tensor] = None,
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        decode_timestep: Union[float, List[float]] = 0.05,
-        decode_noise_scale: Union[float, List[float]] = 0.025,
+        decode_timestep: Union[float, List[float]] = 0.0,
+        decode_noise_scale: Optional[Union[float, List[float]]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -565,9 +565,9 @@ def __call__(
                 provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
             negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                 Pre-generated attention mask for negative text embeddings.
-            decode_timestep (`float`, defaults to `0.05`):
+            decode_timestep (`float`, defaults to `0.0`):
                 The timestep at which generated video is decoded.
-            decode_noise_scale (`float`, defaults to `0.025`):
+            decode_noise_scale (`float`, defaults to `None`):
                 The interpolation factor between random noise and denoised latents at the decode timestep.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
```
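For intuition about these two arguments: before the VAE decode, the pipeline can blend a small amount of fresh noise back into the denoised latents, with `decode_timestep` conditioning the decoder on that noise level. A rough sketch of the interpolation described in the docstring (not the pipeline's exact code; the `None` fallback behavior is an assumption):

```python
import torch

def noise_latents_for_decode(latents, decode_timestep=0.0, decode_noise_scale=None, generator=None):
    # Assumption: with no explicit scale, fall back to the decode timestep itself.
    if decode_noise_scale is None:
        decode_noise_scale = decode_timestep
    noise = torch.randn(latents.shape, generator=generator, dtype=latents.dtype)
    # Linear interpolation between denoised latents and fresh noise.
    return (1 - decode_noise_scale) * latents + decode_noise_scale * noise

# With the new defaults (0.0 / None) this is a no-op; the old defaults re-noise slightly.
latents = torch.randn(1, 128, 8, 16, 16)  # toy latent shape
decode_input = noise_latents_for_decode(latents, decode_timestep=0.05, decode_noise_scale=0.025)
```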

src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -571,8 +571,8 @@ def __call__(
         prompt_attention_mask: Optional[torch.Tensor] = None,
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        decode_timestep: Union[float, List[float]] = 0.05,
-        decode_noise_scale: Union[float, List[float]] = 0.025,
+        decode_timestep: Union[float, List[float]] = 0.0,
+        decode_noise_scale: Optional[Union[float, List[float]]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -627,9 +627,9 @@ def __call__(
                 provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
             negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                 Pre-generated attention mask for negative text embeddings.
-            decode_timestep (`float`, defaults to `0.05`):
+            decode_timestep (`float`, defaults to `0.0`):
                 The timestep at which generated video is decoded.
-            decode_noise_scale (`float`, defaults to `0.025`):
+            decode_noise_scale (`float`, defaults to `None`):
                 The interpolation factor between random noise and denoised latents at the decode timestep.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
```
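Because both pipelines changed their defaults, callers who relied on the previous decode behavior can pass the old values explicitly. A hedged usage sketch, reusing the 0.9.1 pipeline from the docs example above (the prompt is a placeholder):

```python
import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video

pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.1-diffusers", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# The explicit values below are the old defaults that this commit removes.
video = pipe(
    prompt="A calm ocean at sunset",  # placeholder prompt
    decode_timestep=0.05,
    decode_noise_scale=0.025,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```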
