
Conversation

@a-r-r-o-w (Contributor) commented Mar 5, 2025

See https://huggingface.slack.com/archives/C08275HSG8J/p1741091747532049?thread_ts=1738246363.413529&cid=C08275HSG8J

Testing scripts

Test 1: image condition
# test ltx image conditioning
import torch
from diffusers import LTXConditionPipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_video, load_image


device = "cuda:2"
dtype = torch.bfloat16
repo = "YiYiXu/ltx-95"

# Initialize the pipeline
pipe = LTXConditionPipeline.from_pretrained(repo, torch_dtype=dtype)
pipe.to(device)


prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
)

# use `conditions` input
condition = LTXVideoCondition(
    image=image,
)

generator = torch.Generator(device=device).manual_seed(0)
video = pipe(
    conditions=[condition],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
    generator=generator,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_1_output_1.mp4", fps=24)


# pass the `image` input directly
generator = torch.Generator(device=device).manual_seed(0)
video = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
    generator=generator,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_1_output_2.mp4", fps=24)

Test 2: video condition
# ltx video conditioning
import torch
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXConditionPipeline, LTXVideoCondition
from diffusers.utils import export_to_video, load_video


device = "cuda:2"
dtype = torch.bfloat16
repo = "YiYiXu/ltx-95"

# Initialize the pipeline
pipe = LTXConditionPipeline.from_pretrained(repo, torch_dtype=dtype)
pipe.to(device)


video = load_video(
    "/raid/yiyi/LTX-Video/outputs/2025-03-11/video_output_0_a-woman-with-long-brown-hair-and_42_512x768x40_0.mp4"
)

condition = LTXVideoCondition(
    video=video,
    frame_index=0
)

# Define prompts
prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

# Generate the video (store the result separately so the loaded conditioning video is not overwritten)
output = pipe(
    conditions=[condition],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
).frames[0]

# Export the video
export_to_video(output, "yiyi_test_2_output_1.mp4", fps=24)



# test 2: frame index 8
condition = LTXVideoCondition(
    video=video,
    frame_index=8
)

# Generate the video
output = pipe(
    conditions=[condition],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
).frames[0]

# Export the video
export_to_video(output, "yiyi_test_2_output_2.mp4", fps=24)


# test3: pass inputs directly

# Generate the video
output = pipe(
    video=video,
    frame_index=8,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
).frames[0]

# Export the video
export_to_video(output, "yiyi_test_2_output_3.mp4", fps=24)

Test 3: video + image
# image + video
import torch
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXConditionPipeline, LTXVideoCondition
from diffusers.utils import export_to_video, load_video, load_image


device = "cuda:2"
dtype = torch.bfloat16
repo = "YiYiXu/ltx-95"

# Initialize the pipeline
pipe = LTXConditionPipeline.from_pretrained(repo, torch_dtype=dtype)
pipe.to(device)


video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
)

condition1 = LTXVideoCondition(
    image=image,
    frame_index=0,
)

condition2 = LTXVideoCondition(
    video=video,
    frame_index=80,
)

# Define prompts
prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day. And then the camera switch to a inding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt='worst quality, inconsistent motion, blurry, jittery, distorted'
# Generate the video
generator = torch.Generator(device=device).manual_seed(0)
video = pipe(
    conditions=[condition1, condition2],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
    generator=generator,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_4_output.mp4", fps=24)

@a-r-r-o-w a-r-r-o-w requested a review from yiyixuxu March 5, 2025 00:37
@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

@a-r-r-o-w a-r-r-o-w changed the title from Fix documentation to LTX 0.9.5 on Mar 5, 2025
@a-r-r-o-w (Contributor Author) commented Mar 5, 2025

Code for matching VAE:

import sys
sys.path.append("/raid/aryan/ltx-code")

import json
from typing import Any, Dict

import torch
from safetensors.torch import load_file
from safetensors import safe_open

from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder


def remove_keys_(key: str, state_dict: Dict[str, Any]):
    state_dict.pop(key)

VAE_KEYS_RENAME_DICT = {
    # decoder
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0",
    "up_blocks.2": "up_blocks.1.upsamplers.0",
    "up_blocks.3": "up_blocks.1",
    "up_blocks.4": "up_blocks.2.conv_in",
    "up_blocks.5": "up_blocks.2.upsamplers.0",
    "up_blocks.6": "up_blocks.2",
    "up_blocks.7": "up_blocks.3.conv_in",
    "up_blocks.8": "up_blocks.3.upsamplers.0",
    "up_blocks.9": "up_blocks.3",
    # encoder
    "down_blocks.0": "down_blocks.0",
    "down_blocks.1": "down_blocks.0.downsamplers.0",
    "down_blocks.2": "down_blocks.0.conv_out",
    "down_blocks.3": "down_blocks.1",
    "down_blocks.4": "down_blocks.1.downsamplers.0",
    "down_blocks.5": "down_blocks.1.conv_out",
    "down_blocks.6": "down_blocks.2",
    "down_blocks.7": "down_blocks.2.downsamplers.0",
    "down_blocks.8": "down_blocks.3",
    "down_blocks.9": "mid_block",
    # common
    "conv_shortcut": "conv_shortcut.conv",
    "res_blocks": "resnets",
    "norm3.norm": "norm3",
    "per_channel_statistics.mean-of-means": "latents_mean",
    "per_channel_statistics.std-of-means": "latents_std",
}

VAE_091_RENAME_DICT = {
    # decoder
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0.upsamplers.0",
    "up_blocks.2": "up_blocks.0",
    "up_blocks.3": "up_blocks.1.upsamplers.0",
    "up_blocks.4": "up_blocks.1",
    "up_blocks.5": "up_blocks.2.upsamplers.0",
    "up_blocks.6": "up_blocks.2",
    "up_blocks.7": "up_blocks.3.upsamplers.0",
    "up_blocks.8": "up_blocks.3",
    # common
    "last_time_embedder": "time_embedder",
    "last_scale_shift_table": "scale_shift_table",
}

VAE_095_RENAME_DICT = {
    # decoder
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0.upsamplers.0",
    "up_blocks.2": "up_blocks.0",
    "up_blocks.3": "up_blocks.1.upsamplers.0",
    "up_blocks.4": "up_blocks.1",
    "up_blocks.5": "up_blocks.2.upsamplers.0",
    "up_blocks.6": "up_blocks.2",
    "up_blocks.7": "up_blocks.3.upsamplers.0",
    "up_blocks.8": "up_blocks.3",
    # encoder
    "down_blocks.0": "down_blocks.0",
    "down_blocks.1": "down_blocks.0.downsamplers.0",
    "down_blocks.2": "down_blocks.1",
    "down_blocks.3": "down_blocks.1.downsamplers.0",
    "down_blocks.4": "down_blocks.2",
    "down_blocks.5": "down_blocks.2.downsamplers.0",
    "down_blocks.6": "down_blocks.3",
    "down_blocks.7": "down_blocks.3.downsamplers.0",
    "down_blocks.8": "mid_block",
    # common
    "last_time_embedder": "time_embedder",
    "last_scale_shift_table": "scale_shift_table",
}

VAE_SPECIAL_KEYS_REMAP = {
    "per_channel_statistics.channel": remove_keys_,
    "per_channel_statistics.mean-of-means": remove_keys_,
    "per_channel_statistics.mean-of-stds": remove_keys_,
    "model.diffusion_model": remove_keys_,
}

VAE_091_SPECIAL_KEYS_REMAP = {
    "timestep_scale_multiplier": remove_keys_,
}

VAE_095_SPECIAL_KEYS_REMAP = {}


def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
    state_dict[new_key] = state_dict.pop(old_key)


def convert_vae(original_state_dict):
    PREFIX_KEY = "vae."

    for key in list(original_state_dict.keys()):
        new_key = key[:]
        if new_key.startswith(PREFIX_KEY):
            new_key = key[len(PREFIX_KEY) :]
        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)
        update_state_dict_inplace(original_state_dict, key, new_key)

    for key in list(original_state_dict.keys()):
        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
            if special_key not in key:
                continue
            handler_fn_inplace(key, original_state_dict)

    return original_state_dict


@torch.no_grad()
def match_vae():
    from diffusers import AutoencoderKLLTXVideo

    original_model_path = "/raid/aryan/ltx-new/ltx-video-2b-v0.9.5rc1.safetensors"
    theirs_config = json.loads(safe_open(original_model_path, "pt").metadata()["config"])
    theirs_model = CausalVideoAutoencoder.from_config(theirs_config["vae"])
    theirs_state_dict = load_file(original_model_path)
    theirs_model.load_state_dict(theirs_state_dict)

    ours_config = {
        "in_channels": 3,
        "out_channels": 3,
        "latent_channels": 128,
        "block_out_channels": (128, 256, 512, 1024, 2048),
        "down_block_types": (
            "LTXVideo095DownBlock3D",
            "LTXVideo095DownBlock3D",
            "LTXVideo095DownBlock3D",
            "LTXVideo095DownBlock3D",
        ),
        "decoder_block_out_channels": (256, 512, 1024),
        "layers_per_block": (4, 6, 6, 2, 2),
        "decoder_layers_per_block": (5, 5, 5, 5),
        "spatio_temporal_scaling": (True, True, True, True),
        "decoder_spatio_temporal_scaling": (True, True, True),
        "decoder_inject_noise": (False, False, False, False),
        "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
        "upsample_residual": (True, True, True),
        "upsample_factor": (2, 2, 2),
        "timestep_conditioning": True,
        "patch_size": 4,
        "patch_size_t": 1,
        "resnet_norm_eps": 1e-6,
        "scaling_factor": 1.0,
        "encoder_causal": True,
        "decoder_causal": False,
    }
    ours_model = AutoencoderKLLTXVideo.from_config(ours_config)

    VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
    VAE_SPECIAL_KEYS_REMAP.update(VAE_095_SPECIAL_KEYS_REMAP)
    ours_state_dict = convert_vae(theirs_state_dict)
    ours_model.load_state_dict(ours_state_dict)

    state_dict_params = sum(p.numel() for p in ours_state_dict.values())
    print(f"State dict params: {state_dict_params}")

    device = torch.device("cuda")
    dtype = torch.float32

    theirs_model.to(device=device, dtype=dtype)
    ours_model.to(device=device, dtype=dtype)

    theirs_model.disable_z_tiling()
    theirs_model.disable_hw_tiling()

    print(sum(p.numel() for p in theirs_model.parameters()))
    print(sum(p.numel() for p in ours_model.parameters()))

    batch_size = 1
    num_channels = 3
    num_frames = 49
    height = 128
    width = 128

    torch.manual_seed(0)
    input = torch.randn(batch_size, num_channels, num_frames, height, width, device=device, dtype=dtype)
    decode_timestep = 0.025

    print("theirs_encoding")
    theirs_encoder_output = theirs_model.encode(input).latent_dist.mode()
    print("theirs_decoding")
    theirs_decoder_output = theirs_model.decode(theirs_encoder_output, timestep=decode_timestep, target_shape=(batch_size, num_channels, num_frames, height, width)).sample
    print("theirs:", theirs_encoder_output.shape, theirs_decoder_output.shape)

    print("ours_encoding")
    ours_encoder_output = ours_model.encode(input).latent_dist.mode()
    print("ours_decoding")
    ours_decoder_output = ours_model.decode(ours_encoder_output, temb=decode_timestep).sample
    print("ours:", ours_encoder_output.shape, ours_decoder_output.shape)

    diff_encoder = theirs_encoder_output - ours_encoder_output
    diff_decoder = theirs_decoder_output - ours_decoder_output

    absmax_encoder, absmean_encoder = torch.max(diff_encoder.abs()), torch.mean(diff_encoder.abs())
    absmax_decoder, absmean_decoder = torch.max(diff_decoder.abs()), torch.mean(diff_decoder.abs())

    print(f"Encoder: absmax={absmax_encoder}, absmean={absmean_encoder}")
    print(f"Decoder: absmax={absmax_decoder}, absmean={absmean_decoder}")


match_vae()

@hlky hlky linked an issue Mar 13, 2025 that may be closed by this pull request
yiyixuxu and others added 3 commits March 13, 2025 23:24
* up



* Update src/diffusers/pipelines/ltx/pipeline_ltx_condition.py

Co-authored-by: hlky <[email protected]>

* up

* make it work

* up

* update conversion script

* up

* up

* up

* up

* up more

* up

* Apply suggestions from code review

Co-authored-by: Aryan <[email protected]>

* add docs tests + more refactor

* up

---------

Co-authored-by: hlky <[email protected]>
Co-authored-by: Aryan <[email protected]>
def __call__(
self,
conditions: Union[LTXVideoCondition, List[LTXVideoCondition]] = None,
image: Union[PipelineImageInput, List[PipelineImageInput]] = None,
Collaborator

I like the LTXVideoCondition class!
But let's still support the image/video inputs so that users can do simple image-to-video / video-to-video with the same API they use for the other image2video/video2video pipelines.

Contributor Author

Sounds good!
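
For reference, a minimal sketch of the two equivalent call styles discussed here, mirroring test 1 from the PR description (pipe, prompt, negative_prompt, and image are assumed to be set up as in that script):

from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition

# style 1: wrap the image in an LTXVideoCondition and pass it through `conditions`
video = pipe(conditions=[LTXVideoCondition(image=image)], prompt=prompt, negative_prompt=negative_prompt).frames[0]

# style 2: pass the image directly, matching the plain image-to-video API
video = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt).frames[0]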

@a-r-r-o-w a-r-r-o-w left a comment (Contributor Author)

Thanks @yiyixuxu! Just had a question, but otherwise looks good to merge


@yiyixuxu yiyixuxu merged commit 2e83cbb into main Mar 18, 2025
14 of 15 checks passed
@yiyixuxu yiyixuxu deleted the integrations/ltx-0.9.5 branch March 18, 2025 02:43
@DN6 DN6 added the roadmap Add to current release roadmap label Mar 20, 2025
@DN6 DN6 moved this from In Progress to Done in Diffusers Roadmap 0.36 Mar 20, 2025
sayakpaul added a commit that referenced this pull request Mar 20, 2025
@SHYuanBest (Contributor) commented:

Hi, it seems that LTXPipeline has a bug when loading LTX-0.9.5 (moving it to a device)?

import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video

base_model_path = "YiYiXu/ltx-95"

pipe = LTXPipeline.from_pretrained(base_model_path, torch_dtype=torch.bfloat16)
pipe.to("cuda")
The config attributes {'timestep_scale_multiplier': 1000.0} were passed to AutoencoderKLLTXVideo, but are not expected and will be ignored. Please verify your config.json configuration file.
Some weights of AutoencoderKLLTXVideo were not initialized from the model checkpoint atLTX-Video-0.9.5-diffusers/vae and are newly initialized: ['decoder.timestep_scale_multiplier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading pipeline components...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.07it/s]
Traceback (most recent call last):
  File "inference.py", line 8, in <module>
    pipe.to("cuda")
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/diffusers/pipelines/pipeline_utils.py", line 482, in to
    module.to(device, dtype)
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/diffusers/models/modeling_utils.py", line 1351, in to
    return super().to(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1340, in to
    return self._apply(convert)
           ^^^^^^^^^^^^^^^^^^^^
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 900, in _apply
    module._apply(fn)
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 927, in _apply
    param_applied = fn(param)
                    ^^^^^^^^^
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1333, in convert
    raise NotImplementedError(
NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

@a-r-r-o-w (Contributor Author)

Hi, please use the official Lightricks repository: https://huggingface.co/Lightricks/LTX-Video-0.9.5
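
For reference, a minimal sketch of the same loading snippet pointed at the official repository linked above (the repo id below is taken from that link; everything else mirrors the snippet in the report):

import torch
from diffusers import LTXPipeline

# assumed repo id for the officially converted diffusers checkpoint, taken from the link above
pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16)
pipe.to("cuda")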

sayakpaul added a commit that referenced this pull request May 9, 2025
feat: pipeline-level quant config