1818from typing import Dict , List , Optional , Tuple
1919
2020import cv2
21+ from diffusers import AutoencoderKLWan
2122import numpy as np
2223import torch
23- import webdataset as wds
24-
25- from diffusers import AutoencoderKLWan
2624from transformers import AutoTokenizer , UMT5EncoderModel
25+ import webdataset as wds
2726
2827
2928def _map_interpolation (resize_mode : str ) -> int :
@@ -412,7 +411,7 @@ def main():
412411 for index , meta in enumerate (metadata_list ):
413412 video_name = meta ["file_name" ]
414413 start_frame = int (meta ["start_frame" ]) # inclusive
415- end_frame = int (meta ["end_frame" ]) # inclusive
414+ end_frame = int (meta ["end_frame" ]) # inclusive
416415 caption_text = meta .get ("vila_caption" , "" )
417416
418417 video_path = str (video_folder / video_name )
@@ -431,7 +430,9 @@ def main():
431430
432431 # Encode text and video with HF models exactly like automodel
433432 text_embed = _encode_text (tokenizer , text_encoder , args .device , caption_text )
434- latents = _encode_video_latents (vae , args .device , video_tensor , deterministic_latents = not args .stochastic )
433+ latents = _encode_video_latents (
434+ vae , args .device , video_tensor , deterministic_latents = not args .stochastic
435+ )
435436
436437 # Move to CPU without changing dtype; keep exact values to match automodel outputs
437438 text_embed_cpu = text_embed .detach ().to (device = "cpu" )
0 commit comments