 You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.

 > [!TIP]
-> Click on the Wan2.1 models in the right sidebar for more examples of other video generation tasks.
+> Click on the Wan2.1 models in the right sidebar for more examples of video generation.

 The example below demonstrates how to generate a video from text optimized for memory or inference speed.

@@ -38,17 +38,16 @@ The Wan2.1 text-to-video model below requires ~13GB of VRAM.

 ```py
 # pip install ftfy
-
 import torch
 import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanPipeline
+from diffusers import AutoModel, WanPipeline
 from diffusers.hooks.group_offloading import apply_group_offloading
 from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
+from transformers import UMT5EncoderModel

 text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
-transformer = WanTransformer3DModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
+transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

 # group-offloading
 onload_device = torch.device("cuda")
@@ -67,7 +66,7 @@ transformer.enable_group_offload(
 )

 pipeline = WanPipeline.from_pretrained(
-    model_id,
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
     vae=vae,
     transformer=transformer,
     text_encoder=text_encoder,
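The diff elides most of the offloading setup between the hunks above. For reference, a minimal sketch of how group offloading is typically wired up in diffusers, continuing from the loading code in this example; the `offload_type`, `num_blocks_per_group`, and `use_stream` choices here are illustrative assumptions, not values taken from this commit:

```py
# Continues the example above: keep weights on the CPU and onload
# them to the GPU in groups only while they are needed.
onload_device = torch.device("cuda")
offload_device = torch.device("cpu")

# Helper-function form, e.g. for the text encoder.
apply_group_offloading(
    text_encoder,
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="block_level",
    num_blocks_per_group=4,
)

# Method form available on diffusers models such as the transformer.
transformer.enable_group_offload(
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="leaf_level",
    use_stream=True,
)
```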
@@ -104,20 +103,19 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast

 ```py
 # pip install ftfy
-
 import torch
 import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanPipeline
+from diffusers import AutoModel, WanPipeline
 from diffusers.hooks.group_offloading import apply_group_offloading
 from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
+from transformers import UMT5EncoderModel

 text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
-transformer = WanTransformer3DModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
+transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

 pipeline = WanPipeline.from_pretrained(
-    model_id,
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
     vae=vae,
     transformer=transformer,
     text_encoder=text_encoder,
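The compilation step the hunk header refers to ("Compilation is slow the first time but subsequent calls to the pipeline are fast") falls outside the lines shown. A minimal sketch of the usual `torch.compile` pattern; the `mode` and `fullgraph` settings are illustrative assumptions rather than values from this commit:

```py
# Compile the transformer once; the first pipeline call pays the
# compilation cost, later calls reuse the compiled graph.
pipeline.transformer = torch.compile(
    pipeline.transformer, mode="max-autotune", fullgraph=True
)
```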
@@ -162,20 +160,19 @@ export_to_video(output, "output.mp4", fps=16)

 ```py
 # pip install ftfy
-
 import torch
-from diffusers import WanPipeline
+from diffusers import AutoModel, WanPipeline
 from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
 from diffusers.utils import export_to_video

-vae = AutoencoderKLWan.from_pretrained(
+vae = AutoModel.from_pretrained(
     "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
 )
 pipeline = WanPipeline.from_pretrained(
     "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
 )
 pipeline.scheduler = UniPCMultistepScheduler.from_config(
     pipeline.scheduler.config, flow_shift=5.0
 )
 pipeline.to("cuda")

@@ -194,9 +191,9 @@ export_to_video(output, "output.mp4", fps=16)
 """

 output = pipeline(
     prompt=prompt,
     num_frames=81,
     guidance_scale=5.0,
 ).frames[0]
 export_to_video(output, "output.mp4", fps=16)
 ```
@@ -205,30 +202,29 @@ export_to_video(output, "output.mp4", fps=16)

 ```py
 # pip install ftfy
-
 import torch
-from diffusers import WanPipeline, WanTransformer3DModel, AutoencoderKLWan
+from diffusers import WanPipeline, AutoModel

-vae = AutoencoderKLWan.from_single_file(
+vae = AutoModel.from_single_file(
     "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors"
 )
-transformer = WanTransformer3DModel.from_single_file(
+transformer = AutoModel.from_single_file(
     "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors",
     torch_dtype=torch.bfloat16
 )
 pipeline = WanPipeline.from_pretrained(
     "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
     vae=vae,
     transformer=transformer,
     torch_dtype=torch.bfloat16
 )
 ```

 - Set the [`AutoencoderKLWan`] dtype to `torch.float32` for better decoding quality.

 - The number of frames (`num_frames`) should follow the formula `4 * k + 1`, where `k` is an integer, for example `81` frames with `k=20`. See the sketch after this list.

-- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos, and try higher `shift` values (`7.0` to `12.0`) for higher resolution images.
+- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution videos.

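Taken together, a minimal sketch of how these recommendations combine in one generation, assuming the 1.3B checkpoint used above; the prompt, resolution, and `k` are placeholder choices:

```py
import torch
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video

# Keep the VAE in float32 for better decoding quality.
vae = AutoModel.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)
# Lower flow_shift for a lower resolution (480p) video.
pipeline.scheduler = UniPCMultistepScheduler.from_config(
    pipeline.scheduler.config, flow_shift=3.0
)
pipeline.to("cuda")

k = 20
num_frames = 4 * k + 1  # 81 frames

output = pipeline(
    prompt="A cat walks on the grass, realistic",  # placeholder prompt
    height=480,
    width=832,
    num_frames=num_frames,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
```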
 ## WanPipeline
