|
19 | 19 | import PIL |
20 | 20 | import regex as re |
21 | 21 | import torch |
22 | | -from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModelWithProjection, UMT5EncoderModel |
| 22 | +from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel |
23 | 23 |
|
24 | 24 | from ...callbacks import MultiPipelineCallbacks, PipelineCallback |
25 | 25 | from ...image_processor import PipelineImageInput |
|
49 | 49 | >>> import numpy as np |
50 | 50 | >>> from diffusers import AutoencoderKLWan, WanImageToVideoPipeline |
51 | 51 | >>> from diffusers.utils import export_to_video, load_image |
52 | | - >>> from transformers import CLIPVisionModelWithProjection |
| 52 | + >>> from transformers import CLIPVisionModel |
53 | 53 |
|
54 | 54 | >>> # Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers |
55 | 55 | >>> model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" |
56 | | - >>> image_encoder = CLIPVisionModelWithProjection.from_pretrained( |
| 56 | + >>> image_encoder = CLIPVisionModel.from_pretrained( |
57 | 57 | ... model_id, subfolder="image_encoder", torch_dtype=torch.float32 |
58 | 58 | ... ) |
59 | 59 | >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32) |
@@ -171,7 +171,7 @@ def __init__( |
171 | 171 | self, |
172 | 172 | tokenizer: AutoTokenizer, |
173 | 173 | text_encoder: UMT5EncoderModel, |
174 | | - image_encoder: CLIPVisionModelWithProjection, |
| 174 | + image_encoder: CLIPVisionModel, |
175 | 175 | image_processor: CLIPImageProcessor, |
176 | 176 | transformer: WanTransformer3DModel, |
177 | 177 | vae: AutoencoderKLWan, |
|
0 commit comments