@@ -3,11 +3,19 @@
 
 import torch
 from accelerate import init_empty_weights
-from transformers import AutoModel, AutoTokenizer, CLIPTextModel, CLIPTokenizer, LlavaForConditionalGeneration
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    LlavaForConditionalGeneration,
+)
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
     FlowMatchEulerDiscreteScheduler,
+    HunyuanVideoImageToVideoPipeline,
     HunyuanVideoPipeline,
     HunyuanVideoTransformer3DModel,
 )
@@ -153,7 +161,7 @@ def remap_single_transformer_blocks_(key, state_dict):
         "rope_theta": 256.0,
         "rope_axes_dim": (16, 56, 56),
     },
-    "HYVideo-T/2": {
+    "HYVideo-T/2-I2V": {
         "in_channels": 16 * 2 + 1,
         "out_channels": 16,
         "num_attention_heads": 24,
@@ -286,23 +294,39 @@ def get_args():
     if args.save_pipeline:
         if args.transformer_type == "HYVideo-T/2-cfgdistill":
             text_encoder = AutoModel.from_pretrained(args.text_encoder_path, torch_dtype=torch.float16)
+            tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
+            text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
+            tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
+            scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
+
+            pipe = HunyuanVideoPipeline(
+                transformer=transformer,
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                scheduler=scheduler,
+            )
+            pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
         else:
             text_encoder = LlavaForConditionalGeneration.from_pretrained(
                 args.text_encoder_path, torch_dtype=torch.float16
             )
-
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
-        text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
-        tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
-        scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
-
-        pipe = HunyuanVideoPipeline(
-            transformer=transformer,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            text_encoder_2=text_encoder_2,
-            tokenizer_2=tokenizer_2,
-            scheduler=scheduler,
-        )
-        pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
+            tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
+            text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
+            tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
+            scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
+            image_processor = CLIPImageProcessor.from_pretrained(args.text_encoder_2_path)
+
+            pipe = HunyuanVideoImageToVideoPipeline(
+                transformer=transformer,
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                scheduler=scheduler,
+                image_processor=image_processor,
+            )
+            pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
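
For reference, a minimal sketch of how the image-to-video pipeline saved by this branch could be loaded and run. The output path, input image, prompt, and generation parameters below are illustrative assumptions, not part of the PR, and the call arguments assume the HunyuanVideoImageToVideoPipeline API mirrors the existing text-to-video pipeline in diffusers.

import torch
from diffusers import HunyuanVideoImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Folder written by this script with --save_pipeline; the path is a placeholder.
pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    "path/to/converted-hunyuanvideo-i2v", torch_dtype=torch.float16
)
pipe.vae.enable_tiling()  # reduce VAE memory use when decoding long videos
pipe.to("cuda")

# Conditioning image and prompt are placeholders.
image = load_image("input.png")
output = pipe(image=image, prompt="A cat walks on the grass", num_frames=61, num_inference_steps=30)
export_to_video(output.frames[0], "output.mp4", fps=15)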