Skip to content

Commit 77abad3

Browse files
committed
update
1 parent ab2476b commit 77abad3

File tree

6 files changed

+832
-20
lines changed

6 files changed

+832
-20
lines changed

scripts/convert_hunyuan_video_to_diffusers.py

Lines changed: 42 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,19 @@
33

44
import torch
55
from accelerate import init_empty_weights
6-
from transformers import AutoModel, AutoTokenizer, CLIPTextModel, CLIPTokenizer, LlavaForConditionalGeneration
6+
from transformers import (
7+
AutoModel,
8+
AutoTokenizer,
9+
CLIPImageProcessor,
10+
CLIPTextModel,
11+
CLIPTokenizer,
12+
LlavaForConditionalGeneration,
13+
)
714

815
from diffusers import (
916
AutoencoderKLHunyuanVideo,
1017
FlowMatchEulerDiscreteScheduler,
18+
HunyuanVideoImageToVideoPipeline,
1119
HunyuanVideoPipeline,
1220
HunyuanVideoTransformer3DModel,
1321
)
@@ -153,7 +161,7 @@ def remap_single_transformer_blocks_(key, state_dict):
153161
"rope_theta": 256.0,
154162
"rope_axes_dim": (16, 56, 56),
155163
},
156-
"HYVideo-T/2": {
164+
"HYVideo-T/2-I2V": {
157165
"in_channels": 16 * 2 + 1,
158166
"out_channels": 16,
159167
"num_attention_heads": 24,
@@ -286,23 +294,39 @@ def get_args():
286294
if args.save_pipeline:
287295
if args.transformer_type == "HYVideo-T/2-cfgdistill":
288296
text_encoder = AutoModel.from_pretrained(args.text_encoder_path, torch_dtype=torch.float16)
297+
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
298+
text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
299+
tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
300+
scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
301+
302+
pipe = HunyuanVideoPipeline(
303+
transformer=transformer,
304+
vae=vae,
305+
text_encoder=text_encoder,
306+
tokenizer=tokenizer,
307+
text_encoder_2=text_encoder_2,
308+
tokenizer_2=tokenizer_2,
309+
scheduler=scheduler,
310+
)
311+
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
289312
else:
290313
text_encoder = LlavaForConditionalGeneration.from_pretrained(
291314
args.text_encoder_path, torch_dtype=torch.float16
292315
)
293-
294-
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
295-
text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
296-
tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
297-
scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
298-
299-
pipe = HunyuanVideoPipeline(
300-
transformer=transformer,
301-
vae=vae,
302-
text_encoder=text_encoder,
303-
tokenizer=tokenizer,
304-
text_encoder_2=text_encoder_2,
305-
tokenizer_2=tokenizer_2,
306-
scheduler=scheduler,
307-
)
308-
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
316+
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
317+
text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
318+
tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
319+
scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
320+
image_processor = CLIPImageProcessor.from_pretrained(args.text_encoder_2_path)
321+
322+
pipe = HunyuanVideoImageToVideoPipeline(
323+
transformer=transformer,
324+
vae=vae,
325+
text_encoder=text_encoder,
326+
tokenizer=tokenizer,
327+
text_encoder_2=text_encoder_2,
328+
tokenizer_2=tokenizer_2,
329+
scheduler=scheduler,
330+
image_processor=image_processor,
331+
)
332+
pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -313,6 +313,7 @@
313313
"HunyuanDiTPAGPipeline",
314314
"HunyuanDiTPipeline",
315315
"HunyuanSkyreelsImageToVideoPipeline",
316+
"HunyuanVideoImageToVideoPipeline",
316317
"HunyuanVideoPipeline",
317318
"I2VGenXLPipeline",
318319
"IFImg2ImgPipeline",
@@ -823,6 +824,7 @@
823824
HunyuanDiTPAGPipeline,
824825
HunyuanDiTPipeline,
825826
HunyuanSkyreelsImageToVideoPipeline,
827+
HunyuanVideoImageToVideoPipeline,
826828
HunyuanVideoPipeline,
827829
I2VGenXLPipeline,
828830
IFImg2ImgPipeline,

src/diffusers/pipelines/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -222,7 +222,11 @@
222222
"EasyAnimateControlPipeline",
223223
]
224224
_import_structure["hunyuandit"] = ["HunyuanDiTPipeline"]
225-
_import_structure["hunyuan_video"] = ["HunyuanVideoPipeline", "HunyuanSkyreelsImageToVideoPipeline"]
225+
_import_structure["hunyuan_video"] = [
226+
"HunyuanVideoPipeline",
227+
"HunyuanSkyreelsImageToVideoPipeline",
228+
"HunyuanVideoImageToVideoPipeline",
229+
]
226230
_import_structure["kandinsky"] = [
227231
"KandinskyCombinedPipeline",
228232
"KandinskyImg2ImgCombinedPipeline",
@@ -570,7 +574,11 @@
570574
FluxPriorReduxPipeline,
571575
ReduxImageEncoder,
572576
)
573-
from .hunyuan_video import HunyuanSkyreelsImageToVideoPipeline, HunyuanVideoPipeline
577+
from .hunyuan_video import (
578+
HunyuanSkyreelsImageToVideoPipeline,
579+
HunyuanVideoImageToVideoPipeline,
580+
HunyuanVideoPipeline,
581+
)
574582
from .hunyuandit import HunyuanDiTPipeline
575583
from .i2vgen_xl import I2VGenXLPipeline
576584
from .kandinsky import (

src/diffusers/pipelines/hunyuan_video/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
else:
2525
_import_structure["pipeline_hunyuan_skyreels_image2video"] = ["HunyuanSkyreelsImageToVideoPipeline"]
2626
_import_structure["pipeline_hunyuan_video"] = ["HunyuanVideoPipeline"]
27+
_import_structure["pipeline_hunyuan_video_image2video"] = ["HunyuanVideoImageToVideoPipeline"]
2728

2829
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
2930
try:
@@ -35,6 +36,7 @@
3536
else:
3637
from .pipeline_hunyuan_skyreels_image2video import HunyuanSkyreelsImageToVideoPipeline
3738
from .pipeline_hunyuan_video import HunyuanVideoPipeline
39+
from .pipeline_hunyuan_video_image2video import HunyuanVideoImageToVideoPipeline
3840

3941
else:
4042
import sys

0 commit comments

Comments
 (0)