
Commit e194034

clean up a bit more pipelines
1 parent e319d72 commit e194034

3 files changed: +97 -139 lines changed


src/diffusers/pipelines/hunyuan_video1_5/image_processor.py

Lines changed: 4 additions & 2 deletions

@@ -17,7 +17,7 @@
 from ...video_processor import VideoProcessor
 from ...configuration_utils import register_to_config
 
-# Copied from hyvideo/utils/data_utils.py
+# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L20
 def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
     num_patches = round((base_size / patch_size) ** 2)
     assert max_ratio >= 1.0
@@ -32,7 +32,7 @@ def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
         wp -= 1
     return crop_size_list
 
-# Copied from hyvideo/utils/data_utils.py
+# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L38
 def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
     """
     Get the closest ratio in the buckets.
@@ -72,6 +72,8 @@ class HunyuanVideo15ImageProcessor(VideoProcessor):
             this factor.
         vae_latent_channels (`int`, *optional*, defaults to `32`):
             VAE latent channels.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
     """
 
     @register_to_config
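
For reference, the two helpers these comments now link to look roughly as follows. This is a sketch reconstructed from the fragments visible in the hunks above and the linked hyvideo/utils/data_utils.py; the bucket-enumeration loop and the body of `get_closest_ratio` are assumptions, not part of this diff.

```python
import numpy as np


def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
    # Enumerate (width, height) buckets whose patch count stays at or below
    # (base_size / patch_size) ** 2 and whose aspect ratio is <= max_ratio.
    num_patches = round((base_size / patch_size) ** 2)
    assert max_ratio >= 1.0
    crop_size_list = []
    wp, hp = num_patches, 1
    while wp > 0:
        if max(wp, hp) / min(wp, hp) <= max_ratio:
            crop_size_list.append((wp * patch_size, hp * patch_size))
        if (hp + 1) * wp <= num_patches:
            hp += 1
        else:
            wp -= 1
    return crop_size_list


def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
    # Pick the bucket whose aspect ratio is closest to height / width.
    aspect_ratio = float(height) / float(width)
    closest_ratio_id = int(np.abs(np.array(ratios) - aspect_ratio).argmin())
    return buckets[closest_ratio_id], float(ratios[closest_ratio_id])
```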

src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py

Lines changed: 32 additions & 60 deletions

@@ -47,7 +47,7 @@
         >>> from diffusers import HunyuanVideo15Pipeline
         >>> from diffusers.utils import export_to_video
 
-        >>> model_id = "hunyuanvideo-community/HunyuanVideo15"
+        >>> model_id = "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v"
         >>> pipe = HunyuanVideo15Pipeline.from_pretrained(model_id, torch_dtype=torch.float16)
         >>> pipe.vae.enable_tiling()
         >>> pipe.to("cuda")
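
Filled out, the updated docstring example presumably runs along these lines; the prompt, frame count, and fps below are illustrative placeholders rather than values shown in this hunk.

```python
import torch
from diffusers import HunyuanVideo15Pipeline
from diffusers.utils import export_to_video

model_id = "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v"
pipe = HunyuanVideo15Pipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.vae.enable_tiling()  # tiled VAE decoding keeps peak memory down
pipe.to("cuda")

# Placeholder prompt; num_frames matches the documented default of 121
# further down in this diff.
video = pipe(prompt="A cat walks on the grass, realistic style.", num_frames=121).frames[0]
export_to_video(video, "output.mp4", fps=24)
```
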
@@ -196,7 +196,6 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
     """
 
     model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
 
     def __init__(
         self,
@@ -550,10 +549,6 @@ def prepare_cond_latents_and_mask(self, latents, dtype: Optional[torch.dtype], d
         return cond_latents_concat, mask_concat
 
 
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
     @property
     def num_timesteps(self):
         return self._num_timesteps
@@ -601,91 +596,67 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                 instead.
-            prompt_2 (`str` or `List[str]`, *optional*):
-                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
-                will be used instead.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
-                not greater than `1`).
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
-                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            height (`int`, defaults to `720`):
-                The height in pixels of the generated image.
-            width (`int`, defaults to `1280`):
-                The width in pixels of the generated image.
-            num_frames (`int`, defaults to `129`):
+                `negative_prompt_embeds` instead.
+            height (`int`, *optional*):
+                The height in pixels of the generated video.
+            width (`int`, *optional*):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `121`):
                 The number of frames in the generated video.
             num_inference_steps (`int`, defaults to `50`):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                 expense of slower inference.
             sigmas (`List[float]`, *optional*):
                 Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
-            true_cfg_scale (`float`, *optional*, defaults to 1.0):
-                True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and
-                `negative_prompt` is provided.
-            guidance_scale (`float`, defaults to `6.0`):
-                Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
-                a model to generate images more aligned with `prompt` at the expense of lower image quality.
-
-                Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to
-                the [paper](https://huggingface.co/papers/2210.03142) to learn more.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
+                The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
             latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
             prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated mask for prompt embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            negative_prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated mask for negative prompt embeddings.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings from the second text encoder. Can be used to easily tweak text inputs.
+            prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated mask for prompt embeddings from the second text encoder.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings from the second text encoder.
+            negative_prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated mask for negative prompt embeddings from the second text encoder.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between "np", "pt", or "latent".
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`HunyuanVideo15PipelineOutput`] instead of a plain tuple.
             attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
-                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
-                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
-                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
-                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
 
         Examples:
 
         Returns:
-            [`~HunyuanVideoPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned, otherwise a `tuple` is returned
-                where the first element is a list with the generated images and the second element is a list of `bool`s
-                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
+            [`~HunyuanVideo15PipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`HunyuanVideo15PipelineOutput`] is returned, otherwise a `tuple` is returned
+                where the first element is a list with the generated videos.
         """
 
         # 1. Check inputs. Raise error if not correct
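
To make the reworked argument list concrete, a hypothetical call against the `pipe` from the example above might look like this; the resolution and prompt values are invented for illustration, and `.frames` as the output attribute is an assumption based on other diffusers video pipelines.

```python
# Hypothetical invocation exercising the arguments documented above.
result = pipe(
    prompt="A dog surfing a wave at sunset.",
    negative_prompt="low quality, blurry",  # used for classifier-free guidance
    height=480,                             # illustrative resolution, not a default
    width=832,
    num_frames=121,
    num_inference_steps=50,
    output_type="np",                       # "np", "pt", or "latent"
    return_dict=True,                       # returns HunyuanVideo15PipelineOutput
)
video = result.frames[0]                    # assumed output attribute
```
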
@@ -867,7 +838,8 @@ def __call__(
                     xm.mark_step()
 
         self._current_timestep = None
-
+
+        # 8. decode the latents to video and postprocess
         if not output_type == "latent":
             latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
             video = self.vae.decode(latents, return_dict=False)[0]
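
When `output_type="latent"` is requested, the decode step in this last hunk can be reproduced by hand. A sketch mirroring the lines above; that latent outputs arrive in `.frames` and that the pipeline exposes its processor as `video_processor` are assumptions.

```python
# Run the pipeline up to (but not including) VAE decoding.
latents = pipe(prompt="A cat walks on the grass.", output_type="latent").frames

# Mirror the decode path shown in the hunk above.
latents = latents.to(pipe.vae.dtype) / pipe.vae.config.scaling_factor
video = pipe.vae.decode(latents, return_dict=False)[0]

# Postprocessing call assumed from diffusers' VideoProcessor API.
video = pipe.video_processor.postprocess_video(video, output_type="np")
```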
