|
47 | 47 | >>> from diffusers import HunyuanVideo15Pipeline |
48 | 48 | >>> from diffusers.utils import export_to_video |
49 | 49 |
|
50 | | - >>> model_id = "hunyuanvideo-community/HunyuanVideo15" |
| 50 | + >>> model_id = "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v" |
51 | 51 | >>> pipe = HunyuanVideo15Pipeline.from_pretrained(model_id, torch_dtype=torch.float16) |
52 | 52 | >>> pipe.vae.enable_tiling() |
53 | 53 | >>> pipe.to("cuda") |
@@ -196,7 +196,6 @@ class HunyuanVideo15Pipeline(DiffusionPipeline): |
196 | 196 | """ |
197 | 197 |
|
198 | 198 | model_cpu_offload_seq = "text_encoder->transformer->vae" |
199 | | - _callback_tensor_inputs = ["latents", "prompt_embeds"] |
200 | 199 |
|
201 | 200 | def __init__( |
202 | 201 | self, |
@@ -550,10 +549,6 @@ def prepare_cond_latents_and_mask(self, latents, dtype: Optional[torch.dtype], d |
550 | 549 | return cond_latents_concat, mask_concat |
551 | 550 |
|
552 | 551 |
|
553 | | - @property |
554 | | - def guidance_scale(self): |
555 | | - return self._guidance_scale |
556 | | - |
557 | 552 | @property |
558 | 553 | def num_timesteps(self): |
559 | 554 | return self._num_timesteps |
@@ -601,91 +596,67 @@ def __call__( |
601 | 596 |
|
602 | 597 | Args: |
603 | 598 | prompt (`str` or `List[str]`, *optional*): |
604 | | - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. |
| 599 | + The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`
605 | 600 | instead. |
606 | | - prompt_2 (`str` or `List[str]`, *optional*): |
607 | | - The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is |
608 | | - will be used instead. |
609 | 601 | negative_prompt (`str` or `List[str]`, *optional*): |
610 | 602 | The prompt or prompts not to guide the image generation. If not defined, one has to pass |
611 | | - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is |
612 | | - not greater than `1`). |
613 | | - negative_prompt_2 (`str` or `List[str]`, *optional*): |
614 | | - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and |
615 | | - `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. |
616 | | - height (`int`, defaults to `720`): |
617 | | - The height in pixels of the generated image. |
618 | | - width (`int`, defaults to `1280`): |
619 | | - The width in pixels of the generated image. |
620 | | - num_frames (`int`, defaults to `129`): |
| 603 | + `negative_prompt_embeds` instead. |
| 604 | + height (`int`, *optional*): |
| 605 | + The height in pixels of the generated video. |
| 606 | + width (`int`, *optional*): |
| 607 | + The width in pixels of the generated video. |
| 608 | + num_frames (`int`, defaults to `121`): |
621 | 609 | The number of frames in the generated video. |
622 | 610 | num_inference_steps (`int`, defaults to `50`): |
623 | | - The number of denoising steps. More denoising steps usually lead to a higher quality image at the |
| 611 | + The number of denoising steps. More denoising steps usually lead to a higher quality video at the |
624 | 612 | expense of slower inference. |
625 | 613 | sigmas (`List[float]`, *optional*): |
626 | 614 | Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in |
627 | 615 | their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed |
628 | 616 | will be used. |
629 | | - true_cfg_scale (`float`, *optional*, defaults to 1.0): |
630 | | - True classifier-free guidance (guidance scale) is enabled when `true_cfg_scale` > 1 and |
631 | | - `negative_prompt` is provided. |
632 | | - guidance_scale (`float`, defaults to `6.0`): |
633 | | - Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages |
634 | | - a model to generate images more aligned with `prompt` at the expense of lower image quality. |
635 | | -
|
636 | | - Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to |
637 | | - the [paper](https://huggingface.co/papers/2210.03142) to learn more. |
638 | 617 | num_videos_per_prompt (`int`, *optional*, defaults to 1): |
639 | | - The number of images to generate per prompt. |
| 618 | + The number of videos to generate per prompt. |
640 | 619 | generator (`torch.Generator` or `List[torch.Generator]`, *optional*): |
641 | 620 | A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make |
642 | 621 | generation deterministic. |
643 | 622 | latents (`torch.Tensor`, *optional*): |
644 | | - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image |
| 623 | + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video |
645 | 624 | generation. Can be used to tweak the same generation with different prompts. If not provided, a latents |
646 | 625 | tensor is generated by sampling using the supplied random `generator`. |
647 | 626 | prompt_embeds (`torch.Tensor`, *optional*): |
648 | 627 | Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not |
649 | 628 | provided, text embeddings are generated from the `prompt` input argument. |
650 | | - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): |
651 | | - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. |
652 | | - If not provided, pooled text embeddings will be generated from `prompt` input argument. |
653 | | - negative_prompt_embeds (`torch.FloatTensor`, *optional*): |
| 629 | + prompt_embeds_mask (`torch.Tensor`, *optional*): |
| 630 | + Pre-generated mask for prompt embeddings. |
| 631 | + negative_prompt_embeds (`torch.Tensor`, *optional*): |
654 | 632 | Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt |
655 | 633 | weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input |
656 | 634 | argument. |
657 | | - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): |
658 | | - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt |
659 | | - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` |
660 | | - input argument. |
661 | | - output_type (`str`, *optional*, defaults to `"pil"`): |
662 | | - The output format of the generated image. Choose between `PIL.Image` or `np.array`. |
| 635 | + negative_prompt_embeds_mask (`torch.Tensor`, *optional*): |
| 636 | + Pre-generated mask for negative prompt embeddings. |
| 637 | + prompt_embeds_2 (`torch.Tensor`, *optional*): |
| 638 | + Pre-generated text embeddings from the second text encoder. Can be used to easily tweak text inputs. |
| 639 | + prompt_embeds_mask_2 (`torch.Tensor`, *optional*): |
| 640 | + Pre-generated mask for prompt embeddings from the second text encoder. |
| 641 | + negative_prompt_embeds_2 (`torch.Tensor`, *optional*): |
| 642 | + Pre-generated negative text embeddings from the second text encoder. |
| 643 | + negative_prompt_embeds_mask_2 (`torch.Tensor`, *optional*): |
| 644 | + Pre-generated mask for negative prompt embeddings from the second text encoder. |
| 645 | + output_type (`str`, *optional*, defaults to `"np"`): |
| 646 | + The output format of the generated video. Choose between "np", "pt", or "latent". |
663 | 647 | return_dict (`bool`, *optional*, defaults to `True`): |
664 | | - Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a plain tuple. |
| 648 | + Whether or not to return a [`HunyuanVideo15PipelineOutput`] instead of a plain tuple. |
665 | 649 | attention_kwargs (`dict`, *optional*): |
666 | 650 | A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under |
667 | 651 | `self.processor` in |
668 | 652 | [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). |
669 | | - clip_skip (`int`, *optional*): |
670 | | - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that |
671 | | - the output of the pre-final layer will be used for computing the prompt embeddings. |
672 | | - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): |
673 | | - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of |
674 | | - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: |
675 | | - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a |
676 | | - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. |
677 | | - callback_on_step_end_tensor_inputs (`List`, *optional*): |
678 | | - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list |
679 | | - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the |
680 | | - `._callback_tensor_inputs` attribute of your pipeline class. |
681 | 653 |
|
682 | 654 | Examples: |
683 | 655 |
|
684 | 656 | Returns: |
685 | | - [`~HunyuanVideoPipelineOutput`] or `tuple`: |
686 | | - If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned, otherwise a `tuple` is returned |
687 | | - where the first element is a list with the generated images and the second element is a list of `bool`s |
688 | | - indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content. |
| 657 | + [`~HunyuanVideo15PipelineOutput`] or `tuple`: |
| 658 | + If `return_dict` is `True`, [`HunyuanVideo15PipelineOutput`] is returned, otherwise a `tuple` is returned |
| 659 | + where the first element is a list with the generated videos. |
689 | 660 | """ |
690 | 661 |
|
691 | 662 | # 1. Check inputs. Raise error if not correct |
@@ -867,7 +838,8 @@ def __call__( |
867 | 838 | xm.mark_step() |
868 | 839 |
|
869 | 840 | self._current_timestep = None |
870 | | - |
| 841 | + |
| 842 | + # 8. Decode the latents to video and postprocess
871 | 843 | if not output_type == "latent": |
872 | 844 | latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor |
873 | 845 | video = self.vae.decode(latents, return_dict=False)[0] |
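
As a quick sanity check on the updated docstring, here is a minimal usage sketch assembled from the example and the defaults shown in this diff. The output attribute name `frames` and the `fps` value are assumptions (borrowed from other diffusers video pipelines), not taken from this PR:

```python
import torch

from diffusers import HunyuanVideo15Pipeline
from diffusers.utils import export_to_video

# Repo id and dtype follow the updated example at the top of this diff.
model_id = "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v"
pipe = HunyuanVideo15Pipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.vae.enable_tiling()  # tile the VAE decode to keep memory use manageable for video latents
pipe.to("cuda")

# num_frames=121 and num_inference_steps=50 are the defaults documented above;
# output_type="np" (the documented default) returns NumPy frames suitable for export_to_video.
output = pipe(
    prompt="A cat walks on the grass, realistic style.",
    num_frames=121,
    num_inference_steps=50,
    output_type="np",
)
video = output.frames[0]  # `frames` is assumed to match other diffusers video pipeline outputs

export_to_video(video, "output.mp4", fps=24)  # fps value is illustrative
```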
|