From 2e50ff44f4c53436754ab5055c0adca6cc360d81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tolga=20Cang=C3=B6z?=
Date: Wed, 12 Mar 2025 10:25:36 +0300
Subject: [PATCH] Add support for image input in HunyuanVideoModelSpecification

---
 finetrainers/models/hunyuan_video/base_specification.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/finetrainers/models/hunyuan_video/base_specification.py b/finetrainers/models/hunyuan_video/base_specification.py
index e72a060b..07a8c070 100644
--- a/finetrainers/models/hunyuan_video/base_specification.py
+++ b/finetrainers/models/hunyuan_video/base_specification.py
@@ -7,9 +7,11 @@
     AutoencoderKLHunyuanVideo,
     FlowMatchEulerDiscreteScheduler,
+    HunyuanVideoImageToVideoPipeline,
     HunyuanVideoPipeline,
     HunyuanVideoTransformer3DModel,
 )
 from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
+from PIL.Image import Image
 from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, LlamaModel
 
 from ... import data
@@ -358,6 +360,7 @@ def validation(
         self,
         pipeline: HunyuanVideoPipeline,
         prompt: str,
+        image: Optional[Image] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_frames: Optional[int] = None,
@@ -365,8 +368,12 @@ def validation(
         generator: Optional[torch.Generator] = None,
         **kwargs,
     ) -> List[ArtifactType]:
+        if image is not None:
+            pipeline = HunyuanVideoImageToVideoPipeline.from_pipe(pipeline)
+
         generation_kwargs = {
             "prompt": prompt,
+            "image": image,
             "height": height,
             "width": width,
             "num_frames": num_frames,