 from ...models import AutoencoderKLHunyuanVideo
 from ...models.transformers import Kandinsky5Transformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+
+# Add imports for offloading and tiling
+from ...utils import (
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import KandinskyPipelineOutput
 
-# Add imports for offloading and tiling
-from ...utils import (
-    is_accelerate_available,
-    is_accelerate_version,
-)
 
 if is_torch_xla_available():
     import torch_xla.core.xla_model as xm
@@ -91,7 +93,7 @@
 def basic_clean(text):
     """
     Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
-
+
     Clean text using ftfy if available and unescape HTML entities.
     """
     if is_ftfy_available():
@@ -103,7 +105,7 @@ def basic_clean(text):
 def whitespace_clean(text):
     """
     Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
-
+
     Normalize whitespace in text by replacing multiple spaces with single space.
     """
     text = re.sub(r"\s+", " ", text)
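The cleaning helpers above fold HTML entities and whitespace runs into clean prompt text, and the next hunk's `prompt_clean` chains them. A minimal standalone restatement of the intended behavior, omitting the optional ftfy pass (an assumption of this sketch, not the pipeline's exact code):

```python
import html
import re


def basic_clean(text):
    # Simplified: the real helper also runs ftfy.fix_text first when available.
    return html.unescape(text)


def whitespace_clean(text):
    # Collapse any run of whitespace (spaces, tabs, newlines) into one space.
    return re.sub(r"\s+", " ", text)


assert whitespace_clean(basic_clean("A  cat &amp;\na dog")) == "A cat & a dog"
```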
@@ -114,14 +116,13 @@ def whitespace_clean(text):
 def prompt_clean(text):
     """
     Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
-
+
     Apply both basic cleaning and whitespace normalization to prompts.
     """
     text = whitespace_clean(basic_clean(text))
     return text
 
 
-
 class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using Kandinsky 5.0.
@@ -133,14 +134,16 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
         transformer ([`Kandinsky5Transformer3DModel`]):
             Conditional Transformer to denoise the encoded video latents.
         vae ([`AutoencoderKLHunyuanVideo`]):
-            Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo (vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from latent representations.
+            Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo
+            (vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from
+            latent representations.
         text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
             Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
         tokenizer ([`AutoProcessor`]):
             Tokenizer for Qwen2.5-VL.
         text_encoder_2 ([`CLIPTextModel`]):
-            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
         tokenizer_2 ([`CLIPTokenizer`]):
             Tokenizer for CLIP.
         scheduler ([`FlowMatchEulerDiscreteScheduler`]):
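The docstring lists the components but no example call. A minimal usage sketch; the checkpoint id is a placeholder, and `num_frames` plus the `.frames` output attribute are assumed from the usual diffusers video-pipeline conventions, not confirmed by this diff:

```python
import torch

from diffusers import Kandinsky5T2VPipeline

# "<checkpoint-id>" is a placeholder, not a real repository name.
pipe = Kandinsky5T2VPipeline.from_pretrained("<checkpoint-id>", torch_dtype=torch.bfloat16)
pipe.to("cuda")

video = pipe(prompt="A red panda rafting down a forest river", num_frames=25).frames[0]
```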
@@ -198,21 +201,21 @@ def __init__(
         self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-
     def _get_scale_factor(self, height: int, width: int) -> tuple:
         """
         Calculate the scale factor based on resolution.
-
+
         Args:
             height (int): Video height
             width (int): Video width
-
+
         Returns:
             tuple: Scale factor as (temporal_scale, height_scale, width_scale)
         """
-
-        between_480p = lambda x: 480 <= x <= 854
-
+
+        def between_480p(x):
+            return 480 <= x <= 854
+
         if between_480p(height) and between_480p(width):
             return (1, 2, 2)
         else:
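Only the 480p branch of `_get_scale_factor` is visible here; the `else` body is elided by the diff. A standalone restatement of just the visible logic (the helper name is mine, and the non-480p return value is deliberately not guessed):

```python
def scale_factor_visible(height: int, width: int) -> tuple:
    def between_480p(x):
        return 480 <= x <= 854

    if between_480p(height) and between_480p(width):
        return (1, 2, 2)  # (temporal_scale, height_scale, width_scale)
    raise NotImplementedError("non-480p branch elided in this diff")


assert scale_factor_visible(480, 832) == (1, 2, 2)
```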
@@ -337,14 +340,14 @@ def _encode_prompt_qwen(
             videos=None,
             return_tensors="pt",
             padding="longest",
-        )['input_ids']
+        )["input_ids"]
 
         if untruncated_ids.shape[-1] > max_allowed_len:
-            for i,text in enumerate(full_texts):
-                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx:-2]
-                removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2:])
+            for i, text in enumerate(full_texts):
+                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
+                removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
                 if len(removed_text) > 0:
-                    full_texts[i] = text[:-len(removed_text)]
+                    full_texts[i] = text[: -len(removed_text)]
                     logger.warning(
                         "The following part of your input was truncated because `max_sequence_length` is set to "
                         f" {max_sequence_length} tokens: {removed_text}"
@@ -538,7 +541,7 @@ def check_inputs(
538541 """
539542
540543 if max_sequence_length is not None and max_sequence_length > 1024 :
541- raise ValueError (f "max_sequence_length must be less than 1024" )
544+ raise ValueError ("max_sequence_length must be less than 1024" )
542545
543546 if height % 16 != 0 or width % 16 != 0 :
544547 raise ValueError (f"`height` and `width` have to be divisible by 16 but are { height } and { width } ." )
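Because `check_inputs` rejects any resolution not divisible by 16, callers may want to snap a requested size down first. A small hypothetical helper, not part of the pipeline:

```python
def snap_to_16(x: int) -> int:
    # Round down to the nearest multiple of 16, the granularity check_inputs enforces.
    return max(16, (x // 16) * 16)


assert snap_to_16(481) == 480
assert snap_to_16(854) == 848
```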
@@ -796,7 +799,7 @@ def __call__(
             dtype=dtype,
         )
 
-        if self.guidance_scale > 1.:
+        if self.guidance_scale > 1.0:
             if negative_prompt is None:
                 negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
 
@@ -867,7 +870,7 @@ def __call__(
                     continue
 
                 timestep = t.unsqueeze(0).repeat(batch_size * num_videos_per_prompt)
-
+
                 # Predict noise residual
                 pred_velocity = self.transformer(
                     hidden_states=latents.to(dtype),
@@ -881,7 +884,7 @@ def __call__(
                     return_dict=True,
                 ).sample
 
-                if self.guidance_scale > 1. and negative_prompt_embeds_qwen is not None:
+                if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
                     uncond_pred_velocity = self.transformer(
                         hidden_states=latents.to(dtype),
                         encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
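The diff cuts off inside the unconditional forward pass. For orientation only: in the usual diffusers loop shape, the two velocity predictions are combined with the standard classifier-free-guidance formula and then integrated by the flow-match scheduler. Neither line is shown in this diff, so the sketch below is an assumption:

```python
# Hypothetical continuation (not in the diff): standard CFG mix of the two predictions,
pred_velocity = uncond_pred_velocity + self.guidance_scale * (pred_velocity - uncond_pred_velocity)
# followed by one Euler step of the FlowMatchEulerDiscreteScheduler.
latents = self.scheduler.step(pred_velocity, t, latents).prev_sample
```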