
Commit 4cf1953 (parent: 2783601)

Commit message: style

9 files changed (+182, -178 lines)

src/diffusers/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -490,10 +490,10 @@
             "ImageTextPipelineOutput",
             "Kandinsky3Img2ImgPipeline",
             "Kandinsky3Pipeline",
-            "Kandinsky5T2VPipeline",
+            "Kandinsky5I2IPipeline",
             "Kandinsky5I2VPipeline",
             "Kandinsky5T2IPipeline",
-            "Kandinsky5I2IPipeline",
+            "Kandinsky5T2VPipeline",
             "KandinskyCombinedPipeline",
             "KandinskyImg2ImgCombinedPipeline",
             "KandinskyImg2ImgPipeline",

@@ -1177,10 +1177,10 @@
         ImageTextPipelineOutput,
         Kandinsky3Img2ImgPipeline,
         Kandinsky3Pipeline,
-        Kandinsky5T2VPipeline,
+        Kandinsky5I2IPipeline,
         Kandinsky5I2VPipeline,
         Kandinsky5T2IPipeline,
-        Kandinsky5I2IPipeline,
+        Kandinsky5T2VPipeline,
         KandinskyCombinedPipeline,
         KandinskyImg2ImgCombinedPipeline,
         KandinskyImg2ImgPipeline,
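
The two hunks above reorder the Kandinsky 5 entries alphabetically. A quick standalone check of that ordering (plain Python; names taken from the hunks, the check itself is not part of the commit):

names = [
    "Kandinsky5I2IPipeline",
    "Kandinsky5I2VPipeline",
    "Kandinsky5T2IPipeline",
    "Kandinsky5T2VPipeline",
]
assert names == sorted(names)  # holds for the post-commit order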

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -695,10 +695,10 @@
         Kandinsky3Pipeline,
     )
     from .kandinsky5 import (
-        Kandinsky5T2VPipeline,
+        Kandinsky5I2IPipeline,
         Kandinsky5I2VPipeline,
         Kandinsky5T2IPipeline,
-        Kandinsky5I2IPipeline,
+        Kandinsky5T2VPipeline,
     )
     from .latent_consistency_models import (
         LatentConsistencyModelImg2ImgPipeline,

src/diffusers/pipelines/kandinsky5/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -23,8 +23,8 @@
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_kandinsky"] = ["Kandinsky5T2VPipeline"]
-    _import_structure["pipeline_kandinsky_i2v"] = ["Kandinsky5I2VPipeline"]
     _import_structure["pipeline_kandinsky_i2i"] = ["Kandinsky5I2IPipeline"]
+    _import_structure["pipeline_kandinsky_i2v"] = ["Kandinsky5I2VPipeline"]
     _import_structure["pipeline_kandinsky_t2i"] = ["Kandinsky5T2IPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:

@@ -36,9 +36,9 @@
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_kandinsky import Kandinsky5T2VPipeline
+        from .pipeline_kandinsky_i2i import Kandinsky5I2IPipeline
         from .pipeline_kandinsky_i2v import Kandinsky5I2VPipeline
         from .pipeline_kandinsky_t2i import Kandinsky5T2IPipeline
-        from .pipeline_kandinsky_i2i import Kandinsky5I2IPipeline
 
 else:
     import sys
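
For context, the `_import_structure` mapping reordered above feeds diffusers' lazy-import machinery. A minimal sketch of how such a mapping is typically consumed, assuming the `_LazyModule` helper diffusers uses elsewhere (the `else: import sys` branch above is the tail of this pattern):

import sys

from ...utils import _LazyModule

_import_structure = {
    "pipeline_kandinsky": ["Kandinsky5T2VPipeline"],
    "pipeline_kandinsky_i2i": ["Kandinsky5I2IPipeline"],
    "pipeline_kandinsky_i2v": ["Kandinsky5I2VPipeline"],
    "pipeline_kandinsky_t2i": ["Kandinsky5T2IPipeline"],
}

# The module object is replaced by a _LazyModule, so each pipeline class is
# only imported when first accessed as an attribute; the reordering in this
# commit is purely cosmetic and does not change import behavior.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)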

src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py

Lines changed: 31 additions & 28 deletions
@@ -25,17 +25,19 @@
 from ...models import AutoencoderKLHunyuanVideo
 from ...models.transformers import Kandinsky5Transformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+
+# Add imports for offloading and tiling
+from ...utils import (
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import KandinskyPipelineOutput
 
-# Add imports for offloading and tiling
-from ...utils import (
-    is_accelerate_available,
-    is_accelerate_version,
-)
 
 if is_torch_xla_available():
     import torch_xla.core.xla_model as xm
@@ -91,7 +93,7 @@
 def basic_clean(text):
     """
     Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
-    
+
     Clean text using ftfy if available and unescape HTML entities.
     """
     if is_ftfy_available():
@@ -103,7 +105,7 @@ def basic_clean(text):
 def whitespace_clean(text):
     """
     Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
-    
+
     Normalize whitespace in text by replacing multiple spaces with single space.
     """
     text = re.sub(r"\s+", " ", text)
@@ -114,14 +116,13 @@ def whitespace_clean(text):
 def prompt_clean(text):
     """
     Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
-    
+
     Apply both basic cleaning and whitespace normalization to prompts.
     """
     text = whitespace_clean(basic_clean(text))
     return text
 
 
-
 class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using Kandinsky 5.0.
@@ -133,14 +134,16 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
         transformer ([`Kandinsky5Transformer3DModel`]):
             Conditional Transformer to denoise the encoded video latents.
         vae ([`AutoencoderKLHunyuanVideo`]):
-            Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo (vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from latent representations.
+            Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo
+            (vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from
+            latent representations.
         text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
             Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
         tokenizer ([`AutoProcessor`]):
             Tokenizer for Qwen2.5-VL.
         text_encoder_2 ([`CLIPTextModel`]):
-            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
         tokenizer_2 ([`CLIPTokenizer`]):
             Tokenizer for CLIP.
         scheduler ([`FlowMatchEulerDiscreteScheduler`]):
@@ -198,21 +201,21 @@ def __init__(
         self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-
     def _get_scale_factor(self, height: int, width: int) -> tuple:
         """
         Calculate the scale factor based on resolution.
-        
+
         Args:
             height (int): Video height
             width (int): Video width
-        
+
         Returns:
             tuple: Scale factor as (temporal_scale, height_scale, width_scale)
         """
-
-        between_480p = lambda x: 480 <= x <= 854
-
+
+        def between_480p(x):
+            return 480 <= x <= 854
+
         if between_480p(height) and between_480p(width):
             return (1, 2, 2)
         else:
@@ -337,14 +340,14 @@ def _encode_prompt_qwen(
             videos=None,
             return_tensors="pt",
             padding="longest",
-        )['input_ids']
+        )["input_ids"]
 
         if untruncated_ids.shape[-1] > max_allowed_len:
-            for i,text in enumerate(full_texts):
-                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx:-2]
-                removed_text = self.tokenizer.decode(tokens[max_sequence_length-2:])
+            for i, text in enumerate(full_texts):
+                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
+                removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
                 if len(removed_text) > 0:
-                    full_texts[i] = text[:-len(removed_text)]
+                    full_texts[i] = text[: -len(removed_text)]
                     logger.warning(
                         "The following part of your input was truncated because `max_sequence_length` is set to "
                         f" {max_sequence_length} tokens: {removed_text}"
@@ -538,7 +541,7 @@ def check_inputs(
         """
 
         if max_sequence_length is not None and max_sequence_length > 1024:
-            raise ValueError(f"max_sequence_length must be less than 1024")
+            raise ValueError("max_sequence_length must be less than 1024")
 
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
@@ -796,7 +799,7 @@ def __call__(
             dtype=dtype,
         )
 
-        if self.guidance_scale > 1.:
+        if self.guidance_scale > 1.0:
            if negative_prompt is None:
                negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"

@@ -867,7 +870,7 @@
                     continue
 
                 timestep = t.unsqueeze(0).repeat(batch_size * num_videos_per_prompt)
-                
+
                 # Predict noise residual
                 pred_velocity = self.transformer(
                     hidden_states=latents.to(dtype),
@@ -881,7 +884,7 @@
                     return_dict=True,
                 ).sample
 
-                if self.guidance_scale > 1. and negative_prompt_embeds_qwen is not None:
+                if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
                     uncond_pred_velocity = self.transformer(
                         hidden_states=latents.to(dtype),
                         encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
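
Beyond quote and spacing normalization, the `_encode_prompt_qwen` hunk only reformats the truncation slices. A minimal standalone sketch of that slicing behavior (illustrative values; `start_idx` stands in for `prompt_template_encode_start_idx`, and the trailing `-2` mirrors the template tokens excluded above):

tokens = list(range(20))  # stand-in for one row of untruncated_ids
start_idx, max_sequence_length = 3, 10

body = tokens[start_idx:-2]  # strip the template prefix and the last two tokens
removed = body[max_sequence_length - 2 :]  # tokens past the allowed budget
print(removed)  # [11, 12, 13, 14, 15, 16, 17]

The spaced slice forms such as `[max_sequence_length - 2 :]` match Black-style formatting for slices with complex expressions, consistent with the commit's "style" message.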
