Commit bb8f753

style

1 parent feb29c3 commit bb8f753

2 files changed: +71 -57 lines changed

src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.py

Lines changed: 35 additions & 30 deletions
@@ -12,13 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.checkpoint
 
+# YiYi TODO: remove this
+from einops import rearrange
+
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin
 from ...utils import logging
@@ -27,10 +31,6 @@
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
 from .vae import DecoderOutput, DiagonalGaussianDistribution
-import numpy as np
-
-#YiYi TODO: remove this
-from einops import rearrange
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -50,7 +50,7 @@ def __init__(self, in_channels: int, out_channels: int, non_linearity: str = "si
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
-        self.nonlinearity = get_activation(non_linearity)  # YiYi Notes, they have a custom defined swish but should be the same
+        self.nonlinearity = get_activation(non_linearity)
 
         # layers
         self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
@@ -109,9 +109,9 @@ def forward(self, x):
         value = self.to_v(x)
 
         batch_size, channels, height, width = query.shape
-        query = query.permute(0, 2, 3, 1).reshape(batch_size, height*width, channels).contiguous()
-        key = key.permute(0, 2, 3, 1).reshape(batch_size, height*width, channels).contiguous()
-        value = value.permute(0, 2, 3, 1).reshape(batch_size, height*width, channels).contiguous()
+        query = query.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels).contiguous()
+        key = key.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels).contiguous()
+        value = value.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels).contiguous()
 
         # apply attention
         x = F.scaled_dot_product_attention(query, key, value)
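This hunk only adds spaces around `*`, but the surrounding pattern is the usual one for attention over 2D feature maps: each spatial position becomes a token before `F.scaled_dot_product_attention`. A minimal standalone sketch of that reshape round-trip (tensor names and sizes are illustrative, not taken from the model):

```python
import torch
import torch.nn.functional as F

# Toy single-head attention over a (B, C, H, W) feature map.
batch_size, channels, height, width = 2, 64, 16, 16
query = torch.randn(batch_size, channels, height, width)
key = torch.randn(batch_size, channels, height, width)
value = torch.randn(batch_size, channels, height, width)

# (B, C, H, W) -> (B, H*W, C): one token per spatial position.
query = query.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels).contiguous()
key = key.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels).contiguous()
value = value.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels).contiguous()

out = F.scaled_dot_product_attention(query, key, value)  # (B, H*W, C)

# Back to a feature map: (B, H*W, C) -> (B, C, H, W).
out = out.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
print(out.shape)  # torch.Size([2, 64, 16, 16])
```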
@@ -182,12 +182,11 @@ class HunyuanImageMidBlock(nn.Module):
         in_channels (int): Number of input channels.
         num_layers (int): Number of layers.
     """
+
     def __init__(self, in_channels: int, num_layers: int = 1):
         super().__init__()
 
-        resnets = [
-            HunyuanImageResnetBlock(in_channels=in_channels, out_channels=in_channels)
-        ]
+        resnets = [HunyuanImageResnetBlock(in_channels=in_channels, out_channels=in_channels)]
 
         attentions = []
         for _ in range(num_layers):
@@ -198,7 +197,6 @@ def __init__(self, in_channels: int, num_layers: int = 1):
         self.attentions = nn.ModuleList(attentions)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
         x = self.resnets[0](x)
 
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
@@ -234,8 +232,10 @@ def __init__(
     ):
         super().__init__()
         if block_out_channels[-1] % (2 * z_channels) != 0:
-            raise ValueError(f"block_out_channels[-1 has to be divisible by 2 * out_channels, you have block_out_channels = {block_out_channels[-1]} and out_channels = {out_channels}")
-
+            raise ValueError(
+                f"block_out_channels[-1 has to be divisible by 2 * out_channels, you have block_out_channels = {block_out_channels[-1]} and out_channels = {z_channels}"
+            )
+
         self.in_channels = in_channels
         self.z_channels = z_channels
         self.block_out_channels = block_out_channels
@@ -256,14 +256,18 @@ def __init__(
             block_out_channel = block_out_channels[i]
             # residual blocks
             for _ in range(num_res_blocks):
-                self.down_blocks.append(HunyuanImageResnetBlock(in_channels=block_in_channel, out_channels=block_out_channel))
+                self.down_blocks.append(
+                    HunyuanImageResnetBlock(in_channels=block_in_channel, out_channels=block_out_channel)
+                )
                 block_in_channel = block_out_channel
 
             # downsample block
             if i < np.log2(ffactor_spatial) and i != len(block_out_channels) - 1:
                 if downsample_match_channel:
                     block_out_channel = block_out_channels[i + 1]
-                self.down_blocks.append(HunyuanImageDownsample(in_channels=block_in_channel, out_channels=block_out_channel))
+                self.down_blocks.append(
+                    HunyuanImageDownsample(in_channels=block_in_channel, out_channels=block_out_channel)
+                )
                 block_in_channel = block_out_channel
 
         # middle blocks
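In this hunk the number of downsampling stages is gated by `i < np.log2(ffactor_spatial)`: a total spatial reduction of `ffactor_spatial` needs `log2(ffactor_spatial)` stride-2 downsamples. A small sketch of that bookkeeping, with an illustrative config (not the model's actual defaults):

```python
import numpy as np

block_out_channels = (128, 256, 512, 512)  # illustrative widths
ffactor_spatial = 8                        # desired total spatial reduction factor

num_downsamples = 0
for i in range(len(block_out_channels)):
    # same condition as the hunk: keep downsampling while more reduction is needed,
    # but never after the last block
    if i < np.log2(ffactor_spatial) and i != len(block_out_channels) - 1:
        num_downsamples += 1

print(num_downsamples, 2**num_downsamples)  # 3 stages -> 8x spatial reduction
```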
@@ -305,7 +309,6 @@ class HunyuanImageDecoder2D(nn.Module):
     Decoder network that reconstructs output from latent representation.
 
     Args:
-
         z_channels : int
             Number of latent channels.
         out_channels : int
@@ -333,7 +336,9 @@ def __init__(
     ):
         super().__init__()
         if block_out_channels[0] % z_channels != 0:
-            raise ValueError(f"block_out_channels[0] should be divisible by z_channels but has block_out_channels[0] = {block_out_channels[0]} and z_channels = {z_channels}")
+            raise ValueError(
+                f"block_out_channels[0] should be divisible by z_channels but has block_out_channels[0] = {block_out_channels[0]} and z_channels = {z_channels}"
+            )
 
         self.z_channels = z_channels
         self.block_out_channels = block_out_channels
@@ -353,7 +358,9 @@ def __init__(
         for i in range(len(block_out_channels)):
             block_out_channel = block_out_channels[i]
             for _ in range(self.num_res_blocks + 1):
-                self.up_blocks.append(HunyuanImageResnetBlock(in_channels=block_in_channel, out_channels=block_out_channel))
+                self.up_blocks.append(
+                    HunyuanImageResnetBlock(in_channels=block_in_channel, out_channels=block_out_channel)
+                )
                 block_in_channel = block_out_channel
 
             if i < np.log2(ffactor_spatial) and i != len(block_out_channels) - 1:
@@ -369,9 +376,8 @@ def __init__(
         self.gradient_checkpointing = False
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
         h = self.conv_in(x) + x.repeat_interleave(repeats=self.repeat, dim=1)
-
+
         if torch.is_grad_enabled() and self.gradient_checkpointing:
             h = self._gradient_checkpointing_func(self.mid_block, h)
         else:
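The decoder's `forward` starts with `self.conv_in(x) + x.repeat_interleave(repeats=self.repeat, dim=1)`, i.e. the latent is widened by channel repetition so it can serve as a residual for the first convolution. A toy shape sketch; the sizes are made up, and `repeat = block_out_channels[0] // z_channels` is an assumption suggested by the divisibility check above, not quoted from the diff:

```python
import torch
import torch.nn as nn

z_channels, width0 = 16, 128            # illustrative; width0 % z_channels == 0
repeat = width0 // z_channels           # 8

conv_in = nn.Conv2d(z_channels, width0, kernel_size=3, padding=1)
z = torch.randn(1, z_channels, 32, 32)

# Conv projection plus a channel-repeated skip connection of the latent itself.
h = conv_in(z) + z.repeat_interleave(repeats=repeat, dim=1)
print(h.shape)  # torch.Size([1, 128, 32, 32])
```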
@@ -388,7 +394,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return h
 
 
-
 class AutoencoderKLHunyuanImage(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     r"""
     A VAE model for 2D images with spatial tiling support.
@@ -425,7 +430,7 @@ def __init__(
             ffactor_spatial=ffactor_spatial,
             downsample_match_channel=downsample_match_channel,
         )
-
+
         self.decoder = HunyuanImageDecoder2D(
             z_channels=latent_channels,
             out_channels=out_channels,
@@ -450,9 +455,9 @@ def enable_tiling(
         tile_overlap_factor: Optional[float] = None,
     ) -> None:
         r"""
-        Enable spatial tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-        processing larger images.
+        Enable spatial tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles
+        to compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to
+        allow processing larger images.
 
         Args:
             tile_sample_min_size (`int`, *optional*):
@@ -528,7 +533,7 @@ def encode(
     def _decode(self, z: torch.Tensor, return_dict: bool = True):
 
         batch_size, num_channels, height, width = z.shape
-
+
         if self.use_tiling and (width > self.tile_latent_min_size or height > self.tile_latent_min_size):
             return self.tiled_decode(z, return_dict=return_dict)
 
@@ -587,7 +592,7 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
 
         Args:
             x (`torch.Tensor`): Input tensor of shape (B, C, T, H, W).
-
+
         Returns:
             `torch.Tensor`:
                 The latent representation of the encoded images.
@@ -618,7 +623,7 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
             result_rows.append(torch.cat(result_row, dim=-1))
 
         moments = torch.cat(result_rows, dim=-2)
-
+
         return moments
 
     def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
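The tail of `tiled_encode` visible above stitches per-tile results back together: tiles in a row are concatenated along width (`dim=-1`) and the rows along height (`dim=-2`). A stripped-down sketch of that stitching, without the per-tile encoding and overlap blending the real method performs (tile size and the identity "encode" are simplifying assumptions):

```python
import torch

x = torch.randn(1, 3, 8, 8)
tile = 4

result_rows = []
for i in range(0, x.shape[-2], tile):
    result_row = []
    for j in range(0, x.shape[-1], tile):
        # a real implementation would encode the tile here; we just crop it
        result_row.append(x[..., i : i + tile, j : j + tile])
    result_rows.append(torch.cat(result_row, dim=-1))  # stitch along width

moments = torch.cat(result_rows, dim=-2)               # stitch along height
assert torch.equal(moments, x)
```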

src/diffusers/models/transformers/transformer_hunyuanimage.py

Lines changed: 36 additions & 27 deletions
@@ -28,14 +28,13 @@
 from ..cache_utils import CacheMixin
 from ..embeddings import (
     CombinedTimestepTextProjEmbeddings,
-    PixArtAlphaTextProjection,
     TimestepEmbedding,
     Timesteps,
     get_1d_rotary_pos_embed,
 )
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle, FP32LayerNorm
+from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -64,7 +63,7 @@ def __call__(
         key = attn.to_k(hidden_states)
         value = attn.to_v(hidden_states)
 
-        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2) # batch_size, heads, seq_len, head_dim
+        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)  # batch_size, heads, seq_len, head_dim
         key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
         value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)

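Only the comment spacing changes here, but the line itself is the standard head-splitting step: `unflatten(2, (heads, -1))` turns `(B, L, heads * head_dim)` into `(B, L, heads, head_dim)`, and `transpose(1, 2)` moves the head axis in front of the sequence. A quick shape check with illustrative dimensions:

```python
import torch

batch_size, seq_len, heads, head_dim = 2, 77, 8, 64
hidden_states = torch.randn(batch_size, seq_len, heads * head_dim)

query = hidden_states.unflatten(2, (heads, -1)).transpose(1, 2)
print(query.shape)  # torch.Size([2, 8, 77, 64]) -> batch_size, heads, seq_len, head_dim
```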
@@ -228,8 +227,8 @@ def forward(
 class HunyuanImageIndividualTokenRefinerBlock(nn.Module):
     def __init__(
         self,
-        num_attention_heads: int, # 28
-        attention_head_dim: int, # 128
+        num_attention_heads: int,  # 28
+        attention_head_dim: int,  # 128
         mlp_width_ratio: str = 4.0,
         mlp_drop_rate: float = 0.0,
         attention_bias: bool = True,
@@ -321,6 +320,7 @@ def forward(
 
         return hidden_states
 
+
 # txt_in
 class HunyuanImageTokenRefiner(nn.Module):
     def __init__(
@@ -381,7 +381,7 @@ def __init__(self, patch_size: int, rope_dim: List[int], theta: float = 256.0) -
         self.theta = theta
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        _, _ , height, width = hidden_states.shape
+        _, _, height, width = hidden_states.shape
         rope_sizes = [height // self.patch_size, width // self.patch_size]
 
         axes_grids = []
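`forward` derives the rotary-embedding grid from the patchified latent size (`rope_sizes = [height // patch_size, width // patch_size]`) and then fills `axes_grids` with one coordinate axis per dimension. A rough sketch of that grid construction in plain torch; how the real module splits `rope_dim` across axes and computes the frequencies is omitted here and assumed, not taken from the diff:

```python
import torch

patch_size = 2
height, width = 32, 48                       # latent height/width, illustrative
rope_sizes = [height // patch_size, width // patch_size]

# one coordinate grid per axis, flattened to one entry per patch
axes_grids = [torch.arange(size, dtype=torch.float32) for size in rope_sizes]
grid = torch.meshgrid(*axes_grids, indexing="ij")   # (h_grid, w_grid)
positions = torch.stack(grid, dim=0).flatten(1)     # (2, num_patches)
print(positions.shape)  # torch.Size([2, 384])
```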
@@ -635,7 +635,6 @@
     ) -> None:
         super().__init__()
 
-
         inner_dim = num_attention_heads * attention_head_dim
         out_channels = out_channels or in_channels
 
@@ -644,8 +643,7 @@
         self.context_embedder = HunyuanImageTokenRefiner(
             text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
         )
-        self.context_embedder_2 = HunyuanImageByT5TextProjection(
-            text_embed_2_dim, 2048, inner_dim)
+        self.context_embedder_2 = HunyuanImageByT5TextProjection(text_embed_2_dim, 2048, inner_dim)
 
         self.time_guidance_embed = HunyuanImageCombinedTimeGuidanceEmbedding(inner_dim, guidance_embeds)
 
@@ -739,7 +737,6 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
         for name, module in self.named_children():
             fn_recursive_attn_processor(name, module, processor)
 
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -785,24 +782,36 @@ def forward(
             new_encoder_hidden_states = []
             new_encoder_attention_mask = []
 
-            for text, text_mask, text_2, text_mask_2 in zip(encoder_hidden_states, encoder_attention_mask, encoder_hidden_states_2, encoder_attention_mask_2):
+            for text, text_mask, text_2, text_mask_2 in zip(
+                encoder_hidden_states, encoder_attention_mask, encoder_hidden_states_2, encoder_attention_mask_2
+            ):
                 text_mask = text_mask.bool()
                 text_mask_2 = text_mask_2.bool()
                 # Concatenate: [valid_mllm, valid_byt5, invalid_mllm, invalid_byt5]
-                new_encoder_hidden_states.append(torch.cat([
-                    text_2[text_mask_2],  # valid byt5
-                    text[text_mask],  # valid mllm
-                    text_2[~text_mask_2],  # invalid byt5
-                    text[~text_mask],  # invalid mllm
-                ], dim=0))
+                new_encoder_hidden_states.append(
+                    torch.cat(
+                        [
+                            text_2[text_mask_2],  # valid byt5
+                            text[text_mask],  # valid mllm
+                            text_2[~text_mask_2],  # invalid byt5
+                            text[~text_mask],  # invalid mllm
+                        ],
+                        dim=0,
+                    )
+                )
 
                 # Apply same reordering to attention masks
-                new_encoder_attention_mask.append(torch.cat([
-                    text_mask_2[text_mask_2],
-                    text_mask[text_mask],
-                    text_mask_2[~text_mask_2],
-                    text_mask[~text_mask],
-                ], dim=0))
+                new_encoder_attention_mask.append(
+                    torch.cat(
+                        [
+                            text_mask_2[text_mask_2],
+                            text_mask[text_mask],
+                            text_mask_2[~text_mask_2],
+                            text_mask[~text_mask],
+                        ],
+                        dim=0,
+                    )
+                )
 
             encoder_hidden_states = torch.stack(new_encoder_hidden_states)
             encoder_attention_mask = torch.stack(new_encoder_attention_mask)
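This reordering groups the valid tokens from both text encoders ahead of the padding, so a single contiguous mask boundary covers both streams. A toy demonstration of the boolean-mask concatenation; the tensors are made up stand-ins for the MLLM and byT5 embeddings:

```python
import torch

# two "token streams" with validity masks
text = torch.tensor([[10.0], [11.0], [0.0], [0.0]])   # e.g. MLLM tokens, 2 valid
text_mask = torch.tensor([True, True, False, False])
text_2 = torch.tensor([[20.0], [21.0], [22.0]])       # e.g. byT5 tokens, all valid
text_mask_2 = torch.tensor([True, True, True])

# valid byt5, valid mllm, invalid byt5, invalid mllm (same order as the diff)
merged = torch.cat(
    [text_2[text_mask_2], text[text_mask], text_2[~text_mask_2], text[~text_mask]],
    dim=0,
)
merged_mask = torch.cat(
    [text_mask_2[text_mask_2], text_mask[text_mask], text_mask_2[~text_mask_2], text_mask[~text_mask]],
    dim=0,
)
print(merged.squeeze(-1))  # tensor([20., 21., 22., 10., 11.,  0.,  0.])
print(merged_mask)         # tensor([ True,  True,  True,  True,  True, False, False])
```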
@@ -854,10 +863,10 @@ def forward(
         hidden_states = self.norm_out(hidden_states, temb)
         hidden_states = self.proj_out(hidden_states)
 
-        hidden_states = hidden_states.reshape(
-            batch_size, post_patch_height, post_patch_width, -1, p, p
-        )
-        hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) # batch_size, channels, height, patch_size, width, patch_size
+        hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, -1, p, p)
+        hidden_states = hidden_states.permute(
+            0, 3, 1, 4, 2, 5
+        )  # batch_size, channels, height, patch_size, width, patch_size
         hidden_states = hidden_states.flatten(4, 5).flatten(2, 3)
 
         if USE_PEFT_BACKEND:
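The reformatted lines above are the unpatchify step: tokens of shape `(B, H/p * W/p, C*p*p)` are reshaped to `(B, H/p, W/p, C, p, p)`, permuted to `(B, C, H/p, p, W/p, p)`, and flattened back to `(B, C, H, W)`. A shape-only sketch with illustrative dimensions:

```python
import torch

batch_size, post_patch_height, post_patch_width, channels, p = 1, 4, 6, 16, 2
hidden_states = torch.randn(batch_size, post_patch_height * post_patch_width, channels * p * p)

hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, -1, p, p)
hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5)  # batch, channels, height, patch, width, patch
hidden_states = hidden_states.flatten(4, 5).flatten(2, 3)
print(hidden_states.shape)  # torch.Size([1, 16, 8, 12])
```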
