vllm-project
diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py b/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py
Lines changed: 3 additions & 3 deletions
diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image_3/autoencoder.py
Lines changed: 53 additions & 31 deletions
@@ -2,13 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """hunyuan Image 3 diffusion model components."""
 
-from vllm_omni.diffusion.models.hunyuan_image_3.pipeline_hunyuan_image_3 import (
-    HunyuanImage3Pipeline,
-)
 from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_image_3_transformer import (
     HunyuanImage3Model,
     HunyuanImage3Text2ImagePipeline,
 )
+from vllm_omni.diffusion.models.hunyuan_image_3.pipeline_hunyuan_image_3 import (
+    HunyuanImage3Pipeline,
+)
 
 __all__ = [
     "HunyuanImage3Pipeline",
 
@@ -11,21 +11,21 @@
 # limitations under the License.
 # ==============================================================================
 
-from dataclasses import dataclass
-from typing import Tuple, Optional
 import math
-import random
+from dataclasses import dataclass
+from typing import Optional, Tuple  # noqa: UP035
+
 import numpy as np
-from einops import rearrange
 import torch
-from torch import Tensor, nn
 import torch.nn.functional as F
-
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from diffusers.models.modeling_utils import ModelMixin
-from diffusers.utils.torch_utils import randn_tensor
 from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+from einops import rearrange
+from torch import Tensor, nn
+
 
 class DiagonalGaussianDistribution(object):
     def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
@@ -57,6 +57,7 @@ def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTens
         x = self.mean + self.std * sample
         return x
 
+
 @dataclass
 class DecoderOutput(BaseOutput):
     sample: torch.FloatTensor
@@ -71,6 +72,7 @@ def forward_with_checkpointing(module, *inputs, use_checkpointing=False):
     def create_custom_forward(module):
         def custom_forward(*inputs):
             return module(*inputs)
+
         return custom_forward
 
     if use_checkpointing:
@@ -81,7 +83,7 @@ def custom_forward(*inputs):
 
 class Conv3d(nn.Conv3d):
     """
-    Perform Conv3d on patches with numerical differences from nn.Conv3d within 1e-5. 
+    Perform Conv3d on patches with numerical differences from nn.Conv3d within 1e-5.
     Only symmetric padding is supported.
     """
 
@@ -102,9 +104,9 @@ def forward(self, input):
                         value=0,
                     )
                     if i > 0:
-                        padded_chunk[:, :, :self.padding[0]] = chunks[i - 1][:, :, -self.padding[0]:]
+                        padded_chunk[:, :, : self.padding[0]] = chunks[i - 1][:, :, -self.padding[0] :]
                     if i < len(chunks) - 1:
-                        padded_chunk[:, :, -self.padding[0]:] = chunks[i + 1][:, :, :self.padding[0]]
+                        padded_chunk[:, :, -self.padding[0] :] = chunks[i + 1][:, :, : self.padding[0]]
                 else:
                     padded_chunk = chunks[i]
                 padded_chunks.append(padded_chunk)
@@ -120,7 +122,8 @@ def forward(self, input):
 
 
 class AttnBlock(nn.Module):
-    """ Attention with torch sdpa implementation. """
+    """Attention with torch sdpa implementation."""
+
     def __init__(self, in_channels: int):
         super().__init__()
         self.in_channels = in_channels
@@ -178,6 +181,7 @@ def forward(self, x):
             x = self.nin_shortcut(x)
         return x + h
 
+
 class DownsampleDCAE(nn.Module):
     def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
         super().__init__()
@@ -198,6 +202,7 @@ def forward(self, x: Tensor):
         shortcut = shortcut.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2)
         return h + shortcut
 
+
 class UpsampleDCAE(nn.Module):
     def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True):
         super().__init__()
@@ -215,10 +220,12 @@ def forward(self, x: Tensor):
         shortcut = rearrange(shortcut, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
         return h + shortcut
 
+
 class Encoder(nn.Module):
     """
     The encoder network of AutoencoderKLConv3D.
     """
+
     def __init__(
         self,
         in_channels: int,
@@ -251,8 +258,9 @@ def __init__(
             down.block = block
 
             add_spatial_downsample = bool(i_level < np.log2(ffactor_spatial))
-            add_temporal_downsample = (add_spatial_downsample and
-                                       bool(i_level >= np.log2(ffactor_spatial // ffactor_temporal)))
+            add_temporal_downsample = add_spatial_downsample and bool(
+                i_level >= np.log2(ffactor_spatial // ffactor_temporal)
+            )
             if add_spatial_downsample or add_temporal_downsample:
                 assert i_level < len(block_out_channels) - 1
                 block_out = block_out_channels[i_level + 1] if downsample_match_channel else block_in
@@ -280,7 +288,8 @@ def forward(self, x: Tensor) -> Tensor:
         for i_level in range(len(self.block_out_channels)):
             for i_block in range(self.num_res_blocks):
                 h = forward_with_checkpointing(
-                    self.down[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
+                    self.down[i_level].block[i_block], h, use_checkpointing=use_checkpointing
+                )
             if hasattr(self.down[i_level], "downsample"):
                 h = forward_with_checkpointing(self.down[i_level].downsample, h, use_checkpointing=use_checkpointing)
 
@@ -298,10 +307,12 @@ def forward(self, x: Tensor) -> Tensor:
         h += shortcut
         return h
 
+
 class Decoder(nn.Module):
     """
     The decoder network of AutoencoderKLConv3D.
     """
+
     def __init__(
         self,
         z_channels: int,
@@ -380,10 +391,12 @@ def forward(self, z: Tensor) -> Tensor:
         h = self.conv_out(h)
         return h
 
+
 class AutoencoderKLConv3D(ModelMixin, ConfigMixin):
     """
     Autoencoder model with KL-regularized latent space based on 3D convolutions.
     """
+
     _supports_gradient_checkpointing = True
 
     @register_to_config
@@ -402,8 +415,8 @@ def __init__(
         shift_factor: Optional[float] = None,
         downsample_match_channel: bool = True,
         upsample_match_channel: bool = True,
-        only_encoder: bool = False,     # only build encoder for saving memory
-        only_decoder: bool = False,     # only build decoder for saving memory
+        only_encoder: bool = False,  # only build encoder for saving memory
+        only_decoder: bool = False,  # only build decoder for saving memory
     ):
         super().__init__()
         self.ffactor_spatial = ffactor_spatial
@@ -449,27 +462,29 @@ def __init__(
 
         # use torch.compile for faster encode speed
         self.use_compile = False
-    
+
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (Encoder, Decoder)):
             module.gradient_checkpointing = value
-            
+
     def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
         blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
         for x in range(blend_extent):
-            b[:, :, :, :, x] = \
-                a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                x / blend_extent
+            )
         return b
 
     def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
         blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
         for y in range(blend_extent):
-            b[:, :, :, y, :] = \
-                a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                y / blend_extent
+            )
         return b
 
     def spatial_tiled_decode(self, z: torch.Tensor):
-        """ spatial tailing for frames """
+        """spatial tiling for frames"""
         B, C, T, H, W = z.shape
         overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))  # 8 * (1 - 0.25) = 6
         blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)  # 256 * 0.25 = 64
@@ -479,7 +494,7 @@ def spatial_tiled_decode(self, z: torch.Tensor):
         for i in range(0, H, overlap_size):
             row = []
             for j in range(0, W, overlap_size):
-                tile = z[:, :, :, i: i + self.tile_latent_min_size, j: j + self.tile_latent_min_size]
+                tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
                 decoded = self.decoder(tile)
                 row.append(decoded)
             rows.append(row)
@@ -498,7 +513,7 @@ def spatial_tiled_decode(self, z: torch.Tensor):
         return dec
 
     def temporal_tiled_decode(self, z: torch.Tensor):
-        """ temporal tailing for frames """
+        """temporal tiling for frames"""
         B, C, T, H, W = z.shape
         overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))  # 8 * (1 - 0.25) = 6
         blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)  # 64 * 0.25 = 16
@@ -507,9 +522,10 @@ def temporal_tiled_decode(self, z: torch.Tensor):
 
         row = []
         for i in range(0, T, overlap_size):
-            tile = z[:, :, i: i + self.tile_latent_min_tsize, :, :]
+            tile = z[:, :, i : i + self.tile_latent_min_tsize, :, :]
             if self.use_spatial_tiling and (
-                    tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
+                tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size
+            ):
                 decoded = self.spatial_tiled_decode(tile)
             else:
                 decoded = self.decoder(tile)
@@ -522,23 +538,27 @@ def temporal_tiled_decode(self, z: torch.Tensor):
             result_row.append(tile[:, :, :t_limit, :, :])
         dec = torch.cat(result_row, dim=-3)
         return dec
-    
+
     def encode(self, x: Tensor, return_dict: bool = True):
         """
         Encodes the input by passing through the encoder network.
         Support slicing and tiling for memory efficiency.
         """
+
         def _encode(x):
             if self.use_temporal_tiling and x.shape[-3] > self.tile_sample_min_tsize:
                 return self.temporal_tiled_encode(x)
             if self.use_spatial_tiling and (
-                    x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
+                x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size
+            ):
                 return self.spatial_tiled_encode(x)
 
             if self.use_compile:
+
                 @torch.compile
                 def encoder(x):
                     return self.encoder(x)
+
                 return encoder(x)
             return self.encoder(x)
 
@@ -567,17 +587,19 @@ def encoder(x):
             return (posterior,)
 
         return AutoencoderKLOutput(latent_dist=posterior)
-    
+
     def decode(self, z: Tensor, return_dict: bool = True, generator=None):
         """
         Decodes the input by passing through the decoder network.
         Support slicing and tiling for memory efficiency.
         """
+
         def _decode(z):
             if self.use_temporal_tiling and z.shape[-3] > self.tile_latent_min_tsize:
                 return self.temporal_tiled_decode(z)
             if self.use_spatial_tiling and (
-                    z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
+                z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size
+            ):
                 return self.spatial_tiled_decode(z)
             return self.decoder(z)