From 77eaf324e710d1f1edd42448b63f104daed674fc Mon Sep 17 00:00:00 2001
From: cesaryuan
Date: Sun, 26 Oct 2025 12:34:55 +0800
Subject: [PATCH] Fix: update type hints for Tuple parameters across multiple files to support variable-length tuples
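
In typing semantics, `Tuple[int]` describes a tuple of exactly one int,
while `Tuple[int, ...]` describes a homogeneous tuple of any length. The
defaults these parameters actually take are multi-element tuples, so static
type checkers flag the old annotations. A minimal sketch of the mismatch
(`old_hint`/`new_hint` are illustrative names, not part of the codebase):

    from typing import Tuple

    def old_hint(channels: Tuple[int] = (320, 640, 1280, 1280)) -> None:
        ...  # flagged by e.g. mypy: a 4-element default is not a 1-tuple

    def new_hint(channels: Tuple[int, ...] = (320, 640, 1280, 1280)) -> None:
        ...  # accepted: a tuple of ints of any length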
---
 examples/community/img2img_inpainting.py | 2 +-
 examples/community/matryoshka.py | 12 +++++------
 .../pipeline_faithdiff_stable_diffusion_xl.py | 6 +++---
 .../models/autoencoders/autoencoder_dc.py | 14 ++++++-------
 .../models/autoencoders/autoencoder_kl.py | 6 +++---
 .../autoencoders/autoencoder_kl_cogvideox.py | 6 +++---
 .../autoencoder_kl_hunyuan_video.py | 2 +-
 .../autoencoder_kl_hunyuanimage_refiner.py | 2 +-
 .../autoencoders/autoencoder_kl_mochi.py | 4 ++--
 .../autoencoders/autoencoder_kl_qwenimage.py | 2 +-
 .../autoencoder_kl_temporal_decoder.py | 6 +++---
 .../models/autoencoders/autoencoder_kl_wan.py | 2 +-
 .../models/controlnets/controlnet_xs.py | 20 +++++++++----------
 .../transformers/transformer_hunyuan_video.py | 2 +-
 .../transformer_hunyuan_video_framepack.py | 2 +-
 .../transformers/transformer_hunyuanimage.py | 2 +-
 .../transformers/transformer_skyreels_v2.py | 2 +-
 .../models/transformers/transformer_wan.py | 2 +-
 .../transformers/transformer_wan_vace.py | 2 +-
 src/diffusers/models/unets/unet_1d.py | 8 ++++----
 .../models/unets/unet_2d_condition.py | 12 +++++------
 src/diffusers/models/unets/unet_kandinsky3.py | 2 +-
 .../unets/unet_spatio_temporal_condition.py | 8 ++++----
 .../models/unets/unet_stable_cascade.py | 10 +++++-----
 src/diffusers/models/vae_flax.py | 14 ++++++-------
 .../pipelines/audioldm2/modeling_audioldm2.py | 6 +++---
 .../versatile_diffusion/modeling_text_unet.py | 6 +++---
 src/diffusers/pipelines/shap_e/renderer.py | 8 ++++----
 28 files changed, 85 insertions(+), 85 deletions(-)

diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
index 595df107cacb..bef682425a2c 100644
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -45,7 +45,7 @@ def check_size(image, height, width):
         raise ValueError(f"Image size should be {height}x{width}, but got {h}x{w}")
 
 
-def overlay_inner_image(image, inner_image, paste_offset: Tuple[int] = (0, 0)):
+def overlay_inner_image(image, inner_image, paste_offset: Tuple[int, ...] = (0, 0)):
     inner_image = inner_image.convert("RGBA")
     image = image.convert("RGB")
 
diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py
index 97ad8b9e86c6..295ef8411923 100644
--- a/examples/community/matryoshka.py
+++ b/examples/community/matryoshka.py
@@ -1966,16 +1966,16 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -2294,10 +2294,10 @@ def __init__(
 
     def _check_config(
         self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
+        down_block_types: Tuple[str, ...],
+        up_block_types: Tuple[str, ...],
         only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
+        block_out_channels: Tuple[int, ...],
         layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
diff --git a/examples/community/pipeline_faithdiff_stable_diffusion_xl.py b/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
index a8fdc133d08b..c79d212754b8 100644
--- a/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
+++ b/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
@@ -438,16 +438,16 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py
index 724ec3bb760c..ec301ef8ad51 100644
--- a/src/diffusers/models/autoencoders/autoencoder_dc.py
+++ b/src/diffusers/models/autoencoders/autoencoder_dc.py
@@ -102,7 +102,7 @@ def get_block(
     attention_head_dim: int,
     norm_type: str,
     act_fn: str,
-    qkv_mutliscales: Tuple[int] = (),
+    qkv_mutliscales: Tuple[int, ...] = (),
 ):
     if block_type == "ResBlock":
         block = ResBlock(in_channels, out_channels, norm_type, act_fn)
@@ -206,8 +206,8 @@ def __init__(
         latent_channels: int,
         attention_head_dim: int = 32,
         block_type: Union[str, Tuple[str]] = "ResBlock",
-        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
+        layers_per_block: Tuple[int, ...] = (2, 2, 2, 2, 2, 2),
         qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         downsample_block_type: str = "pixel_unshuffle",
         out_shortcut: bool = True,
@@ -292,8 +292,8 @@ def __init__(
         latent_channels: int,
         attention_head_dim: int = 32,
         block_type: Union[str, Tuple[str]] = "ResBlock",
-        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
+        layers_per_block: Tuple[int, ...] = (2, 2, 2, 2, 2, 2),
         qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         norm_type: Union[str, Tuple[str]] = "rms_norm",
         act_fn: Union[str, Tuple[str]] = "silu",
@@ -440,8 +440,8 @@ def __init__(
         decoder_block_types: Union[str, Tuple[str]] = "ResBlock",
         encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
         decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
-        encoder_layers_per_block: Tuple[int] = (2, 2, 2, 3, 3, 3),
-        decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3, 3, 3),
+        encoder_layers_per_block: Tuple[int, ...] = (2, 2, 2, 3, 3, 3),
+        decoder_layers_per_block: Tuple[int, ...] = (3, 3, 3, 3, 3, 3),
         encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         upsample_block_type: str = "pixel_shuffle",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 1a72aa3cfeb3..ffc8778e7aca 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -78,9 +78,9 @@ def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         act_fn: str = "silu",
         latent_channels: int = 4,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 5096b725d0bb..79433f7b9232 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -995,19 +995,19 @@ def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
         ),
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
         ),
-        block_out_channels: Tuple[int] = (128, 256, 256, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
         latent_channels: int = 16,
         layers_per_block: int = 3,
         act_fn: str = "silu",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
index 88b9bb507ff6..ddc0aed6b0ff 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
@@ -653,7 +653,7 @@ def __init__(
             "HunyuanVideoUpBlock3D",
             "HunyuanVideoUpBlock3D",
         ),
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: int = 2,
         act_fn: str = "silu",
         norm_num_groups: int = 32,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
index af40c7a6cbd0..2249063a9f00 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
@@ -601,7 +601,7 @@ def __init__(
         in_channels: int = 3,
         out_channels: int = 3,
         latent_channels: int = 32,
-        block_out_channels: Tuple[int] = (128, 256, 512, 1024, 1024),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024, 1024),
         layers_per_block: int = 2,
         spatial_compression_ratio: int = 16,
         temporal_compression_ratio: int = 4,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
index 3ded9a0a5491..7a64ac7de172 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
@@ -688,8 +688,8 @@ def __init__(
         self,
         in_channels: int = 15,
         out_channels: int = 3,
-        encoder_block_out_channels: Tuple[int] = (64, 128, 256, 384),
-        decoder_block_out_channels: Tuple[int] = (128, 256, 512, 768),
+        encoder_block_out_channels: Tuple[int, ...] = (64, 128, 256, 384),
+        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 768),
         latent_channels: int = 12,
         layers_per_block: Tuple[int, ...] = (3, 3, 4, 6, 3),
         act_fn: str = "silu",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
index 14db6aeb61e3..0aadbad9f4de 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
@@ -679,7 +679,7 @@ def __init__(
         self,
         base_dim: int = 96,
         z_dim: int = 16,
-        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        dim_mult: Tuple[int, ...] = (1, 2, 4, 4),
         num_res_blocks: int = 2,
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index ab76254d19e2..167fdc6bda92 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -31,7 +31,7 @@ def __init__(
         self,
         in_channels: int = 4,
         out_channels: int = 3,
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: int = 2,
     ):
         super().__init__()
@@ -172,8 +172,8 @@ def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         latent_channels: int = 4,
         sample_size: int = 32,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
index dae26f8086d5..94318f9c4f5a 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
@@ -971,7 +971,7 @@ def __init__(
         base_dim: int = 96,
         decoder_base_dim: Optional[int] = None,
         z_dim: int = 16,
-        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        dim_mult: Tuple[int, ...] = (1, 2, 4, 4),
         num_res_blocks: int = 2,
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
diff --git a/src/diffusers/models/controlnets/controlnet_xs.py b/src/diffusers/models/controlnets/controlnet_xs.py
index f5c69b9a46ad..a6c4a46a27b7 100644
--- a/src/diffusers/models/controlnets/controlnet_xs.py
+++ b/src/diffusers/models/controlnets/controlnet_xs.py
@@ -293,14 +293,14 @@ def __init__(
         self,
         conditioning_channels: int = 3,
         conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
         time_embedding_mix: float = 1.0,
         learn_time_embedding: bool = False,
         num_attention_heads: Union[int, Tuple[int]] = 4,
-        block_out_channels: Tuple[int] = (4, 8, 16, 16),
-        base_block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (4, 8, 16, 16),
+        base_block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         cross_attention_dim: int = 1024,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
@@ -436,7 +436,7 @@ def from_unet(
         time_embedding_mix: int = 1.0,
         conditioning_channels: int = 3,
         conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
     ):
         r"""
         Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].
@@ -529,14 +529,14 @@ def __init__(
         self,
         # unet configs
         sample_size: Optional[int] = 96,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         norm_num_groups: Optional[int] = 32,
         cross_attention_dim: Union[int, Tuple[int]] = 1024,
         transformer_layers_per_block: Union[int, Tuple[int]] = 1,
@@ -550,10 +550,10 @@ def __init__(
         # additional controlnet configs
         time_embedding_mix: float = 1.0,
         ctrl_conditioning_channels: int = 3,
-        ctrl_conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        ctrl_conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
         ctrl_conditioning_channel_order: str = "rgb",
         ctrl_learn_time_embedding: bool = False,
-        ctrl_block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        ctrl_block_out_channels: Tuple[int, ...] = (4, 8, 16, 16),
         ctrl_num_attention_heads: Union[int, Tuple[int]] = 4,
         ctrl_max_norm_num_groups: int = 32,
     ):
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index bc857ccab463..5ef6f1217b80 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -895,7 +895,7 @@ def __init__(
         text_embed_dim: int = 4096,
         pooled_projection_dim: int = 768,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
         image_condition_type: Optional[str] = None,
     ) -> None:
         super().__init__()
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
index 60b40fff3cb8..601ba0f0b472 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
@@ -139,7 +139,7 @@ def __init__(
         text_embed_dim: int = 4096,
         pooled_projection_dim: int = 768,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
         image_condition_type: Optional[str] = None,
         has_image_proj: int = False,
         image_proj_dim: int = 1152,
diff --git a/src/diffusers/models/transformers/transformer_hunyuanimage.py b/src/diffusers/models/transformers/transformer_hunyuanimage.py
index 7f37bf815b4b..bcca9a4cb978 100644
--- a/src/diffusers/models/transformers/transformer_hunyuanimage.py
+++ b/src/diffusers/models/transformers/transformer_hunyuanimage.py
@@ -689,7 +689,7 @@ def __init__(
         text_embed_dim: int = 3584,
         text_embed_2_dim: Optional[int] = None,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (64, 64),
+        rope_axes_dim: Tuple[int, ...] = (64, 64),
         use_meanflow: bool = False,
     ) -> None:
         super().__init__()
diff --git a/src/diffusers/models/transformers/transformer_skyreels_v2.py b/src/diffusers/models/transformers/transformer_skyreels_v2.py
index 6b600aa22487..62698b97cc8a 100644
--- a/src/diffusers/models/transformers/transformer_skyreels_v2.py
+++ b/src/diffusers/models/transformers/transformer_skyreels_v2.py
@@ -570,7 +570,7 @@ class SkyReelsV2Transformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 16,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
index dd75fb124f1a..d04d6707435f 100644
--- a/src/diffusers/models/transformers/transformer_wan.py
+++ b/src/diffusers/models/transformers/transformer_wan.py
@@ -560,7 +560,7 @@ class WanTransformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 40,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
index 30c38c244ad8..1be4f73e33e2 100644
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -182,7 +182,7 @@ class WanVACETransformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 40,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py
index 4c4c528a59ad..a027c553ed06 100644
--- a/src/diffusers/models/unets/unet_1d.py
+++ b/src/diffusers/models/unets/unet_1d.py
@@ -86,11 +86,11 @@ def __init__(
         flip_sin_to_cos: bool = True,
         use_timestep_embedding: bool = False,
         freq_shift: float = 0.0,
-        down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
-        up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
-        mid_block_type: Tuple[str] = "UNetMidBlock1D",
+        down_block_types: Tuple[str, ...] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+        up_block_types: Tuple[str, ...] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        mid_block_type: str = "UNetMidBlock1D",
         out_block_type: str = None,
-        block_out_channels: Tuple[int] = (32, 32, 64),
+        block_out_channels: Tuple[int, ...] = (32, 32, 64),
         act_fn: str = None,
         norm_num_groups: int = 8,
         layers_per_block: int = 1,
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index f04d3dfa0136..af29a50ecfdc 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -177,16 +177,16 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -486,10 +486,10 @@ def __init__(
 
     def _check_config(
         self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
+        down_block_types: Tuple[str, ...],
+        up_block_types: Tuple[str, ...],
         only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
+        block_out_channels: Tuple[int, ...],
         layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py
index 27241ce2e674..7c7948c4fdef 100644
--- a/src/diffusers/models/unets/unet_kandinsky3.py
+++ b/src/diffusers/models/unets/unet_kandinsky3.py
@@ -54,7 +54,7 @@ def __init__(
         groups: int = 32,
         attention_head_dim: int = 64,
         layers_per_block: Union[int, Tuple[int]] = 3,
-        block_out_channels: Tuple[int] = (384, 768, 1536, 3072),
+        block_out_channels: Tuple[int, ...] = (384, 768, 1536, 3072),
         cross_attention_dim: Union[int, Tuple[int]] = 4096,
         encoder_hid_dim: int = 4096,
     ):
diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
index 059a6e807c8e..63e9bee40917 100644
--- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py
+++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -73,25 +73,25 @@ def __init__(
         sample_size: Optional[int] = None,
         in_channels: int = 8,
         out_channels: int = 4,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlockSpatioTemporal",
             "CrossAttnDownBlockSpatioTemporal",
             "CrossAttnDownBlockSpatioTemporal",
             "DownBlockSpatioTemporal",
         ),
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "UpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
         ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         addition_time_embed_dim: int = 256,
         projection_class_embeddings_input_dim: int = 768,
         layers_per_block: Union[int, Tuple[int]] = 2,
         cross_attention_dim: Union[int, Tuple[int]] = 1024,
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
-        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
+        num_attention_heads: Union[int, Tuple[int, ...]] = (5, 10, 20, 20),
         num_frames: int = 25,
     ):
         super().__init__()
diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py
index e79ce8ac1da4..23d358c1bf51 100644
--- a/src/diffusers/models/unets/unet_stable_cascade.py
+++ b/src/diffusers/models/unets/unet_stable_cascade.py
@@ -145,10 +145,10 @@ def __init__(
         timestep_ratio_embedding_dim: int = 64,
         patch_size: int = 1,
         conditioning_dim: int = 2048,
-        block_out_channels: Tuple[int] = (2048, 2048),
-        num_attention_heads: Tuple[int] = (32, 32),
-        down_num_layers_per_block: Tuple[int] = (8, 24),
-        up_num_layers_per_block: Tuple[int] = (24, 8),
+        block_out_channels: Tuple[int, ...] = (2048, 2048),
+        num_attention_heads: Tuple[int, ...] = (32, 32),
+        down_num_layers_per_block: Tuple[int, ...] = (8, 24),
+        up_num_layers_per_block: Tuple[int, ...] = (24, 8),
         down_blocks_repeat_mappers: Optional[Tuple[int]] = (
             1,
             1,
@@ -167,7 +167,7 @@ def __init__(
         kernel_size=3,
         dropout: Union[float, Tuple[float]] = (0.1, 0.1),
         self_attn: Union[bool, Tuple[bool]] = True,
-        timestep_conditioning_type: Tuple[str] = ("sca", "crp"),
+        timestep_conditioning_type: Tuple[str, ...] = ("sca", "crp"),
         switch_level: Optional[Tuple[bool]] = None,
     ):
         """
diff --git a/src/diffusers/models/vae_flax.py b/src/diffusers/models/vae_flax.py
index 13653b90372a..5aad386a89e8 100644
--- a/src/diffusers/models/vae_flax.py
+++ b/src/diffusers/models/vae_flax.py
@@ -532,8 +532,8 @@ class FlaxEncoder(nn.Module):
 
     in_channels: int = 3
     out_channels: int = 3
-    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
-    block_out_channels: Tuple[int] = (64,)
+    down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 2
     norm_num_groups: int = 32
     act_fn: str = "silu"
@@ -650,8 +650,8 @@ class FlaxDecoder(nn.Module):
 
     in_channels: int = 3
    out_channels: int = 3
-    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
-    block_out_channels: int = (64,)
+    up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 2
     norm_num_groups: int = 32
     act_fn: str = "silu"
@@ -823,9 +823,9 @@ class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin):
 
     in_channels: int = 3
     out_channels: int = 3
-    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
-    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
-    block_out_channels: Tuple[int] = (64,)
+    down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",)
+    up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 1
     act_fn: str = "silu"
     latent_channels: int = 4
diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index b6b40cd6e616..1585dac5afed 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -245,16 +245,16 @@ def __init__(
         out_channels: int = 4,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
index 397fbc0d85b8..7c25713cd1d7 100644
--- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
@@ -374,21 +374,21 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlockFlat",
             "CrossAttnDownBlockFlat",
             "CrossAttnDownBlockFlat",
             "DownBlockFlat",
         ),
         mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn",
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "UpBlockFlat",
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
         ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py
index b268eae806a6..d1d05c894595 100644
--- a/src/diffusers/pipelines/shap_e/renderer.py
+++ b/src/diffusers/pipelines/shap_e/renderer.py
@@ -742,7 +742,7 @@ class ShapEParamsProjModel(ModelMixin, ConfigMixin):
     def __init__(
         self,
         *,
-        param_names: Tuple[str] = (
+        param_names: Tuple[str, ...] = (
             "nerstf.mlp.0.weight",
             "nerstf.mlp.1.weight",
             "nerstf.mlp.2.weight",
@@ -786,13 +786,13 @@ class ShapERenderer(ModelMixin, ConfigMixin):
     def __init__(
         self,
         *,
-        param_names: Tuple[str] = (
+        param_names: Tuple[str, ...] = (
             "nerstf.mlp.0.weight",
             "nerstf.mlp.1.weight",
             "nerstf.mlp.2.weight",
             "nerstf.mlp.3.weight",
         ),
-        param_shapes: Tuple[Tuple[int]] = (
+        param_shapes: Tuple[Tuple[int, int], ...] = (
             (256, 93),
             (256, 256),
             (256, 256),
@@ -804,7 +804,7 @@ def __init__(
         n_hidden_layers: int = 6,
         act_fn: str = "swish",
         insert_direction_at: int = 4,
-        background: Tuple[float] = (
+        background: Tuple[float, ...] = (
             255.0,
             255.0,
             255.0,