From c6ad097592bef70188f7c8093a98f020817341b5 Mon Sep 17 00:00:00 2001 From: Junyu Chen Date: Mon, 18 Aug 2025 10:04:23 +0800 Subject: [PATCH 1/2] minor modification to support dc-ae-turbo --- .../models/autoencoders/autoencoder_dc.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py index 9c7d6360e06e..5004f3d6c50f 100644 --- a/src/diffusers/models/autoencoders/autoencoder_dc.py +++ b/src/diffusers/models/autoencoders/autoencoder_dc.py @@ -299,6 +299,7 @@ def __init__( act_fn: Union[str, Tuple[str]] = "silu", upsample_block_type: str = "pixel_shuffle", in_shortcut: bool = True, + conv_act_fn: str = "relu", ): super().__init__() @@ -349,7 +350,7 @@ def __init__( channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1] self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True) - self.conv_act = nn.ReLU() + self.conv_act = get_activation(conv_act_fn) self.conv_out = None if layers_per_block[0] > 0: @@ -414,6 +415,12 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin): The normalization type(s) to use in the decoder. decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`): The activation function(s) to use in the decoder. + encoder_out_shortcut (`bool`, defaults to `True`): + Whether to use shortcut at the end of the encoder. + decoder_in_shortcut (`bool`, defaults to `True`): + Whether to use shortcut at the beginning of the encoder. + decoder_conv_act_fn (`str`, defaults to `"relu"`): + The activation function to use at the end of the decoder. scaling_factor (`float`, defaults to `1.0`): The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent space to have unit variance when training the diffusion model. The latents are scaled with the formula `z = @@ -441,6 +448,9 @@ def __init__( downsample_block_type: str = "pixel_unshuffle", decoder_norm_types: Union[str, Tuple[str]] = "rms_norm", decoder_act_fns: Union[str, Tuple[str]] = "silu", + encoder_out_shortcut: bool = True, + decoder_in_shortcut: bool = True, + decoder_conv_act_fn: str = "relu", scaling_factor: float = 1.0, ) -> None: super().__init__() @@ -454,6 +464,7 @@ def __init__( layers_per_block=encoder_layers_per_block, qkv_multiscales=encoder_qkv_multiscales, downsample_block_type=downsample_block_type, + out_shortcut=encoder_out_shortcut, ) self.decoder = Decoder( in_channels=in_channels, @@ -466,6 +477,8 @@ def __init__( norm_type=decoder_norm_types, act_fn=decoder_act_fns, upsample_block_type=upsample_block_type, + in_shortcut=decoder_in_shortcut, + conv_act_fn=decoder_conv_act_fn, ) self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1) From 58592bdac6664faf798db3a99bd4f164c92313ab Mon Sep 17 00:00:00 2001 From: Junyu Chen Date: Mon, 18 Aug 2025 10:30:03 +0800 Subject: [PATCH 2/2] minor --- src/diffusers/models/autoencoders/autoencoder_dc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py index 5004f3d6c50f..d3f31de8546b 100644 --- a/src/diffusers/models/autoencoders/autoencoder_dc.py +++ b/src/diffusers/models/autoencoders/autoencoder_dc.py @@ -418,7 +418,7 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin): encoder_out_shortcut (`bool`, defaults to `True`): Whether to use shortcut at the end of the encoder. decoder_in_shortcut (`bool`, defaults to `True`): - Whether to use shortcut at the beginning of the encoder. + Whether to use shortcut at the beginning of the decoder. decoder_conv_act_fn (`str`, defaults to `"relu"`): The activation function to use at the end of the decoder. scaling_factor (`float`, defaults to `1.0`):