2424from ..attention_processor import SanaMultiscaleLinearAttention
2525from ..modeling_utils import ModelMixin
2626from ..normalization import RMSNorm , get_normalization
27+ from .vae import DecoderOutput
2728
2829
2930class GLUMBConv (nn .Module ):
@@ -90,8 +91,8 @@ class EfficientViTBlock(nn.Module):
9091 def __init__ (
9192 self ,
9293 in_channels : int ,
93- heads_ratio : float = 1.0 ,
94- dim : int = 32 ,
94+ mult : float = 1.0 ,
95+ attention_head_dim : int = 32 ,
9596 qkv_multiscales : Tuple [int , ...] = (5 ,),
9697 norm_type : str = "batch_norm" ,
9798 ) -> None :
@@ -100,8 +101,8 @@ def __init__(
100101 self .attn = SanaMultiscaleLinearAttention (
101102 in_channels = in_channels ,
102103 out_channels = in_channels ,
103- heads_ratio = heads_ratio ,
104- attention_head_dim = dim ,
104+ mult = mult ,
105+ attention_head_dim = attention_head_dim ,
105106 norm_type = norm_type ,
106107 kernel_sizes = qkv_multiscales ,
107108 residual_connection = True ,
@@ -122,6 +123,7 @@ def get_block(
122123 block_type : str ,
123124 in_channels : int ,
124125 out_channels : int ,
126+ attention_head_dim : int ,
125127 norm_type : str ,
126128 act_fn : str ,
127129 qkv_mutliscales : Tuple [int ] = (),
@@ -130,7 +132,9 @@ def get_block(
130132 block = ResBlock (in_channels , out_channels , norm_type , act_fn )
131133
132134 elif block_type == "EfficientViTBlock" :
133- block = EfficientViTBlock (in_channels , norm_type = norm_type , qkv_multiscales = qkv_mutliscales )
135+ block = EfficientViTBlock (
136+ in_channels , attention_head_dim = attention_head_dim , norm_type = norm_type , qkv_multiscales = qkv_mutliscales
137+ )
134138
135139 else :
136140 raise ValueError (f"Block with { block_type = } is not supported." )
@@ -224,6 +228,7 @@ def __init__(
224228 self ,
225229 in_channels : int ,
226230 latent_channels : int ,
231+ attention_head_dim : int = 32 ,
227232 block_type : Union [str , Tuple [str ]] = "ResBlock" ,
228233 block_out_channels : Tuple [int ] = (128 , 256 , 512 , 512 , 1024 , 1024 ),
229234 layers_per_block : Tuple [int ] = (2 , 2 , 2 , 2 , 2 , 2 ),
@@ -262,6 +267,7 @@ def __init__(
262267 block_type [i ],
263268 out_channel ,
264269 out_channel ,
270+ attention_head_dim = attention_head_dim ,
265271 norm_type = "rms_norm" ,
266272 act_fn = "silu" ,
267273 qkv_mutliscales = qkv_multiscales [i ],
@@ -305,6 +311,7 @@ def __init__(
305311 self ,
306312 in_channels : int ,
307313 latent_channels : int ,
314+ attention_head_dim : int = 32 ,
308315 block_type : Union [str , Tuple [str ]] = "ResBlock" ,
309316 block_out_channels : Tuple [int ] = (128 , 256 , 512 , 512 , 1024 , 1024 ),
310317 layers_per_block : Tuple [int ] = (2 , 2 , 2 , 2 , 2 , 2 ),
@@ -348,6 +355,7 @@ def __init__(
348355 block_type [i ],
349356 out_channel ,
350357 out_channel ,
358+ attention_head_dim = attention_head_dim ,
351359 norm_type = norm_type [i ],
352360 act_fn = act_fn [i ],
353361 qkv_mutliscales = qkv_multiscales [i ],
@@ -425,13 +433,14 @@ class AutoencoderDC(ModelMixin, ConfigMixin):
425433 A scaling factor applied during model operations.
426434 """
427435
428- _supports_gradient_checkpointing = True
436+ _supports_gradient_checkpointing = False
429437
430438 @register_to_config
431439 def __init__ (
432440 self ,
433441 in_channels : int = 3 ,
434442 latent_channels : int = 32 ,
443+ attention_head_dim : int = 32 ,
435444 encoder_block_types : Union [str , Tuple [str ]] = "ResBlock" ,
436445 decoder_block_types : Union [str , Tuple [str ]] = "ResBlock" ,
437446 encoder_block_out_channels : Tuple [int , ...] = (128 , 256 , 512 , 512 , 1024 , 1024 ),
@@ -451,6 +460,7 @@ def __init__(
451460 self .encoder = Encoder (
452461 in_channels = in_channels ,
453462 latent_channels = latent_channels ,
463+ attention_head_dim = attention_head_dim ,
454464 block_type = encoder_block_types ,
455465 block_out_channels = encoder_block_out_channels ,
456466 layers_per_block = encoder_layers_per_block ,
@@ -460,6 +470,7 @@ def __init__(
460470 self .decoder = Decoder (
461471 in_channels = in_channels ,
462472 latent_channels = latent_channels ,
473+ attention_head_dim = attention_head_dim ,
463474 block_type = decoder_block_types ,
464475 block_out_channels = decoder_block_out_channels ,
465476 layers_per_block = decoder_layers_per_block ,
@@ -480,7 +491,9 @@ def decode(self, x: torch.Tensor) -> torch.Tensor:
480491 x = self .decoder (x )
481492 return x
482493
483- def forward (self , x : torch .Tensor ) -> torch .Tensor :
484- x = self .encoder (x )
485- x = self .decoder (x )
486- return x
494+ def forward (self , sample : torch .Tensor , return_dict : bool = True ) -> torch .Tensor :
495+ z = self .encode (sample )
496+ dec = self .decode (z )
497+ if not return_dict :
498+ return (dec ,)
499+ return DecoderOutput (sample = dec )