autoencoder test

a-r-r-o-w · a-r-r-o-w · commit d0c61e074d11 · 2024-12-12T23:49:23.000+01:00
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
@@ -240,6 +240,8 @@ def __init__(
         self.attentions = nn.ModuleList(attentions)
         self.resnets = nn.ModuleList(resnets)
 
+        self.gradient_checkpointing = False
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if torch.is_grad_enabled() and self.gradient_checkpointing:
 
@@ -336,6 +338,8 @@ def __init__(
         else:
             self.downsamplers = None
 
+        self.gradient_checkpointing = False
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if torch.is_grad_enabled() and self.gradient_checkpointing:
 
@@ -410,6 +414,8 @@ def __init__(
         else:
             self.upsamplers = None
 
+        self.gradient_checkpointing = False
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if torch.is_grad_enabled() and self.gradient_checkpointing:
 
@@ -440,7 +446,7 @@ def custom_forward(*inputs):
         return hidden_states
 
 
-class EncoderCausal3D(nn.Module):
+class HunyuanVideoEncoder3D(nn.Module):
     r"""
     Causal encoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
     """
@@ -564,7 +570,7 @@ def custom_forward(*inputs):
         return hidden_states
 
 
-class DecoderCausal3D(nn.Module):
+class HunyuanVideoDecoder3D(nn.Module):
     r"""
     Causal decoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
     """
@@ -730,7 +736,7 @@ def __init__(
 
         self.time_compression_ratio = temporal_compression_ratio
 
-        self.encoder = EncoderCausal3D(
+        self.encoder = HunyuanVideoEncoder3D(
             in_channels=in_channels,
             out_channels=latent_channels,
             down_block_types=down_block_types,
@@ -744,7 +750,7 @@ def __init__(
             spatial_compression_ratio=spatial_compression_ratio,
         )
 
-        self.decoder = DecoderCausal3D(
+        self.decoder = HunyuanVideoDecoder3D(
             in_channels=latent_channels,
             out_channels=out_channels,
             up_block_types=up_block_types,
@@ -789,7 +795,7 @@ def __init__(
         self.tile_sample_stride_width = 192
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
+        if isinstance(module, (HunyuanVideoEncoder3D, HunyuanVideoDecoder3D)):
             module.gradient_checkpointing = value
 
     def enable_tiling(
@@ -1151,7 +1157,5 @@ def forward(
             z = posterior.sample(generator=generator)
         else:
             z = posterior.mode()
-        dec = self.decode(z)
-        if not return_dict:
-            return (dec,)
+        dec = self.decode(z, return_dict=return_dict)
         return dec
diff --git a/tests/models/autoencoders/test_models_autoencoder_hunyuan_video.py b/tests/models/autoencoders/test_models_autoencoder_hunyuan_video.py
@@ -0,0 +1,159 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import AutoencoderKLHunyuanVideo
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+
+enable_full_determinism()
+
+
+class AutoencoderKLHunyuanVideoTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = AutoencoderKLHunyuanVideo
+    main_input_name = "sample"
+    base_precision = 1e-2
+
+    def get_autoencoder_kl_hunyuan_video_config(self):
+        return {
+            "in_channels": 3,
+            "out_channels": 3,
+            "latent_channels": 4,
+            "down_block_types": (
+                "HunyuanVideoDownBlock3D",
+                "HunyuanVideoDownBlock3D",
+            ),
+            "up_block_types": (
+                "HunyuanVideoUpBlock3D",
+                "HunyuanVideoUpBlock3D",
+            ),
+            "block_out_channels": (8, 8, 8, 8),
+            "layers_per_block": 1,
+            "act_fn": "silu",
+            "norm_num_groups": 4,
+            "scaling_factor": 0.476986,
+            "spatial_compression_ratio": 8,
+            "temporal_compression_ratio": 4,
+            "mid_block_add_attention": True,
+        }
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
+        num_frames = 9
+        num_channels = 3
+        sizes = (16, 16)
+
+        image = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
+
+        return {"sample": image}
+
+    @property
+    def input_shape(self):
+        return (3, 9, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (3, 9, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = self.get_autoencoder_kl_hunyuan_video_config()
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_enable_disable_tiling(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict).to(torch_device)
+
+        inputs_dict.update({"return_dict": False})
+
+        torch.manual_seed(0)
+        output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        torch.manual_seed(0)
+        model.enable_tiling()
+        output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertLess(
+            (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(),
+            0.5,
+            "VAE tiling should not affect the inference results",
+        )
+
+        torch.manual_seed(0)
+        model.disable_tiling()
+        output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertEqual(
+            output_without_tiling.detach().cpu().numpy().all(),
+            output_without_tiling_2.detach().cpu().numpy().all(),
+            "Without tiling outputs should match with the outputs when tiling is manually disabled.",
+        )
+
+    def test_enable_disable_slicing(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict).to(torch_device)
+
+        inputs_dict.update({"return_dict": False})
+
+        torch.manual_seed(0)
+        output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        torch.manual_seed(0)
+        model.enable_slicing()
+        output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertLess(
+            (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(),
+            0.5,
+            "VAE slicing should not affect the inference results",
+        )
+
+        torch.manual_seed(0)
+        model.disable_slicing()
+        output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertEqual(
+            output_without_slicing.detach().cpu().numpy().all(),
+            output_without_slicing_2.detach().cpu().numpy().all(),
+            "Without slicing outputs should match with the outputs when slicing is manually disabled.",
+        )
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {
+            "HunyuanVideoDecoder3D",
+            "HunyuanVideoDownBlock3D",
+            "HunyuanVideoEncoder3D",
+            "HunyuanVideoMidBlock3D",
+            "HunyuanVideoUpBlock3D",
+        }
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @unittest.skip("Unsupported test.")
+    def test_outputs_equivalence(self):
+        pass