From 27940290ce608840419a568efee0b6d57da30b3e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 8 May 2025 09:14:12 +0530 Subject: [PATCH 1/5] start. --- ...els_transformer_hunyuan_video_framepack.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py diff --git a/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py new file mode 100644 index 000000000000..743ad392f2ab --- /dev/null +++ b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py @@ -0,0 +1,344 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import torch + +from diffusers import HunyuanVideoTransformer3DModel +from diffusers.utils.testing_utils import ( + enable_full_determinism, + is_torch_compile, + require_torch_2, + require_torch_gpu, + slow, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 4 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 + + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + "guidance": guidance, + } + + @property + def input_shape(self): + return (4, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 4, + "out_channels": 4, + "num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": True, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": None, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + 
super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @require_torch_gpu + @require_torch_2 + @is_torch_compile + @slow + def test_torch_compile_recompilation_and_graph_break(self): + torch._dynamo.reset() + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict).to(torch_device) + model = torch.compile(model, fullgraph=True) + + with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + _ = model(**inputs_dict) + _ = model(**inputs_dict) + + +class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 8 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 + + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + "guidance": guidance, + } + + @property + def input_shape(self): + return (8, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 8, + "out_channels": 4, + "num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": True, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": None, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_output(self): + super().test_output(expected_output_shape=(1, *self.output_shape)) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @require_torch_gpu + @require_torch_2 + @is_torch_compile + @slow + def test_torch_compile_recompilation_and_graph_break(self): + torch._dynamo.reset() + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict).to(torch_device) + model = torch.compile(model, fullgraph=True) + + with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + _ = model(**inputs_dict) + _ = model(**inputs_dict) + + +class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 2 * 4 + 1 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 
+ + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + } + + @property + def input_shape(self): + return (8, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 2 * 4 + 1, + "out_channels": 4, + "num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": False, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": "latent_concat", + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_output(self): + super().test_output(expected_output_shape=(1, *self.output_shape)) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @require_torch_gpu + @require_torch_2 + @is_torch_compile + @slow + def test_torch_compile_recompilation_and_graph_break(self): + torch._dynamo.reset() + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict).to(torch_device) + model = torch.compile(model, fullgraph=True) + + with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + _ = model(**inputs_dict) + _ = model(**inputs_dict) + + +class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 2 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 + + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + "guidance": guidance, + } + + @property + def input_shape(self): + return (8, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 2, + "out_channels": 4, + 
"num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": True, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": "token_replace", + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_output(self): + super().test_output(expected_output_shape=(1, *self.output_shape)) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) \ No newline at end of file From da07d8611d475429885addf18d43fe7e142fb4dd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 8 May 2025 10:20:16 +0530 Subject: [PATCH 2/5] add tests for framepack transformer model. --- .../transformer_hunyuan_video_framepack.py | 2 +- ...els_transformer_hunyuan_video_framepack.py | 296 ++---------------- 2 files changed, 35 insertions(+), 263 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index 58b811569403..0331d9934a59 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -193,7 +193,7 @@ def __init__( if has_clean_x_embedder: self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim) - self.use_gradient_checkpointing = False + self.gradient_checkpointing = False def forward( self, diff --git a/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py index 743ad392f2ab..5f485b210f2d 100644 --- a/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py +++ b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py @@ -16,13 +16,9 @@ import torch -from diffusers import HunyuanVideoTransformer3DModel +from diffusers import HunyuanVideoFramepackTransformer3DModel from diffusers.utils.testing_utils import ( enable_full_determinism, - is_torch_compile, - require_torch_2, - require_torch_gpu, - slow, torch_device, ) @@ -33,107 +29,39 @@ class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel + model_class = HunyuanVideoFramepackTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True + model_split_percents = [0.5, 0.7, 0.9] @property def dummy_input(self): batch_size = 1 num_channels = 4 - num_frames = 1 - height = 16 - width = 16 + num_frames = 3 + height = 4 + width = 4 text_encoder_embedding_dim = 16 + image_encoder_embedding_dim = 16 pooled_projection_dim = 8 sequence_length = 12 hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) - timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) - - return { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": 
encoder_hidden_states, - "pooled_projections": pooled_projections, - "encoder_attention_mask": encoder_attention_mask, - "guidance": guidance, - } - - @property - def input_shape(self): - return (4, 1, 16, 16) - - @property - def output_shape(self): - return (4, 1, 16, 16) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 4, - "out_channels": 4, - "num_attention_heads": 2, - "attention_head_dim": 10, - "num_layers": 1, - "num_single_layers": 1, - "num_refiner_layers": 1, - "patch_size": 1, - "patch_size_t": 1, - "guidance_embeds": True, - "text_embed_dim": 16, - "pooled_projection_dim": 8, - "rope_axes_dim": (2, 4, 4), - "image_condition_type": None, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel - main_input_name = "hidden_states" - uses_custom_attn_processor = True - - @property - def dummy_input(self): - batch_size = 1 - num_channels = 8 - num_frames = 1 - height = 16 - width = 16 - text_encoder_embedding_dim = 16 - pooled_projection_dim = 8 - sequence_length = 12 - - hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + image_embeds = torch.randn((batch_size, sequence_length, image_encoder_embedding_dim)).to(torch_device) + indices_latents = torch.ones((3,)).to(torch_device) + latents_clean = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device) + indices_latents_clean = torch.ones((num_frames - 1,)).to(torch_device) + latents_history_2x = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device) + indices_latents_history_2x = torch.ones((num_frames - 1,)).to(torch_device) + latents_history_4x = torch.randn((batch_size, num_channels, (num_frames - 1) * 4, height, width)).to( + torch_device + ) + indices_latents_history_4x = torch.ones(((num_frames - 1) * 4,)).to(torch_device) timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) - encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) - pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) - encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) return { "hidden_states": hidden_states, @@ -142,203 +70,47 @@ def dummy_input(self): "pooled_projections": pooled_projections, "encoder_attention_mask": encoder_attention_mask, "guidance": guidance, + "image_embeds": image_embeds, + "indices_latents": indices_latents, + "latents_clean": latents_clean, + "indices_latents_clean": indices_latents_clean, 
+ "latents_history_2x": latents_history_2x, + "indices_latents_history_2x": indices_latents_history_2x, + "latents_history_4x": latents_history_4x, + "indices_latents_history_4x": indices_latents_history_4x, } @property def input_shape(self): - return (8, 1, 16, 16) + return (4, 3, 4, 4) @property def output_shape(self): - return (4, 1, 16, 16) + return (4, 3, 4, 4) def prepare_init_args_and_inputs_for_common(self): init_dict = { - "in_channels": 8, + "in_channels": 4, "out_channels": 4, "num_attention_heads": 2, "attention_head_dim": 10, "num_layers": 1, "num_single_layers": 1, "num_refiner_layers": 1, - "patch_size": 1, + "patch_size": 2, "patch_size_t": 1, "guidance_embeds": True, "text_embed_dim": 16, "pooled_projection_dim": 8, "rope_axes_dim": (2, 4, 4), "image_condition_type": None, + "has_image_proj": True, + "image_proj_dim": 16, + "has_clean_x_embedder": True, } inputs_dict = self.dummy_input return init_dict, inputs_dict - def test_output(self): - super().test_output(expected_output_shape=(1, *self.output_shape)) - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel - main_input_name = "hidden_states" - uses_custom_attn_processor = True - - @property - def dummy_input(self): - batch_size = 1 - num_channels = 2 * 4 + 1 - num_frames = 1 - height = 16 - width = 16 - text_encoder_embedding_dim = 16 - pooled_projection_dim = 8 - sequence_length = 12 - - hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) - timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) - encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) - pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) - encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - - return { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": encoder_hidden_states, - "pooled_projections": pooled_projections, - "encoder_attention_mask": encoder_attention_mask, - } - - @property - def input_shape(self): - return (8, 1, 16, 16) - - @property - def output_shape(self): - return (4, 1, 16, 16) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 2 * 4 + 1, - "out_channels": 4, - "num_attention_heads": 2, - "attention_head_dim": 10, - "num_layers": 1, - "num_single_layers": 1, - "num_refiner_layers": 1, - "patch_size": 1, - "patch_size_t": 1, - "guidance_embeds": False, - "text_embed_dim": 16, - "pooled_projection_dim": 8, - "rope_axes_dim": (2, 4, 4), - "image_condition_type": "latent_concat", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - super().test_output(expected_output_shape=(1, *self.output_shape)) - def 
test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} + expected_set = {"HunyuanVideoFramepackTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel - main_input_name = "hidden_states" - uses_custom_attn_processor = True - - @property - def dummy_input(self): - batch_size = 1 - num_channels = 2 - num_frames = 1 - height = 16 - width = 16 - text_encoder_embedding_dim = 16 - pooled_projection_dim = 8 - sequence_length = 12 - - hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) - timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) - encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) - pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) - encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) - - return { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": encoder_hidden_states, - "pooled_projections": pooled_projections, - "encoder_attention_mask": encoder_attention_mask, - "guidance": guidance, - } - - @property - def input_shape(self): - return (8, 1, 16, 16) - - @property - def output_shape(self): - return (4, 1, 16, 16) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 2, - "out_channels": 4, - "num_attention_heads": 2, - "attention_head_dim": 10, - "num_layers": 1, - "num_single_layers": 1, - "num_refiner_layers": 1, - "patch_size": 1, - "patch_size_t": 1, - "guidance_embeds": True, - "text_embed_dim": 16, - "pooled_projection_dim": 8, - "rope_axes_dim": (2, 4, 4), - "image_condition_type": "token_replace", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - super().test_output(expected_output_shape=(1, *self.output_shape)) - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) \ No newline at end of file From 76357aef59261b0b012e864eee57af0f1b09d1a5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 10 May 2025 18:43:18 +0530 Subject: [PATCH 3/5] merge conflicts. --- .../transformer_hunyuan_video_framepack.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index 0331d9934a59..c2eb7fd2a705 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -152,9 +152,19 @@ def __init__( # 1. 
Latent and condition embedders self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim) + + # Framepack history projection embedder + self.clean_x_embedder = None + if has_clean_x_embedder: + self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim) + self.context_embedder = HunyuanVideoTokenRefiner( text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers ) + + # Framepack image-conditioning embedder + self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None + self.time_text_embed = HunyuanVideoConditionEmbedding( inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type ) @@ -186,13 +196,6 @@ def __init__( self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels) - # Framepack specific modules - self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None - - self.clean_x_embedder = None - if has_clean_x_embedder: - self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim) - self.gradient_checkpointing = False def forward( From d52d97b0e6bbe2ce2fe0d2c5971e6839a1c7e3fc Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 10 May 2025 18:44:24 +0530 Subject: [PATCH 4/5] make to square. --- .../models/transformers/transformer_hunyuan_video_framepack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index c2eb7fd2a705..349c0f797978 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -196,7 +196,7 @@ def __init__( self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels) - self.gradient_checkpointing = False + self.use_gradient_checkpointing = False def forward( self, From b9ba05f2bc729339b4204d618f6f90f8d3feeb3b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 10 May 2025 18:44:53 +0530 Subject: [PATCH 5/5] fixes --- .../models/transformers/transformer_hunyuan_video_framepack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index 349c0f797978..c2eb7fd2a705 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -196,7 +196,7 @@ def __init__( self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels) - self.use_gradient_checkpointing = False + self.gradient_checkpointing = False def forward( self,
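
A minimal, self-contained sketch of what the new Framepack test exercises, included here as an illustration rather than as part of the patch series: it instantiates the tiny HunyuanVideoFramepackTransformer3DModel with the exact init_dict from prepare_init_args_and_inputs_for_common above and runs one forward pass with the dummy inputs from dummy_input. All argument names, config values, and tensor shapes are copied from the test file added in this series; the snippet itself is an assumption about how those pieces fit together and is not part of the diffusers test suite.

import torch

from diffusers import HunyuanVideoFramepackTransformer3DModel

# Tiny config, copied from the test's prepare_init_args_and_inputs_for_common.
model = HunyuanVideoFramepackTransformer3DModel(
    in_channels=4,
    out_channels=4,
    num_attention_heads=2,
    attention_head_dim=10,
    num_layers=1,
    num_single_layers=1,
    num_refiner_layers=1,
    patch_size=2,
    patch_size_t=1,
    guidance_embeds=True,
    text_embed_dim=16,
    pooled_projection_dim=8,
    rope_axes_dim=(2, 4, 4),
    image_condition_type=None,
    has_image_proj=True,
    image_proj_dim=16,
    has_clean_x_embedder=True,
)

# Dummy inputs, mirroring the test's dummy_input property (CPU tensors here
# instead of torch_device; the shapes and dtypes follow the test).
batch_size, num_channels, num_frames, height, width = 1, 4, 3, 4, 4
sequence_length = 12
inputs = {
    "hidden_states": torch.randn(batch_size, num_channels, num_frames, height, width),
    "timestep": torch.randint(0, 1000, (batch_size,)),
    "encoder_hidden_states": torch.randn(batch_size, sequence_length, 16),
    "pooled_projections": torch.randn(batch_size, 8),
    "encoder_attention_mask": torch.ones(batch_size, sequence_length),
    "guidance": torch.randint(0, 1000, (batch_size,)),
    "image_embeds": torch.randn(batch_size, sequence_length, 16),
    "indices_latents": torch.ones(num_frames),
    "latents_clean": torch.randn(batch_size, num_channels, num_frames - 1, height, width),
    "indices_latents_clean": torch.ones(num_frames - 1),
    "latents_history_2x": torch.randn(batch_size, num_channels, num_frames - 1, height, width),
    "indices_latents_history_2x": torch.ones(num_frames - 1),
    "latents_history_4x": torch.randn(batch_size, num_channels, (num_frames - 1) * 4, height, width),
    "indices_latents_history_4x": torch.ones((num_frames - 1) * 4),
}

with torch.no_grad():
    output = model(**inputs)

# Per the test's output_shape property, each sample is expected to come back as
# a (4, 3, 4, 4) latent, i.e. (1, 4, 3, 4, 4) with batch_size=1.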