From 27940290ce608840419a568efee0b6d57da30b3e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 8 May 2025 09:14:12 +0530 Subject: [PATCH 1/5] start. --- ...els_transformer_hunyuan_video_framepack.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py diff --git a/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py new file mode 100644 index 000000000000..743ad392f2ab --- /dev/null +++ b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py @@ -0,0 +1,344 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import torch + +from diffusers import HunyuanVideoTransformer3DModel +from diffusers.utils.testing_utils import ( + enable_full_determinism, + is_torch_compile, + require_torch_2, + require_torch_gpu, + slow, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 4 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 + + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + "guidance": guidance, + } + + @property + def input_shape(self): + return (4, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 4, + "out_channels": 4, + "num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": True, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": None, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + 
super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @require_torch_gpu + @require_torch_2 + @is_torch_compile + @slow + def test_torch_compile_recompilation_and_graph_break(self): + torch._dynamo.reset() + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict).to(torch_device) + model = torch.compile(model, fullgraph=True) + + with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + _ = model(**inputs_dict) + _ = model(**inputs_dict) + + +class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 8 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 + + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + "guidance": guidance, + } + + @property + def input_shape(self): + return (8, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 8, + "out_channels": 4, + "num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": True, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": None, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_output(self): + super().test_output(expected_output_shape=(1, *self.output_shape)) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @require_torch_gpu + @require_torch_2 + @is_torch_compile + @slow + def test_torch_compile_recompilation_and_graph_break(self): + torch._dynamo.reset() + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict).to(torch_device) + model = torch.compile(model, fullgraph=True) + + with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + _ = model(**inputs_dict) + _ = model(**inputs_dict) + + +class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 2 * 4 + 1 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 
+ + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + } + + @property + def input_shape(self): + return (8, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 2 * 4 + 1, + "out_channels": 4, + "num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": False, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": "latent_concat", + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_output(self): + super().test_output(expected_output_shape=(1, *self.output_shape)) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @require_torch_gpu + @require_torch_2 + @is_torch_compile + @slow + def test_torch_compile_recompilation_and_graph_break(self): + torch._dynamo.reset() + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict).to(torch_device) + model = torch.compile(model, fullgraph=True) + + with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + _ = model(**inputs_dict) + _ = model(**inputs_dict) + + +class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): + model_class = HunyuanVideoTransformer3DModel + main_input_name = "hidden_states" + uses_custom_attn_processor = True + + @property + def dummy_input(self): + batch_size = 1 + num_channels = 2 + num_frames = 1 + height = 16 + width = 16 + text_encoder_embedding_dim = 16 + pooled_projection_dim = 8 + sequence_length = 12 + + hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) + pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) + encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "encoder_attention_mask": encoder_attention_mask, + "guidance": guidance, + } + + @property + def input_shape(self): + return (8, 1, 16, 16) + + @property + def output_shape(self): + return (4, 1, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 2, + "out_channels": 4, + 
"num_attention_heads": 2, + "attention_head_dim": 10, + "num_layers": 1, + "num_single_layers": 1, + "num_refiner_layers": 1, + "patch_size": 1, + "patch_size_t": 1, + "guidance_embeds": True, + "text_embed_dim": 16, + "pooled_projection_dim": 8, + "rope_axes_dim": (2, 4, 4), + "image_condition_type": "token_replace", + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_output(self): + super().test_output(expected_output_shape=(1, *self.output_shape)) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"HunyuanVideoTransformer3DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) \ No newline at end of file From da07d8611d475429885addf18d43fe7e142fb4dd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 8 May 2025 10:20:16 +0530 Subject: [PATCH 2/5] add tests for framepack transformer model. --- .../transformer_hunyuan_video_framepack.py | 2 +- ...els_transformer_hunyuan_video_framepack.py | 296 ++---------------- 2 files changed, 35 insertions(+), 263 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index 58b811569403..0331d9934a59 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -193,7 +193,7 @@ def __init__( if has_clean_x_embedder: self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim) - self.use_gradient_checkpointing = False + self.gradient_checkpointing = False def forward( self, diff --git a/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py index 743ad392f2ab..5f485b210f2d 100644 --- a/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py +++ b/tests/models/transformers/test_models_transformer_hunyuan_video_framepack.py @@ -16,13 +16,9 @@ import torch -from diffusers import HunyuanVideoTransformer3DModel +from diffusers import HunyuanVideoFramepackTransformer3DModel from diffusers.utils.testing_utils import ( enable_full_determinism, - is_torch_compile, - require_torch_2, - require_torch_gpu, - slow, torch_device, ) @@ -33,107 +29,39 @@ class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel + model_class = HunyuanVideoFramepackTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True + model_split_percents = [0.5, 0.7, 0.9] @property def dummy_input(self): batch_size = 1 num_channels = 4 - num_frames = 1 - height = 16 - width = 16 + num_frames = 3 + height = 4 + width = 4 text_encoder_embedding_dim = 16 + image_encoder_embedding_dim = 16 pooled_projection_dim = 8 sequence_length = 12 hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) - timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) - - return { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": 
encoder_hidden_states, - "pooled_projections": pooled_projections, - "encoder_attention_mask": encoder_attention_mask, - "guidance": guidance, - } - - @property - def input_shape(self): - return (4, 1, 16, 16) - - @property - def output_shape(self): - return (4, 1, 16, 16) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 4, - "out_channels": 4, - "num_attention_heads": 2, - "attention_head_dim": 10, - "num_layers": 1, - "num_single_layers": 1, - "num_refiner_layers": 1, - "patch_size": 1, - "patch_size_t": 1, - "guidance_embeds": True, - "text_embed_dim": 16, - "pooled_projection_dim": 8, - "rope_axes_dim": (2, 4, 4), - "image_condition_type": None, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel - main_input_name = "hidden_states" - uses_custom_attn_processor = True - - @property - def dummy_input(self): - batch_size = 1 - num_channels = 8 - num_frames = 1 - height = 16 - width = 16 - text_encoder_embedding_dim = 16 - pooled_projection_dim = 8 - sequence_length = 12 - - hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) + image_embeds = torch.randn((batch_size, sequence_length, image_encoder_embedding_dim)).to(torch_device) + indices_latents = torch.ones((3,)).to(torch_device) + latents_clean = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device) + indices_latents_clean = torch.ones((num_frames - 1,)).to(torch_device) + latents_history_2x = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device) + indices_latents_history_2x = torch.ones((num_frames - 1,)).to(torch_device) + latents_history_4x = torch.randn((batch_size, num_channels, (num_frames - 1) * 4, height, width)).to( + torch_device + ) + indices_latents_history_4x = torch.ones(((num_frames - 1) * 4,)).to(torch_device) timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) - encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) - pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) - encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) + guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) return { "hidden_states": hidden_states, @@ -142,203 +70,47 @@ def dummy_input(self): "pooled_projections": pooled_projections, "encoder_attention_mask": encoder_attention_mask, "guidance": guidance, + "image_embeds": image_embeds, + "indices_latents": indices_latents, + "latents_clean": latents_clean, + "indices_latents_clean": indices_latents_clean, 
+ "latents_history_2x": latents_history_2x, + "indices_latents_history_2x": indices_latents_history_2x, + "latents_history_4x": latents_history_4x, + "indices_latents_history_4x": indices_latents_history_4x, } @property def input_shape(self): - return (8, 1, 16, 16) + return (4, 3, 4, 4) @property def output_shape(self): - return (4, 1, 16, 16) + return (4, 3, 4, 4) def prepare_init_args_and_inputs_for_common(self): init_dict = { - "in_channels": 8, + "in_channels": 4, "out_channels": 4, "num_attention_heads": 2, "attention_head_dim": 10, "num_layers": 1, "num_single_layers": 1, "num_refiner_layers": 1, - "patch_size": 1, + "patch_size": 2, "patch_size_t": 1, "guidance_embeds": True, "text_embed_dim": 16, "pooled_projection_dim": 8, "rope_axes_dim": (2, 4, 4), "image_condition_type": None, + "has_image_proj": True, + "image_proj_dim": 16, + "has_clean_x_embedder": True, } inputs_dict = self.dummy_input return init_dict, inputs_dict - def test_output(self): - super().test_output(expected_output_shape=(1, *self.output_shape)) - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel - main_input_name = "hidden_states" - uses_custom_attn_processor = True - - @property - def dummy_input(self): - batch_size = 1 - num_channels = 2 * 4 + 1 - num_frames = 1 - height = 16 - width = 16 - text_encoder_embedding_dim = 16 - pooled_projection_dim = 8 - sequence_length = 12 - - hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) - timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) - encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) - pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) - encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - - return { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": encoder_hidden_states, - "pooled_projections": pooled_projections, - "encoder_attention_mask": encoder_attention_mask, - } - - @property - def input_shape(self): - return (8, 1, 16, 16) - - @property - def output_shape(self): - return (4, 1, 16, 16) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 2 * 4 + 1, - "out_channels": 4, - "num_attention_heads": 2, - "attention_head_dim": 10, - "num_layers": 1, - "num_single_layers": 1, - "num_refiner_layers": 1, - "patch_size": 1, - "patch_size_t": 1, - "guidance_embeds": False, - "text_embed_dim": 16, - "pooled_projection_dim": 8, - "rope_axes_dim": (2, 4, 4), - "image_condition_type": "latent_concat", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - super().test_output(expected_output_shape=(1, *self.output_shape)) - def 
test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} + expected_set = {"HunyuanVideoFramepackTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): - model_class = HunyuanVideoTransformer3DModel - main_input_name = "hidden_states" - uses_custom_attn_processor = True - - @property - def dummy_input(self): - batch_size = 1 - num_channels = 2 - num_frames = 1 - height = 16 - width = 16 - text_encoder_embedding_dim = 16 - pooled_projection_dim = 8 - sequence_length = 12 - - hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device) - timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) - encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device) - pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device) - encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device) - guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device, dtype=torch.float32) - - return { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": encoder_hidden_states, - "pooled_projections": pooled_projections, - "encoder_attention_mask": encoder_attention_mask, - "guidance": guidance, - } - - @property - def input_shape(self): - return (8, 1, 16, 16) - - @property - def output_shape(self): - return (4, 1, 16, 16) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 2, - "out_channels": 4, - "num_attention_heads": 2, - "attention_head_dim": 10, - "num_layers": 1, - "num_single_layers": 1, - "num_refiner_layers": 1, - "patch_size": 1, - "patch_size_t": 1, - "guidance_embeds": True, - "text_embed_dim": 16, - "pooled_projection_dim": 8, - "rope_axes_dim": (2, 4, 4), - "image_condition_type": "token_replace", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - super().test_output(expected_output_shape=(1, *self.output_shape)) - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"HunyuanVideoTransformer3DModel"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) \ No newline at end of file From 76357aef59261b0b012e864eee57af0f1b09d1a5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 10 May 2025 18:43:18 +0530 Subject: [PATCH 3/5] merge conflicts. --- .../transformer_hunyuan_video_framepack.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index 0331d9934a59..c2eb7fd2a705 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -152,9 +152,19 @@ def __init__( # 1. 
Latent and condition embedders self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim) + + # Framepack history projection embedder + self.clean_x_embedder = None + if has_clean_x_embedder: + self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim) + self.context_embedder = HunyuanVideoTokenRefiner( text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers ) + + # Framepack image-conditioning embedder + self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None + self.time_text_embed = HunyuanVideoConditionEmbedding( inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type ) @@ -186,13 +196,6 @@ def __init__( self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels) - # Framepack specific modules - self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None - - self.clean_x_embedder = None - if has_clean_x_embedder: - self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim) - self.gradient_checkpointing = False def forward( From d52d97b0e6bbe2ce2fe0d2c5971e6839a1c7e3fc Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 10 May 2025 18:44:24 +0530 Subject: [PATCH 4/5] make to square. --- .../models/transformers/transformer_hunyuan_video_framepack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index c2eb7fd2a705..349c0f797978 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -196,7 +196,7 @@ def __init__( self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels) - self.gradient_checkpointing = False + self.use_gradient_checkpointing = False def forward( self, From b9ba05f2bc729339b4204d618f6f90f8d3feeb3b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 10 May 2025 18:44:53 +0530 Subject: [PATCH 5/5] fixes --- .../models/transformers/transformer_hunyuan_video_framepack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py index 349c0f797978..c2eb7fd2a705 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py @@ -196,7 +196,7 @@ def __init__( self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels) - self.use_gradient_checkpointing = False + self.gradient_checkpointing = False def forward( self,
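
A minimal, self-contained sketch of what the new Framepack test exercises, included here as an illustration rather than as part of the patch series: it instantiates the tiny HunyuanVideoFramepackTransformer3DModel with the exact init_dict from prepare_init_args_and_inputs_for_common above and runs one forward pass with the dummy inputs from dummy_input. All argument names, config values, and tensor shapes are copied from the test file added in this series; the snippet itself is an assumption about how those pieces fit together and is not part of the diffusers test suite.

import torch

from diffusers import HunyuanVideoFramepackTransformer3DModel

# Tiny config, copied from the test's prepare_init_args_and_inputs_for_common.
model = HunyuanVideoFramepackTransformer3DModel(
    in_channels=4,
    out_channels=4,
    num_attention_heads=2,
    attention_head_dim=10,
    num_layers=1,
    num_single_layers=1,
    num_refiner_layers=1,
    patch_size=2,
    patch_size_t=1,
    guidance_embeds=True,
    text_embed_dim=16,
    pooled_projection_dim=8,
    rope_axes_dim=(2, 4, 4),
    image_condition_type=None,
    has_image_proj=True,
    image_proj_dim=16,
    has_clean_x_embedder=True,
)

# Dummy inputs, mirroring the test's dummy_input property (CPU tensors here
# instead of torch_device; the shapes and dtypes follow the test).
batch_size, num_channels, num_frames, height, width = 1, 4, 3, 4, 4
sequence_length = 12
inputs = {
    "hidden_states": torch.randn(batch_size, num_channels, num_frames, height, width),
    "timestep": torch.randint(0, 1000, (batch_size,)),
    "encoder_hidden_states": torch.randn(batch_size, sequence_length, 16),
    "pooled_projections": torch.randn(batch_size, 8),
    "encoder_attention_mask": torch.ones(batch_size, sequence_length),
    "guidance": torch.randint(0, 1000, (batch_size,)),
    "image_embeds": torch.randn(batch_size, sequence_length, 16),
    "indices_latents": torch.ones(num_frames),
    "latents_clean": torch.randn(batch_size, num_channels, num_frames - 1, height, width),
    "indices_latents_clean": torch.ones(num_frames - 1),
    "latents_history_2x": torch.randn(batch_size, num_channels, num_frames - 1, height, width),
    "indices_latents_history_2x": torch.ones(num_frames - 1),
    "latents_history_4x": torch.randn(batch_size, num_channels, (num_frames - 1) * 4, height, width),
    "indices_latents_history_4x": torch.ones((num_frames - 1) * 4),
}

with torch.no_grad():
    output = model(**inputs)

# Per the test's output_shape property, each sample is expected to come back as
# a (4, 3, 4, 4) latent, i.e. (1, 4, 3, 4, 4) with batch_size=1.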