
Commit 08a61f6

Merge branch 'main' into kandinsky2_2-xpu
2 parents 2b60562 + 01abfc8 commit 08a61f6

14 files changed: 277 additions & 70 deletions

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 8 additions & 0 deletions
@@ -1704,3 +1704,11 @@ def get_alpha_scales(down_weight, key):
         converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)

     return converted_state_dict
+
+
+def _convert_non_diffusers_hidream_lora_to_diffusers(state_dict, non_diffusers_prefix="diffusion_model"):
+    if not all(k.startswith(non_diffusers_prefix) for k in state_dict):
+        raise ValueError("Invalid LoRA state dict for HiDream.")
+    converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict
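
The new helper strips the non-diffusers prefix and re-roots every key under transformer.. Applied to a hypothetical two-key checkpoint (key names are illustrative, not taken from a real HiDream LoRA):

import torch

# hypothetical non-diffusers HiDream LoRA keys (illustrative only)
state_dict = {
    "diffusion_model.double_stream_blocks.0.attn.to_q.lora_A.weight": torch.zeros(4, 16),
    "diffusion_model.double_stream_blocks.0.attn.to_q.lora_B.weight": torch.zeros(16, 4),
}

converted = {k.removeprefix("diffusion_model."): v for k, v in state_dict.items()}
converted = {f"transformer.{k}": v for k, v in converted.items()}
print(list(converted)[0])  # transformer.double_stream_blocks.0.attn.to_q.lora_A.weight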

src/diffusers/loaders/lora_pipeline.py

Lines changed: 5 additions & 1 deletion
@@ -43,6 +43,7 @@
     _convert_hunyuan_video_lora_to_diffusers,
     _convert_kohya_flux_lora_to_diffusers,
     _convert_musubi_wan_lora_to_diffusers,
+    _convert_non_diffusers_hidream_lora_to_diffusers,
     _convert_non_diffusers_lora_to_diffusers,
     _convert_non_diffusers_lumina2_lora_to_diffusers,
     _convert_non_diffusers_wan_lora_to_diffusers,

@@ -5371,7 +5372,6 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin):

     @classmethod
     @validate_hf_hub_args
-    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.lora_state_dict
     def lora_state_dict(
         cls,
         pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],

@@ -5465,6 +5465,10 @@ def lora_state_dict(
             logger.warning(warn_msg)
             state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}

+        is_non_diffusers_format = any("diffusion_model" in k for k in state_dict)
+        if is_non_diffusers_format:
+            state_dict = _convert_non_diffusers_hidream_lora_to_diffusers(state_dict)
+
         return state_dict

     # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
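
The # Copied from marker is dropped because the method now diverges from the CogVideoX original. With this hook in place, lora_state_dict transparently converts checkpoints whose keys carry the diffusion_model prefix. A hedged sketch of calling the classmethod with an in-memory dict, which its signature permits (the single key is illustrative):

import torch

from diffusers.loaders.lora_pipeline import HiDreamImageLoraLoaderMixin

# illustrative one-key checkpoint; real LoRAs carry many more keys
raw = {"diffusion_model.blocks.0.attn.lora_A.weight": torch.zeros(4, 16)}
state_dict = HiDreamImageLoraLoaderMixin.lora_state_dict(raw)
print(list(state_dict))  # ['transformer.blocks.0.attn.lora_A.weight']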

src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py

Lines changed: 11 additions & 8 deletions
@@ -152,9 +152,19 @@ def __init__(

         # 1. Latent and condition embedders
         self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
+
+        # Framepack history projection embedder
+        self.clean_x_embedder = None
+        if has_clean_x_embedder:
+            self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
+
         self.context_embedder = HunyuanVideoTokenRefiner(
             text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
         )
+
+        # Framepack image-conditioning embedder
+        self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
+
         self.time_text_embed = HunyuanVideoConditionEmbedding(
             inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
         )

@@ -186,14 +196,7 @@ def __init__(
         self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
         self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)

-        # Framepack specific modules
-        self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
-
-        self.clean_x_embedder = None
-        if has_clean_x_embedder:
-            self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
-
-        self.use_gradient_checkpointing = False
+        self.gradient_checkpointing = False

     def forward(
         self,
src/diffusers/pipelines/ltx/pipeline_ltx.py

Lines changed: 1 addition & 0 deletions
@@ -789,6 +789,7 @@ def __call__(
                 ]
                 latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise

+            latents = latents.to(self.vae.dtype)
            video = self.vae.decode(latents, timestep, return_dict=False)[0]
            video = self.video_processor.postprocess_video(video, output_type=output_type)

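The added cast guards against a dtype mismatch when the denoising loop runs in a different precision than the VAE (for example, an fp16 transformer with an fp32 VAE). A standalone illustration of the failure mode, using a stand-in module rather than the LTX VAE:

import torch
import torch.nn as nn

vae_like = nn.Conv2d(4, 3, kernel_size=1)  # fp32 weights standing in for the VAE
latents = torch.randn(1, 4, 8, 8, dtype=torch.float16)

# vae_like(latents) would raise an "Input type ... and weight type ... should be the same" error;
# casting first, as the pipeline now does, avoids it
frames = vae_like(latents.to(vae_like.weight.dtype))
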
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import HunyuanVideoFramepackTransformer3DModel
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = HunyuanVideoFramepackTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+    model_split_percents = [0.5, 0.7, 0.9]
+
+    @property
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 4
+        num_frames = 3
+        height = 4
+        width = 4
+        text_encoder_embedding_dim = 16
+        image_encoder_embedding_dim = 16
+        pooled_projection_dim = 8
+        sequence_length = 12
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
+        pooled_projections = torch.randn((batch_size, pooled_projection_dim)).to(torch_device)
+        encoder_attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device)
+        image_embeds = torch.randn((batch_size, sequence_length, image_encoder_embedding_dim)).to(torch_device)
+        indices_latents = torch.ones((3,)).to(torch_device)
+        latents_clean = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device)
+        indices_latents_clean = torch.ones((num_frames - 1,)).to(torch_device)
+        latents_history_2x = torch.randn((batch_size, num_channels, num_frames - 1, height, width)).to(torch_device)
+        indices_latents_history_2x = torch.ones((num_frames - 1,)).to(torch_device)
+        latents_history_4x = torch.randn((batch_size, num_channels, (num_frames - 1) * 4, height, width)).to(
+            torch_device
+        )
+        indices_latents_history_4x = torch.ones(((num_frames - 1) * 4,)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        guidance = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "pooled_projections": pooled_projections,
+            "encoder_attention_mask": encoder_attention_mask,
+            "guidance": guidance,
+            "image_embeds": image_embeds,
+            "indices_latents": indices_latents,
+            "latents_clean": latents_clean,
+            "indices_latents_clean": indices_latents_clean,
+            "latents_history_2x": latents_history_2x,
+            "indices_latents_history_2x": indices_latents_history_2x,
+            "latents_history_4x": latents_history_4x,
+            "indices_latents_history_4x": indices_latents_history_4x,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 3, 4, 4)
+
+    @property
+    def output_shape(self):
+        return (4, 3, 4, 4)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 4,
+            "out_channels": 4,
+            "num_attention_heads": 2,
+            "attention_head_dim": 10,
+            "num_layers": 1,
+            "num_single_layers": 1,
+            "num_refiner_layers": 1,
+            "patch_size": 2,
+            "patch_size_t": 1,
+            "guidance_embeds": True,
+            "text_embed_dim": 16,
+            "pooled_projection_dim": 8,
+            "rope_axes_dim": (2, 4, 4),
+            "image_condition_type": None,
+            "has_image_proj": True,
+            "image_proj_dim": 16,
+            "has_clean_x_embedder": True,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"HunyuanVideoFramepackTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
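
As a quick sanity check, the tiny configuration exercised by prepare_init_args_and_inputs_for_common can be instantiated directly (assuming a diffusers build that ships this model class):

from diffusers import HunyuanVideoFramepackTransformer3DModel

model = HunyuanVideoFramepackTransformer3DModel(
    in_channels=4,
    out_channels=4,
    num_attention_heads=2,
    attention_head_dim=10,
    num_layers=1,
    num_single_layers=1,
    num_refiner_layers=1,
    patch_size=2,
    patch_size_t=1,
    guidance_embeds=True,
    text_embed_dim=16,
    pooled_projection_dim=8,
    rope_axes_dim=(2, 4, 4),
    image_condition_type=None,
    has_image_proj=True,
    image_proj_dim=16,
    has_clean_x_embedder=True,
)
print(sum(p.numel() for p in model.parameters()))  # parameter count of the toy config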

tests/pipelines/consisid/test_consisid.py

Lines changed: 8 additions & 7 deletions
@@ -24,9 +24,10 @@
 from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )

@@ -316,19 +317,19 @@ def test_vae_tiling(self, expected_diff_max: float = 0.4):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class ConsisIDPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_consisid(self):
         generator = torch.Generator("cpu").manual_seed(0)

@@ -338,8 +339,8 @@ def test_consisid(self):

         prompt = self.prompt
         image = load_image("https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true")
-        id_vit_hidden = [torch.ones([1, 2, 2])] * 1
-        id_cond = torch.ones(1, 2)
+        id_vit_hidden = [torch.ones([1, 577, 1024])] * 5
+        id_cond = torch.ones(1, 1280)

         videos = pipe(
             image=image,

@@ -357,5 +358,5 @@ def test_consisid(self):
         video = videos[0]
         expected_video = torch.randn(1, 16, 480, 720, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
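
The recurring swap of torch.cuda.empty_cache() for backend_empty_cache(torch_device) is what lets these integration suites run on non-CUDA accelerators such as XPU, the target of this merge. A simplified sketch of the dispatch idea; the real testing_utils helper covers more backends:

import torch

def backend_empty_cache_sketch(device: str) -> None:
    # simplified dispatch; diffusers' helper also handles other accelerators
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()  # requires a torch build with XPU support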

tests/pipelines/dit/test_dit.py

Lines changed: 18 additions & 7 deletions
@@ -21,7 +21,15 @@

 from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DiTTransformer2DModel, DPMSolverMultistepScheduler
 from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    load_numpy,
+    nightly,
+    numpy_cosine_similarity_distance,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..pipeline_params import (
     CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,

@@ -107,23 +115,23 @@ def test_xformers_attention_forwardGenerator_pass(self):


 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class DiTPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_dit_256(self):
         generator = torch.manual_seed(0)

         pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256")
-        pipe.to("cuda")
+        pipe.to(torch_device)

         words = ["vase", "umbrella", "white shark", "white wolf"]
         ids = pipe.get_label_ids(words)

@@ -139,7 +147,7 @@ def test_dit_256(self):
     def test_dit_512(self):
         pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
         pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
+        pipe.to(torch_device)

         words = ["vase", "umbrella"]
         ids = pipe.get_label_ids(words)

@@ -152,4 +160,7 @@ def test_dit_512(self):
             f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}_512.npy"
         )

-        assert np.abs((expected_image - image).max()) < 1e-1
+        expected_slice = expected_image.flatten()
+        output_slice = image.flatten()
+
+        assert numpy_cosine_similarity_distance(expected_slice, output_slice) < 1e-2
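
The rewritten assertion compares the direction of the flattened outputs rather than the worst-case pixel difference, which is less brittle across devices. In sketch form, a cosine-distance check computes the following (the actual testing_utils helper may differ in detail):

import numpy as np

def cosine_distance_sketch(a: np.ndarray, b: np.ndarray) -> float:
    a = a.flatten().astype(np.float64)
    b = b.flatten().astype(np.float64)
    # 1 - cosine similarity: 0.0 for identical direction, up to 2.0 for opposite
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))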

tests/pipelines/easyanimate/test_easyanimate.py

Lines changed: 5 additions & 4 deletions
@@ -27,9 +27,10 @@
     FlowMatchEulerDiscreteScheduler,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )

@@ -256,19 +257,19 @@ def test_encode_prompt_works_in_isolation(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class EasyAnimatePipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_EasyAnimate(self):
         generator = torch.Generator("cpu").manual_seed(0)

tests/pipelines/mochi/test_mochi.py

Lines changed: 6 additions & 6 deletions
@@ -27,8 +27,8 @@
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
-    require_torch_gpu,
+    require_big_accelerator,
+    require_torch_accelerator,
     torch_device,
 )

@@ -266,9 +266,9 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):


 @nightly
-@require_torch_gpu
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_torch_accelerator
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

@@ -302,5 +302,5 @@ def test_mochi(self):
         video = videos[0]
         expected_video = torch.randn(1, 19, 480, 848, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
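
The video.cpu() added in the last hunk (mirrored in the ConsisID test above) is needed because NumPy-based comparisons require host tensors; .numpy() raises on an accelerator tensor. In miniature:

import torch

t = torch.randn(3)  # imagine this living on an accelerator device
arr = t.cpu().numpy()  # .numpy() only works on CPU tensors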
