
Commit af24bea

update
1 parent 5da0839 commit af24bea

3 files changed: +62 −21 lines changed

scripts/convert_hunyuan_video_to_diffusers.py

Lines changed: 22 additions & 1 deletion
@@ -160,8 +160,9 @@ def remap_single_transformer_blocks_(key, state_dict):
         "pooled_projection_dim": 768,
         "rope_theta": 256.0,
         "rope_axes_dim": (16, 56, 56),
+        "image_condition_type": None,
     },
-    "HYVideo-T/2-I2V": {
+    "HYVideo-T/2-I2V-33ch": {
         "in_channels": 16 * 2 + 1,
         "out_channels": 16,
         "num_attention_heads": 24,
@@ -178,6 +179,26 @@ def remap_single_transformer_blocks_(key, state_dict):
         "pooled_projection_dim": 768,
         "rope_theta": 256.0,
         "rope_axes_dim": (16, 56, 56),
+        "image_condition_type": "latent_concat",
+    },
+    "HYVideo-T/2-I2V-16ch": {
+        "in_channels": 16,
+        "out_channels": 16,
+        "num_attention_heads": 24,
+        "attention_head_dim": 128,
+        "num_layers": 20,
+        "num_single_layers": 40,
+        "num_refiner_layers": 2,
+        "mlp_ratio": 4.0,
+        "patch_size": 2,
+        "patch_size_t": 1,
+        "qk_norm": "rms_norm",
+        "guidance_embeds": True,
+        "text_embed_dim": 4096,
+        "pooled_projection_dim": 768,
+        "rope_theta": 256.0,
+        "rope_axes_dim": (16, 56, 56),
+        "image_condition_type": "token_replace",
     },
 }

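The two I2V entries map one-to-one onto the two conditioning modes this commit introduces, and their in_channels values follow directly from them. A minimal sketch of that arithmetic is below; the splat into the constructor shown in the final comment is an assumption about the surrounding conversion code, included only for orientation (it is commented out because it would build the full multi-billion-parameter module).

latent_channels = 16

# "HYVideo-T/2-I2V-33ch" uses image_condition_type="latent_concat": video latents,
# image latents, and a binary mask are stacked along the channel axis, hence 16 * 2 + 1.
in_channels_latent_concat = latent_channels * 2 + 1  # 33

# "HYVideo-T/2-I2V-16ch" uses image_condition_type="token_replace": the clean image
# latent replaces the first temporal frame instead of adding channels, so 16 suffices.
in_channels_token_replace = latent_channels  # 16

# Each config dict mirrors the HunyuanVideoTransformer3DModel constructor arguments,
# so it can be passed straight through, e.g.:
# transformer = HunyuanVideoTransformer3DModel(**TRANSFORMER_CONFIGS["HYVideo-T/2-I2V-16ch"])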
src/diffusers/models/transformers/transformer_hunyuan_video.py

Lines changed: 23 additions & 16 deletions
@@ -228,8 +228,8 @@ def forward(
         )
 
 
-class HunyuanVideoTimestepTextProjEmbeddings(nn.Module):
-    def __init__(self, embedding_dim: int, pooled_projection_dim: int, image_condition_type: Optional[str] = None):
+class HunyuanVideoConditionEmbedding(nn.Module):
+    def __init__(self, embedding_dim: int, pooled_projection_dim: int, guidance_embeds: bool, image_condition_type: Optional[str] = None):
         super().__init__()
 
         self.image_condition_type = image_condition_type
@@ -238,7 +238,11 @@ def __init__(self, embedding_dim: int, pooled_projection_dim: int, image_conditi
         self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
         self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
 
-    def forward(self, timestep: torch.Tensor, pooled_projection: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        self.guidance_embedder = None
+        if guidance_embeds:
+            self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+    def forward(self, timestep: torch.Tensor, pooled_projection: torch.Tensor, guidance: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
         timesteps_proj = self.time_proj(timestep)
         timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)
         pooled_projections = self.text_embedder(pooled_projection)
@@ -248,8 +252,13 @@ def forward(self, timestep: torch.Tensor, pooled_projection: torch.Tensor) -> Tu
         if self.image_condition_type == "token_replace":
             token_replace_timestep = torch.zeros_like(timestep)
             token_replace_proj = self.time_proj(token_replace_timestep)
-            token_replace_emb = self.timestep_embedder(token_replace_proj)
-            token_replace_emb = token_replace_emb + conditioning
+            token_replace_emb = self.timestep_embedder(token_replace_proj.to(dtype=pooled_projection.dtype))
+            token_replace_emb = token_replace_emb + pooled_projections
+
+        if self.guidance_embedder is not None:
+            guidance_proj = self.time_proj(guidance)
+            guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
+            conditioning = conditioning + guidance_emb
 
         return conditioning, token_replace_emb
 
@@ -665,7 +674,7 @@ def forward(
 
         hidden_states_zero = norm_hidden_states[:, :num_tokens] * (1 + tr_scale_mlp[:, None]) + tr_shift_mlp[:, None]
         hidden_states_orig = norm_hidden_states[:, num_tokens:] * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-        hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
+        norm_hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
         norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
 
         # 4. Feed-forward
@@ -717,6 +726,10 @@ class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
             The value of theta to use in the RoPE layer.
         rope_axes_dim (`Tuple[int]`, defaults to `(16, 56, 56)`):
             The dimensions of the axes to use in the RoPE layer.
+        image_condition_type (`str`, *optional*, defaults to `None`):
+            The type of image conditioning to use. If `None`, no image conditioning is used. If `latent_concat`, the
+            image is concatenated to the latent stream. If `token_replace`, the image is used to replace first-frame
+            tokens in the latent stream and apply conditioning.
     """
 
     _supports_gradient_checkpointing = True
@@ -761,12 +774,9 @@ def __init__(
             text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
         )
 
-        if guidance_embeds:
-            self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
-        else:
-            self.time_text_embed = HunyuanVideoTimestepTextProjEmbeddings(
-                inner_dim, pooled_projection_dim, image_condition_type
-            )
+        self.time_text_embed = HunyuanVideoConditionEmbedding(
+            inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
+        )
 
         # 2. RoPE
         self.rope = HunyuanVideoRotaryPosEmbed(patch_size, patch_size_t, rope_axes_dim, rope_theta)
@@ -904,10 +914,7 @@ def forward(
         image_rotary_emb = self.rope(hidden_states)
 
         # 2. Conditional embeddings
-        if self.config.guidance_embeds:
-            temb = self.time_text_embed(timestep, guidance, pooled_projections)
-        else:
-            temb, token_replace_emb = self.time_text_embed(timestep, pooled_projections)
+        temb, token_replace_emb = self.time_text_embed(timestep, pooled_projections, guidance)
 
         hidden_states = self.x_embedder(hidden_states)
         encoder_hidden_states = self.context_embedder(encoder_hidden_states, timestep, encoder_attention_mask)

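To see how the pieces of the new HunyuanVideoConditionEmbedding fit together, here is a simplified, self-contained sketch of the same conditioning path. It assumes the standard Timesteps / TimestepEmbedding / PixArtAlphaTextProjection building blocks from diffusers.models.embeddings with the usual sinusoidal-projection settings; it illustrates the merged logic and is not the library code verbatim.

from typing import Optional, Tuple

import torch
import torch.nn as nn
from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps


class ConditionEmbeddingSketch(nn.Module):
    # Simplified mirror of the merged timestep / pooled-text / guidance conditioning path.
    def __init__(self, embedding_dim, pooled_projection_dim, guidance_embeds, image_condition_type=None):
        super().__init__()
        self.image_condition_type = image_condition_type
        # Sinusoidal projection hyperparameters assumed to match the diffusers defaults.
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
        self.guidance_embedder = TimestepEmbedding(256, embedding_dim) if guidance_embeds else None

    def forward(self, timestep, pooled_projection, guidance=None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        dtype = pooled_projection.dtype
        pooled = self.text_embedder(pooled_projection)
        conditioning = self.timestep_embedder(self.time_proj(timestep).to(dtype)) + pooled

        token_replace_emb = None
        if self.image_condition_type == "token_replace":
            # First-frame tokens are embedded as if their timestep were 0: they stay clean.
            zeros = torch.zeros_like(timestep)
            token_replace_emb = self.timestep_embedder(self.time_proj(zeros).to(dtype)) + pooled

        if self.guidance_embedder is not None:
            # Distilled-guidance embedding; guidance is a per-sample scalar (scale * 1000).
            conditioning = conditioning + self.guidance_embedder(self.time_proj(guidance).to(dtype))

        return conditioning, token_replace_emb

Because forward always returns a (conditioning, token_replace_emb) pair and folds the optional guidance embedding in itself, the transformer no longer needs to branch on guidance_embeds, which is what allows the unconditional self.time_text_embed(timestep, pooled_projections, guidance) call in the forward hunk above.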
src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py

Lines changed: 17 additions & 4 deletions
@@ -54,6 +54,7 @@
         >>> from diffusers import HunyuanVideoImageToVideoPipeline, HunyuanVideoTransformer3DModel
         >>> from diffusers.utils import load_image, export_to_video
 
+        >>> # Available checkpoints: hunyuanvideo-community/HunyuanVideo-I2V, hunyuanvideo-community/HunyuanVideo-I2V-33ch
         >>> model_id = "hunyuanvideo-community/HunyuanVideo-I2V"
         >>> transformer = HunyuanVideoTransformer3DModel.from_pretrained(
         ...     model_id, subfolder="transformer", torch_dtype=torch.bfloat16
@@ -69,7 +70,12 @@
         ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
         ... )
 
-        >>> output = pipe(image=image, prompt=prompt).frames[0]
+        >>> # If using hunyuanvideo-community/HunyuanVideo-I2V
+        >>> output = pipe(image=image, prompt=prompt, guidance_scale=6.0).frames[0]
+
+        >>> # If using hunyuanvideo-community/HunyuanVideo-I2V-33ch
+        >>> output = pipe(image=image, prompt=prompt, guidance_scale=1.0, true_cfg_scale=1.0).frames[0]
+
         >>> export_to_video(output, "output.mp4", fps=15)
         ```
     """
@@ -804,10 +810,15 @@ def __call__(
             negative_prompt_attention_mask = negative_prompt_attention_mask.to(transformer_dtype)
             negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(transformer_dtype)
 
-        # 4. Prepare timesteps
+        # 5. Prepare timesteps
        sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
 
+        # 6. Prepare guidance condition
+        guidance = None
+        if self.transformer.config.guidance_embeds:
+            guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
+
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         self._num_timesteps = len(timesteps)
@@ -824,14 +835,15 @@ def __call__(
                 if image_condition_type == "latent_concat":
                     latent_model_input = torch.cat([latents, image_latents, mask], dim=1).to(transformer_dtype)
                 elif image_condition_type == "token_replace":
-                    latent_model_input = latents.to(transformer_dtype)
+                    latent_model_input = torch.cat([image_latents, latents[:, :, 1:]], dim=2).to(transformer_dtype)
 
                 noise_pred = self.transformer(
                     hidden_states=latent_model_input,
                     timestep=timestep,
                     encoder_hidden_states=prompt_embeds,
                     encoder_attention_mask=prompt_attention_mask,
                     pooled_projections=pooled_prompt_embeds,
+                    guidance=guidance,
                     attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
@@ -843,6 +855,7 @@ def __call__(
                         encoder_hidden_states=negative_prompt_embeds,
                         encoder_attention_mask=negative_prompt_attention_mask,
                         pooled_projections=negative_pooled_prompt_embeds,
+                        guidance=guidance,
                         attention_kwargs=attention_kwargs,
                         return_dict=False,
                     )[0]
@@ -855,7 +868,7 @@ def __call__(
                     latents = latents = self.scheduler.step(
                         noise_pred[:, :, 1:], t, latents[:, :, 1:], return_dict=False
                     )[0]
-                    latents = torch.cat([image_latents, latents])
+                    latents = torch.cat([image_latents, latents], dim=2)
 
                 if callback_on_step_end is not None:
                     callback_kwargs = {}

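For orientation, a minimal sketch of one denoising step under token_replace conditioning as the loop above now performs it. The helper name and the argument bundling are invented for the example, but the tensor layout matches the pipeline: latents are (batch, channels, frames, height, width), so dim=2 is the frame axis, and the previously missing dim= in the final concatenation (which defaulted to the batch axis) is exactly what this commit corrects.

import torch

def token_replace_step(transformer, scheduler, latents, image_latents, t, timestep, cond_kwargs):
    # latents: (B, C, F, H, W); image_latents: clean first-frame latent, (B, C, 1, H, W).
    # Re-insert the clean image latent as the first temporal frame before the forward pass.
    latent_model_input = torch.cat([image_latents, latents[:, :, 1:]], dim=2)

    noise_pred = transformer(
        hidden_states=latent_model_input, timestep=timestep, return_dict=False, **cond_kwargs
    )[0]

    # Only frames 1..F-1 are denoised; the image frame is re-attached unchanged afterwards.
    denoised_rest = scheduler.step(noise_pred[:, :, 1:], t, latents[:, :, 1:], return_dict=False)[0]
    return torch.cat([image_latents, denoised_rest], dim=2)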