
Commit b6c7ae0

convert

1 parent 16778b1 commit b6c7ae0

File tree: 2 files changed, +105 −122 lines changed

scripts/convert_hunyuan_video_to_diffusers.py

Lines changed: 30 additions & 20 deletions
@@ -5,7 +5,7 @@
 from accelerate import init_empty_weights
 from transformers import AutoModel, AutoTokenizer, CLIPTextModel, CLIPTokenizer
 
-from diffusers import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
+from diffusers import AutoencoderKLHunyuanVideo, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
 
 
 def remap_norm_scale_shift_(key, state_dict):
@@ -15,6 +15,23 @@ def remap_norm_scale_shift_(key, state_dict):
     state_dict[key.replace("final_layer.adaLN_modulation.1", "norm_out.linear")] = new_weight
 
 
+def remap_token_refiner_blocks_(key, state_dict):
+    def rename_key(key):
+        new_key = key.replace("individual_token_refiner.blocks", "token_refiner.refiner_blocks")
+        new_key = new_key.replace("adaLN_modulation.1", "norm_out.linear")
+        return new_key
+
+    if "self_attn_qkv" in key:
+        weight = state_dict.pop(key)
+        to_q, to_k, to_v = weight.chunk(3, dim=0)
+        state_dict[rename_key(key.replace("self_attn_qkv", "attn.to_q"))] = to_q
+        state_dict[rename_key(key.replace("self_attn_qkv", "attn.to_k"))] = to_k
+        state_dict[rename_key(key.replace("self_attn_qkv", "attn.to_v"))] = to_v
+    else:
+        state_dict[rename_key(key)] = state_dict.pop(key)
+
+
 def remap_img_attn_qkv_(key, state_dict):
     weight = state_dict.pop(key)
     to_q, to_k, to_v = weight.chunk(3, dim=0)
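For reference, the fused-QKV split these helpers perform is just a chunk along the output dimension; a minimal toy sketch (the sizes here are illustrative, not the model's real 3072 hidden size):

    import torch

    hidden_size = 8
    # The original checkpoint stores q, k, v stacked into one (3 * hidden, hidden) weight.
    fused_qkv = torch.randn(3 * hidden_size, hidden_size)

    # chunk(3, dim=0) recovers the three projection weights in q, k, v order.
    to_q, to_k, to_v = fused_qkv.chunk(3, dim=0)
    assert to_q.shape == (hidden_size, hidden_size)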
@@ -31,14 +48,6 @@ def remap_txt_attn_qkv_(key, state_dict):
     state_dict[key.replace("txt_attn_qkv", "attn.add_v_proj")] = to_v
 
 
-def remap_self_attn_qkv_(key, state_dict):
-    weight = state_dict.pop(key)
-    to_q, to_k, to_v = weight.chunk(3, dim=0)
-    state_dict[key.replace("self_attn_qkv", "attn.to_q")] = to_q
-    state_dict[key.replace("self_attn_qkv", "attn.to_k")] = to_k
-    state_dict[key.replace("self_attn_qkv", "attn.to_v")] = to_v
-
-
 def remap_single_transformer_blocks_(key, state_dict):
     hidden_size = 3072
@@ -71,16 +80,16 @@ def remap_single_transformer_blocks_(key, state_dict):
 
 
 TRANSFORMER_KEYS_RENAME_DICT = {
-    # "time_in.mlp.0": "time_text_embed.timestep_embedder.linear_1",
-    # "time_in.mlp.2": "time_text_embed.timestep_embedder.linear_2",
-    # "guidance_in.mlp.0": "time_text_embed.guidance_embedder.linear_1",
-    # "guidance_in.mlp.2": "time_text_embed.guidance_embedder.linear_2",
-    # "vector_in.in_layer": "time_text_embed.text_embedder.linear_1",
-    # "vector_in.out_layer": "time_text_embed.text_embedder.linear_2",
-    "txt_in.t_embedder": "txt_in.time_embed",
-    "txt_in.c_embedder": "txt_in.context_embed",
+    "time_in.mlp.0": "time_text_embed.timestep_embedder.linear_1",
+    "time_in.mlp.2": "time_text_embed.timestep_embedder.linear_2",
+    "guidance_in.mlp.0": "time_text_embed.guidance_embedder.linear_1",
+    "guidance_in.mlp.2": "time_text_embed.guidance_embedder.linear_2",
+    "vector_in.in_layer": "time_text_embed.text_embedder.linear_1",
+    "vector_in.out_layer": "time_text_embed.text_embedder.linear_2",
+    "txt_in.t_embedder.mlp.0": "txt_in.time_text_embed.timestep_embedder.linear_1",
+    "txt_in.t_embedder.mlp.2": "txt_in.time_text_embed.timestep_embedder.linear_2",
+    "txt_in.c_embedder": "txt_in.time_text_embed.text_embedder",
     "double_blocks": "transformer_blocks",
-    "individual_token_refiner.blocks": "token_refiner.refiner_blocks",
     "img_attn_q_norm": "attn.norm_q",
     "img_attn_k_norm": "attn.norm_k",
     "img_attn_proj": "attn.to_out.0",
@@ -102,14 +111,15 @@ def remap_single_transformer_blocks_(key, state_dict):
     "final_layer.linear": "proj_out",
     "fc1": "net.0.proj",
     "fc2": "net.2",
+    "input_embedder": "proj_in",
 }
 
 TRANSFORMER_SPECIAL_KEYS_REMAP = {
-    "final_layer.adaLN_modulation.1": remap_norm_scale_shift_,
     "img_attn_qkv": remap_img_attn_qkv_,
     "txt_attn_qkv": remap_txt_attn_qkv_,
-    "self_attn_qkv": remap_self_attn_qkv_,
     "single_blocks": remap_single_transformer_blocks_,
+    "final_layer.adaLN_modulation.1": remap_norm_scale_shift_,
+    "individual_token_refiner.blocks": remap_token_refiner_blocks_,
 }
 
 VAE_KEYS_RENAME_DICT = {}
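For context, rename dicts like these are consumed by a substring-driven loop elsewhere in the script; a hedged sketch of that pattern (the function name and loop shape here are illustrative, not the script's exact code):

    def convert_state_dict(state_dict):
        # Plain renames: substitute every matching substring in each key.
        for key in list(state_dict.keys()):
            new_key = key
            for old, new in TRANSFORMER_KEYS_RENAME_DICT.items():
                new_key = new_key.replace(old, new)
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)

        # Structural rewrites: hand matching keys to their in-place handler
        # (QKV splits, token-refiner renames, norm scale/shift reordering).
        for key in list(state_dict.keys()):
            for pattern, handler in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
                if pattern in key:
                    handler(key, state_dict)
                    break  # each key is rewritten by at most one handler
        return state_dict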

src/diffusers/models/transformers/transformer_hunyuan_video.py

Lines changed: 75 additions & 102 deletions
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
-from functools import partial
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -24,7 +22,11 @@
 from ...utils import is_torch_version
 from ..attention import FeedForward
 from ..attention_processor import Attention, AttentionProcessor
-from ..embeddings import get_1d_rotary_pos_embed, get_timestep_embedding
+from ..embeddings import (
+    CombinedTimestepGuidanceTextProjEmbeddings,
+    CombinedTimestepTextProjEmbeddings,
+    get_1d_rotary_pos_embed,
+)
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
@@ -123,19 +125,6 @@ def __call__(
         return hidden_states, encoder_hidden_states
 
 
-class MLPEmbedder(nn.Module):
-    """copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
-
-    def __init__(self, in_dim: int, hidden_dim: int):
-        super().__init__()
-        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
-        self.silu = nn.SiLU()
-        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.out_layer(self.silu(self.in_layer(x)))
-
-
 class PatchEmbed(nn.Module):
     def __init__(
         self,
@@ -154,49 +143,21 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-class TextProjection(nn.Module):
-    def __init__(self, in_channels, hidden_size, act_layer):
+class HunyuanVideoAdaNorm(nn.Module):
+    def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
         super().__init__()
-        self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True)
-        self.act_1 = act_layer()
-        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
-
-    def forward(self, caption):
-        hidden_states = self.linear_1(caption)
-        hidden_states = self.act_1(hidden_states)
-        hidden_states = self.linear_2(hidden_states)
-        return hidden_states
-
 
-class TimestepEmbedder(nn.Module):
-    """
-    Embeds scalar timesteps into vector representations.
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        act_layer,
-        frequency_embedding_size=256,
-        max_period=10000,
-        out_size=None,
-    ):
-        super().__init__()
-        self.frequency_embedding_size = frequency_embedding_size
-        self.max_period = max_period
-        if out_size is None:
-            out_size = hidden_size
-
-        self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
-            act_layer(),
-            nn.Linear(hidden_size, out_size, bias=True),
-        )
+        out_features = out_features or 2 * in_features
+        self.linear = nn.Linear(in_features, out_features)
+        self.nonlinearity = nn.SiLU()
 
-    def forward(self, t):
-        t_freq = get_timestep_embedding(t, self.frequency_embedding_size, flip_sin_to_cos=True, max_period=self.max_period, downscale_freq_shift=0).type(self.mlp[0].weight.dtype)
-        t_emb = self.mlp(t_freq)
-        return t_emb
+    def forward(
+        self, temb: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        temb = self.linear(self.nonlinearity(temb))
+        gate_msa, gate_mlp = temb.chunk(2, dim=1)
+        gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
+        return gate_msa, gate_mlp
 
 
 class IndividualTokenRefinerBlock(nn.Module):
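A quick shape check of the new gating module as defined above (illustrative sizes; the gates broadcast over the sequence dimension):

    import torch

    batch_size, hidden_size = 2, 32
    norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)

    temb = torch.randn(batch_size, hidden_size)
    gate_msa, gate_mlp = norm_out(temb)

    # Each gate comes back as (batch, 1, hidden) so it can scale
    # (batch, seq, hidden) activations in the refiner block below.
    assert gate_msa.shape == (batch_size, 1, hidden_size)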
@@ -224,29 +185,27 @@ def __init__(
         self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
         self.mlp = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="silu", dropout=mlp_drop_rate)
 
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
-        )
+        self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         temb: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        gate_msa, gate_mlp = self.adaLN_modulation(temb).chunk(2, dim=1)
-
         norm_hidden_states = self.norm1(hidden_states)
 
         attn_output = self.attn(
             hidden_states=norm_hidden_states,
             encoder_hidden_states=None,
             attention_mask=attention_mask,
         )
-        hidden_states = hidden_states + attn_output * gate_msa.unsqueeze(1)
 
-        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) * gate_mlp.unsqueeze(1)
+        gate_msa, gate_mlp = self.norm_out(temb)
+        hidden_states = hidden_states + attn_output * gate_msa
+
+        ff_output = self.mlp(self.norm2(hidden_states))
+        hidden_states = hidden_states + ff_output * gate_mlp
 
         return hidden_states

@@ -313,10 +272,10 @@ def __init__(
 
         hidden_size = num_attention_heads * attention_head_dim
 
-        self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True)
-        self.time_embed = TimestepEmbedder(hidden_size, nn.SiLU)
-        self.context_embed = TextProjection(in_channels, hidden_size, nn.SiLU)
-
+        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
+            embedding_dim=hidden_size, pooled_projection_dim=in_channels
+        )
+        self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
         self.token_refiner = IndividualTokenRefiner(
             num_attention_heads=num_attention_heads,
             attention_head_dim=attention_head_dim,
@@ -332,21 +291,17 @@ def forward(
         timestep: torch.LongTensor,
         attention_mask: Optional[torch.LongTensor] = None,
     ) -> torch.Tensor:
-        original_dtype = hidden_states.dtype
-        temb = self.time_embed(timestep)
-
         if attention_mask is None:
             pooled_projections = hidden_states.mean(dim=1)
         else:
+            original_dtype = hidden_states.dtype
             mask_float = attention_mask.float().unsqueeze(-1)
             pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
             pooled_projections = pooled_projections.to(original_dtype)
 
-        pooled_projections = self.context_embed(pooled_projections)
-        emb = temb + pooled_projections
-
-        hidden_states = self.input_embedder(hidden_states)
-        hidden_states = self.token_refiner(hidden_states, emb, attention_mask)
+        temb = self.time_text_embed(timestep, pooled_projections)
+        hidden_states = self.proj_in(hidden_states)
+        hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
 
         return hidden_states
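The masked pooling above reduces variable-length text features to one vector per sample; a toy check of the arithmetic:

    import torch

    hidden_states = torch.randn(2, 5, 8)               # (batch, seq, dim)
    attention_mask = torch.tensor([[1, 1, 1, 0, 0],    # 1 = real token, 0 = padding
                                   [1, 1, 1, 1, 1]])

    mask_float = attention_mask.float().unsqueeze(-1)  # (batch, seq, 1)
    pooled = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
    assert pooled.shape == (2, 8)                      # padded positions excluded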

@@ -561,14 +516,7 @@ def __init__(
             text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
         )
 
-        # time modulation
-        self.time_in = TimestepEmbedder(inner_dim, nn.SiLU)
-
-        # text modulation
-        self.vector_in = MLPEmbedder(text_embed_dim_2, inner_dim)
-
-        # guidance modulation
-        self.guidance_in = TimestepEmbedder(inner_dim, nn.SiLU)
+        self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, text_embed_dim_2)
 
         # 3. RoPE
         self.rope = HunyuanVideoRotaryPosEmbed(patch_size, patch_size_t, rope_dim_list, rope_theta)
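Functionally, the single combined module computes the same sum the three removed modules produced; a conceptual sketch (the embedder names below are illustrative, not the diffusers internals):

    # temb = timestep_embedding(timestep)    # was self.time_in
    #      + guidance_embedding(guidance)    # was self.guidance_in
    #      + text_embedding(pooled_text)     # was self.vector_in
    #
    # This is why the conversion script above maps the old time_in.*,
    # guidance_in.*, and vector_in.* keys onto time_text_embed.* submodules.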
@@ -679,30 +627,55 @@ def forward(
 
         image_rotary_emb = self.rope(hidden_states)
 
-        temb = self.time_in(timestep)
-        temb = temb + self.vector_in(encoder_hidden_states_2)
-        temb = temb + self.guidance_in(guidance)
+        temb = self.time_text_embed(timestep, guidance, encoder_hidden_states_2)
 
         # Embed image and text.
         hidden_states = self.img_in(hidden_states)
         encoder_hidden_states = self.txt_in(encoder_hidden_states, timestep, encoder_attention_mask)
 
-        use_reentrant = is_torch_version(">=", "1.11.0")
-        block_forward = (
-            partial(torch.utils.checkpoint.checkpoint, use_reentrant=use_reentrant)
-            if torch.is_grad_enabled() and self.gradient_checkpointing
-            else lambda x: x
-        )
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
 
-        for _, block in enumerate(self.transformer_blocks):
-            hidden_states, encoder_hidden_states = block_forward(block)(
-                hidden_states, encoder_hidden_states, temb, image_rotary_emb
-            )
+            def create_custom_forward(module, return_dict=None):
+                def custom_forward(*inputs):
+                    if return_dict is not None:
+                        return module(*inputs, return_dict=return_dict)
+                    else:
+                        return module(*inputs)
 
-        for block in self.single_transformer_blocks:
-            hidden_states, encoder_hidden_states = block_forward(block)(
-                hidden_states, encoder_hidden_states, temb, image_rotary_emb
-            )
+                return custom_forward
+
+            ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+            for block in self.transformer_blocks:
+                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+
+            for block in self.single_transformer_blocks:
+                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+
+        else:
+            for block in self.transformer_blocks:
+                hidden_states, encoder_hidden_states = block(
+                    hidden_states, encoder_hidden_states, temb, image_rotary_emb
+                )
+
+            for block in self.single_transformer_blocks:
+                hidden_states, encoder_hidden_states = block(
+                    hidden_states, encoder_hidden_states, temb, image_rotary_emb
+                )
 
         hidden_states = self.norm_out(hidden_states, temb)
         hidden_states = self.proj_out(hidden_states)
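The wrapper-plus-checkpoint pattern above trades compute for memory: block activations are recomputed during backward instead of stored. A minimal standalone demonstration of the same pattern (toy module, not the HunyuanVideo blocks):

    import torch
    import torch.nn as nn

    block = nn.Linear(8, 8)
    x = torch.randn(2, 8, requires_grad=True)

    def create_custom_forward(module):
        def custom_forward(*inputs):
            return module(*inputs)
        return custom_forward

    # Inputs are passed positionally; intermediate activations are freed after
    # the forward pass and recomputed when backward reaches this segment.
    out = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
    out.sum().backward()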
