Skip to content

Commit 3178c4e

Browse files
committed
add backward compatibility
1 parent 42c1451 commit 3178c4e

File tree

7 files changed

+97
-15
lines changed

7 files changed

+97
-15
lines changed

scripts/convert_wan_to_diffusers.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"text_embedding.0": "condition_embedder.text_embedder.linear_1",
2626
"text_embedding.2": "condition_embedder.text_embedder.linear_2",
2727
"time_projection.1": "condition_embedder.time_proj",
28-
"head.modulation": "norm_out.linear.weight",
28+
"head.modulation": "scale_shift_table",
2929
"head.head": "proj_out",
3030
"modulation": "scale_shift_table",
3131
"ffn.0": "ffn.net.0.proj",
@@ -67,7 +67,7 @@
6767
"text_embedding.0": "condition_embedder.text_embedder.linear_1",
6868
"text_embedding.2": "condition_embedder.text_embedder.linear_2",
6969
"time_projection.1": "condition_embedder.time_proj",
70-
"head.modulation": "norm_out.linear.weight",
70+
"head.modulation": "scale_shift_table",
7171
"head.head": "proj_out",
7272
"modulation": "scale_shift_table",
7373
"ffn.0": "ffn.net.0.proj",
@@ -105,12 +105,8 @@
105105
"after_proj": "proj_out",
106106
}
107107

108-
TRANSFORMER_SPECIAL_KEYS_REMAP = {
109-
"norm_out.linear.bias": lambda key, state_dict: state_dict.setdefault(key, torch.zeros(state_dict["norm_out.linear.weight"].shape[0]))
110-
}
111-
VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {
112-
"norm_out.linear.bias": lambda key, state_dict: state_dict.setdefault(key, torch.zeros(state_dict["norm_out.linear.weight"].shape[0]))
113-
}
108+
TRANSFORMER_SPECIAL_KEYS_REMAP = {}
109+
VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
114110

115111

116112
def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
@@ -312,10 +308,6 @@ def convert_transformer(model_type: str):
312308
continue
313309
handler_fn_inplace(key, original_state_dict)
314310

315-
for special_key, handler_fn_inplace in SPECIAL_KEYS_REMAP.items():
316-
if special_key not in original_state_dict:
317-
handler_fn_inplace(special_key, original_state_dict)
318-
319311
transformer.load_state_dict(original_state_dict, strict=True, assign=True)
320312
return transformer
321313

src/diffusers/models/transformers/latte_transformer_3d.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,15 @@ def __init__(
171171

172172
self.gradient_checkpointing = False
173173

174+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
175+
if "scale_shift_table" in state_dict:
176+
scale_shift_table = state_dict.pop("scale_shift_table")
177+
state_dict[prefix + "norm_out.linear.weight"] = scale_shift_table[1]
178+
state_dict[prefix + "norm_out.linear.bias"] = scale_shift_table[0]
179+
return super()._load_from_state_dict(
180+
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
181+
)
182+
174183
def forward(
175184
self,
176185
hidden_states: torch.Tensor,

src/diffusers/models/transformers/pixart_transformer_2d.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,18 @@ def __init__(
185185
)
186186
self.caption_projection = None
187187
if self.config.caption_channels is not None:
188-
self.caption_projection = PixArtAlphaTextProjection(
189-
in_features=self.config.caption_channels, hidden_size=self.inner_dim
190-
)
188+
self.caption_projection = PixArtAlphaTextProjection(
189+
in_features=self.config.caption_channels, hidden_size=self.inner_dim
190+
)
191+
192+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
193+
if "scale_shift_table" in state_dict:
194+
scale_shift_table = state_dict.pop("scale_shift_table")
195+
state_dict[prefix + "norm_out.linear.weight"] = scale_shift_table[1]
196+
state_dict[prefix + "norm_out.linear.bias"] = scale_shift_table[0]
197+
return super()._load_from_state_dict(
198+
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
199+
)
191200

192201
@property
193202
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors

src/diffusers/models/transformers/transformer_allegro.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,15 @@ def __init__(
310310

311311
self.gradient_checkpointing = False
312312

313+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
314+
if "scale_shift_table" in state_dict:
315+
scale_shift_table = state_dict.pop("scale_shift_table")
316+
state_dict[prefix + "norm_out.linear.weight"] = scale_shift_table[1]
317+
state_dict[prefix + "norm_out.linear.bias"] = scale_shift_table[0]
318+
return super()._load_from_state_dict(
319+
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
320+
)
321+
313322
def forward(
314323
self,
315324
hidden_states: torch.Tensor,

src/diffusers/models/transformers/transformer_ltx.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,27 @@ def __init__(
400400

401401
self.gradient_checkpointing = False
402402

403+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
404+
key = "scale_shift_table"
405+
if prefix + key in state_dict:
406+
scale_shift_table = state_dict.pop(prefix + key)
407+
inner_dim = scale_shift_table.shape[-1]
408+
409+
weight = torch.eye(inner_dim).repeat(2, 1)
410+
bias = scale_shift_table.reshape(2, inner_dim).flatten()
411+
412+
state_dict[prefix + "norm_out.linear.weight"] = weight
413+
state_dict[prefix + "norm_out.linear.bias"] = bias
414+
415+
if prefix + "norm_out.weight" in state_dict:
416+
state_dict.pop(prefix + "norm_out.weight")
417+
if prefix + "norm_out.bias" in state_dict:
418+
state_dict.pop(prefix + "norm_out.bias")
419+
420+
return super(LTXVideoTransformer3DModel, self)._load_from_state_dict(
421+
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
422+
)
423+
403424
def forward(
404425
self,
405426
hidden_states: torch.Tensor,

src/diffusers/models/transformers/transformer_wan.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,27 @@ def __init__(
439439

440440
self.gradient_checkpointing = False
441441

442+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
443+
key = "scale_shift_table"
444+
if prefix + key in state_dict:
445+
scale_shift_table = state_dict.pop(prefix + key)
446+
inner_dim = scale_shift_table.shape[-1]
447+
448+
weight = torch.eye(inner_dim).repeat(2, 1)
449+
bias = scale_shift_table.reshape(2, inner_dim).flatten()
450+
451+
state_dict[prefix + "norm_out.linear.weight"] = weight
452+
state_dict[prefix + "norm_out.linear.bias"] = bias
453+
454+
if prefix + "norm_out.weight" in state_dict:
455+
state_dict.pop(prefix + "norm_out.weight")
456+
if prefix + "norm_out.bias" in state_dict:
457+
state_dict.pop(prefix + "norm_out.bias")
458+
459+
return super(WanTransformer3DModel, self)._load_from_state_dict(
460+
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
461+
)
462+
442463
def forward(
443464
self,
444465
hidden_states: torch.Tensor,

src/diffusers/models/transformers/transformer_wan_vace.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,27 @@ def __init__(
270270

271271
self.gradient_checkpointing = False
272272

273+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
274+
key = "scale_shift_table"
275+
if prefix + key in state_dict:
276+
scale_shift_table = state_dict.pop(prefix + key)
277+
inner_dim = scale_shift_table.shape[-1]
278+
279+
weight = torch.eye(inner_dim).repeat(2, 1)
280+
bias = scale_shift_table.reshape(2, inner_dim).flatten()
281+
282+
state_dict[prefix + "norm_out.linear.weight"] = weight
283+
state_dict[prefix + "norm_out.linear.bias"] = bias
284+
285+
if prefix + "norm_out.weight" in state_dict:
286+
state_dict.pop(prefix + "norm_out.weight")
287+
if prefix + "norm_out.bias" in state_dict:
288+
state_dict.pop(prefix + "norm_out.bias")
289+
290+
return super(WanVACETransformer3DModel, self)._load_from_state_dict(
291+
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
292+
)
293+
273294
def forward(
274295
self,
275296
hidden_states: torch.Tensor,

0 commit comments

Comments (0)