Commit bf2c6e0
Commit message: add
1 parent 95a55f9 commit bf2c6e0

4 files changed: +193 additions, -103 deletions

scripts/convert_wan_to_diffusers.py
Lines changed: 117 additions & 79 deletions
@@ -320,7 +320,27 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
         }
         RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
         SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
-        return config, RENAME_DICT, SPECIAL_KEYS_REMAP
+    elif model_type == "Wan2.2-TI2V-5B":
+        config = {
+            "model_id": "Wan-AI/Wan2.2-TI2V-5B",
+            "diffusers_config": {
+                "added_kv_proj_dim": None,
+                "attention_head_dim": 128,
+                "cross_attn_norm": True,
+                "eps": 1e-06,
+                "ffn_dim": 14336,
+                "freq_dim": 256,
+                "in_channels": 48,
+                "num_attention_heads": 24,
+                "num_layers": 30,
+                "out_channels": 48,
+                "patch_size": [1, 2, 2],
+                "qk_norm": "rms_norm_across_heads",
+                "text_dim": 4096,
+            },
+        }
+        RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
     return config, RENAME_DICT, SPECIAL_KEYS_REMAP
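For orientation, the diffusers_config keys in the new branch map one-to-one onto WanTransformer3DModel constructor arguments; the 48 in/out channels match the 48 latent channels of the Wan2.2 VAE whose statistics appear in the next hunk, and 24 heads of dimension 128 give a 3072-dim hidden size. A minimal sketch (not part of the commit; it builds a randomly initialized model instead of loading converted weights):

from diffusers import WanTransformer3DModel

# Illustration only: the conversion script remaps original checkpoint keys into this
# architecture; constructing it directly just allocates random weights (~5B parameters).
transformer = WanTransformer3DModel(
    added_kv_proj_dim=None,
    attention_head_dim=128,
    cross_attn_norm=True,
    eps=1e-06,
    ffn_dim=14336,
    freq_dim=256,
    in_channels=48,
    num_attention_heads=24,
    num_layers=30,
    out_channels=48,
    patch_size=[1, 2, 2],
    qk_norm="rms_norm_across_heads",
    text_dim=4096,
)
print(sum(p.numel() for p in transformer.parameters()) / 1e9, "billion parameters")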

@@ -567,106 +587,110 @@ def convert_vae():
         "in_channels": 12,
         "out_channels": 12,
         "decoder_base_dim": 256,
+        "scale_factor_temporal": 4,
+        "scale_factor_spatial": 16,
+        "patch_size": 2,
         "latents_mean":[
-            -0.2289,
-            -0.0052,
-            -0.1323,
-            -0.2339,
+            -0.2289,
+            -0.0052,
+            -0.1323,
+            -0.2339,
             -0.2799,
-            -0.0174,
-            -0.1838,
-            -0.1557,
+            0.0174,
+            0.1838,
+            0.1557,
             -0.1382,
-            -0.0542,
-            -0.2813,
-            -0.0891,
-            -0.1570,
+            0.0542,
+            0.2813,
+            0.0891,
+            0.1570,
             -0.0098,
-            -0.0375,
+            0.0375,
             -0.1825,
             -0.2246,
             -0.1207,
             -0.0698,
-            -0.5109,
-            -0.2665,
+            0.5109,
+            0.2665,
             -0.2108,
             -0.2158,
-            -0.2502,
+            0.2502,
             -0.2055,
             -0.0322,
-            -0.1109,
-            -0.1567,
+            0.1109,
+            0.1567,
             -0.0729,
-            -0.0899,
+            0.0899,
             -0.2799,
             -0.1230,
             -0.0313,
             -0.1649,
-            -0.0117,
-            -0.0723,
+            0.0117,
+            0.0723,
             -0.2839,
             -0.2083,
             -0.0520,
-            -0.3748,
-            -0.0152,
-            -0.1957,
-            -0.1433,
+            0.3748,
+            0.0152,
+            0.1957,
+            0.1433,
             -0.2944,
-            -0.3573,
+            0.3573,
             -0.0548,
             -0.1681,
             -0.0667,
         ],
-        "latents_std":[
-            -0.4765,
-            -1.0364,
-            -0.4514,
-            -1.1677,
-            -0.5313,
-            -0.4990,
-            -0.4818,
-            -0.5013,
-            -0.8158,
-            -1.0344,
-            -0.5894,
-            -1.0901,
-            -0.6885,
-            -0.6165,
-            -0.8454,
-            -0.4978,
-            -0.5759,
-            -0.3523,
-            -0.7135,
-            -0.6804,
-            -0.5833,
-            -1.4146,
-            -0.8986,
-            -0.5659,
-            -0.7069,
-            -0.5338,
-            -0.4889,
-            -0.4917,
-            -0.4069,
-            -0.4999,
-            -0.6866,
-            -0.4093,
-            -0.5709,
-            -0.6065,
-            -0.6415,
-            -0.4944,
-            -0.5726,
-            -1.2042,
-            -0.5458,
-            -1.6887,
-            -0.3971,
-            -1.0600,
-            -0.3943,
-            -0.5537,
-            -0.5444,
-            -0.4089,
-            -0.7468,
-            -0.7744,
+        "latents_std": [
+            0.4765,
+            1.0364,
+            0.4514,
+            1.1677,
+            0.5313,
+            0.4990,
+            0.4818,
+            0.5013,
+            0.8158,
+            1.0344,
+            0.5894,
+            1.0901,
+            0.6885,
+            0.6165,
+            0.8454,
+            0.4978,
+            0.5759,
+            0.3523,
+            0.7135,
+            0.6804,
+            0.5833,
+            1.4146,
+            0.8986,
+            0.5659,
+            0.7069,
+            0.5338,
+            0.4889,
+            0.4917,
+            0.4069,
+            0.4999,
+            0.6866,
+            0.4093,
+            0.5709,
+            0.6065,
+            0.6415,
+            0.4944,
+            0.5726,
+            1.2042,
+            0.5458,
+            1.6887,
+            0.3971,
+            1.0600,
+            0.3943,
+            0.5537,
+            0.5444,
+            0.4089,
+            0.7468,
+            0.7744,
         ],
+        "clip_output": False,
     }
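The value changes above are a sign fix: several latents_mean entries are genuinely positive, and latents_std must be positive throughout. These are per-channel statistics for the 48-channel Wan2.2 latent space, used to normalize latents before the diffusion transformer and to undo that normalization before decoding. A rough sketch in plain torch (not code copied from the pipeline), assuming latents of shape (batch, 48, frames, height, width):

import torch

def normalize_latents(latents, latents_mean, latents_std):
    # latents_mean / latents_std are the 48-entry lists from the config above;
    # reshape them so they broadcast over the channel dimension.
    mean = torch.tensor(latents_mean).view(1, -1, 1, 1, 1).to(latents)
    std = torch.tensor(latents_std).view(1, -1, 1, 1, 1).to(latents)
    return (latents - mean) / std

def denormalize_latents(latents, latents_mean, latents_std):
    # Inverse mapping, applied before decoding latents back to pixels.
    mean = torch.tensor(latents_mean).view(1, -1, 1, 1, 1).to(latents)
    std = torch.tensor(latents_std).view(1, -1, 1, 1, 1).to(latents)
    return latents * std + mean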

@@ -855,7 +879,7 @@ def convert_vae_22():
         new_state_dict[key] = value

     with init_empty_weights():
-        vae = AutoencoderKLWan(**vae22_config)
+        vae = AutoencoderKLWan(**vae22_diffusers_config)
     vae.load_state_dict(new_state_dict, strict=True, assign=True)
     return vae

@@ -878,7 +902,7 @@ def get_args():
 if __name__ == "__main__":
     args = get_args()

-    if "Wan2.2" in args.model_type:
+    if "Wan2.2" in args.model_type and "TI2V" not in args.model_type:
         transformer = convert_transformer(args.model_type, stage="high_noise_model")
         transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
     else:
@@ -892,7 +916,12 @@ def get_args():

     text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
     tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
-    flow_shift = 16.0 if "FLF2V" in args.model_type else 3.0
+    if "FLF2V" in args.model_type:
+        flow_shift = 16.0
+    elif "TI2V" in args.model_type:
+        flow_shift = 5.0
+    else:
+        flow_shift = 3.0
     scheduler = UniPCMultistepScheduler(
         prediction_type="flow_prediction", use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=flow_shift
     )
@@ -902,7 +931,7 @@ def get_args():
     dtype = DTYPE_MAPPING[args.dtype]
     transformer.to(dtype)

-    if "Wan2.2" and "I2V" in args.model_type:
+    if "Wan2.2" and "I2V" in args.model_type and "TI2V" not in args.model_type:
         pipe = WanImageToVideoPipeline(
             transformer=transformer,
             transformer_2=transformer_2,
@@ -922,6 +951,15 @@ def get_args():
             scheduler=scheduler,
             boundary_ratio=0.875,
         )
+    elif "Wan2.2" and "TI2V" in args.model_type:
+        pipe = WanPipeline(
+            transformer=transformer,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            vae=vae,
+            scheduler=scheduler,
+            expand_timesteps=True,
+        )
     elif "I2V" in args.model_type or "FLF2V" in args.model_type:
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16

src/diffusers/models/autoencoders/autoencoder_kl_wan.py
Lines changed: 5 additions & 1 deletion
@@ -1012,6 +1012,9 @@ def __init__(
         in_channels: int = 3,
         out_channels: int = 3,
         patch_size: Optional[int] = None,
+        scale_factor_temporal: Optional[int] = 4,
+        scale_factor_spatial: Optional[int] = 8,
+        clip_output: bool = True,
     ) -> None:
         super().__init__()
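scale_factor_temporal and scale_factor_spatial record the VAE's compression ratios; the defaults (4 and 8) match Wan2.1, while the conversion script above passes 4 and 16 for the Wan2.2 VAE together with patch_size=2. A small illustrative helper, not taken from the library, showing what the two factors imply for latent sizes:

def latent_shape(num_frames, height, width, scale_factor_temporal=4, scale_factor_spatial=16):
    # Causal temporal compression keeps the first frame and groups the remaining ones;
    # spatial dimensions shrink by the spatial factor.
    latent_frames = (num_frames - 1) // scale_factor_temporal + 1
    return latent_frames, height // scale_factor_spatial, width // scale_factor_spatial

print(latent_shape(121, 704, 1280))  # -> (31, 44, 80)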

@@ -1193,7 +1196,8 @@ def _decode(self, z: torch.Tensor, return_dict: bool = True):
             out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
             out = torch.cat([out, out_], 2)

-        out = torch.clamp(out, min=-1.0, max=1.0)
+        if self.config.clip_output:
+            out = torch.clamp(out, min=-1.0, max=1.0)
         if self.config.patch_size is not None:
             out = unpatchify(out, patch_size=self.config.patch_size)
         self.clear_cache()
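With clip_output=False (as the Wan2.2 VAE config in the conversion script sets), _decode no longer clamps the reconstruction; callers that still want values in [-1, 1] can clamp afterwards. A hedged sketch using the standard diffusers decode call:

import torch

def decode_and_clamp(vae, latents: torch.Tensor) -> torch.Tensor:
    # vae is an AutoencoderKLWan; latents has shape (batch, z_dim, frames, height, width).
    # With clip_output=False the raw decoder output is returned, so clamp manually here.
    frames = vae.decode(latents, return_dict=False)[0]
    return frames.clamp(-1.0, 1.0)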

src/diffusers/models/transformers/transformer_wan.py
Lines changed: 44 additions & 7 deletions

@@ -170,8 +170,11 @@ def forward(
         timestep: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         encoder_hidden_states_image: Optional[torch.Tensor] = None,
-    ):
+        timestep_seq_len: Optional[int] = None,
+    ):
         timestep = self.timesteps_proj(timestep)
+        if timestep_seq_len is not None:
+            timestep = timestep.unflatten(0, (1, timestep_seq_len))

         time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
         if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
@@ -309,9 +312,24 @@ def forward(
         temb: torch.Tensor,
         rotary_emb: torch.Tensor,
     ) -> torch.Tensor:
-        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
-            self.scale_shift_table + temb.float()
-        ).chunk(6, dim=1)
+
+        if temb.ndim == 4:
+            # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table.unsqueeze(0) + temb.float()
+            ).chunk(6, dim=2)
+            # batch_size, seq_len, 1, inner_dim
+            shift_msa = shift_msa.squeeze(2)
+            scale_msa = scale_msa.squeeze(2)
+            gate_msa = gate_msa.squeeze(2)
+            c_shift_msa = c_shift_msa.squeeze(2)
+            c_scale_msa = c_scale_msa.squeeze(2)
+            c_gate_msa = c_gate_msa.squeeze(2)
+        else:
+            # temb: batch_size, 6, inner_dim
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table + temb.float()
+            ).chunk(6, dim=1)

         # 1. Self-attention
         norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
@@ -469,10 +487,22 @@ def forward(
         hidden_states = self.patch_embedding(hidden_states)
         hidden_states = hidden_states.flatten(2).transpose(1, 2)

+        # timestep shape: batch_size, or batch_size, seq_len (wan 2.2 ti2v)
+        if timestep.ndim == 2:
+            ts_seq_len = timestep.shape[1]
+            timestep = timestep.flatten()  # batch_size * seq_len
+        else:
+            ts_seq_len = None
+
         temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
-            timestep, encoder_hidden_states, encoder_hidden_states_image
+            timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
         )
-        timestep_proj = timestep_proj.unflatten(1, (6, -1))
+        if ts_seq_len is not None:
+            # batch_size, seq_len, 6, inner_dim
+            timestep_proj = timestep_proj.unflatten(2, (6, -1))
+        else:
+            # batch_size, 6, inner_dim
+            timestep_proj = timestep_proj.unflatten(1, (6, -1))

         if encoder_hidden_states_image is not None:
             encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
@@ -488,7 +518,14 @@ def forward(
             hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)

         # 5. Output norm, projection & unpatchify
-        shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
+        if temb.ndim == 3:
+            # batch_size, seq_len, inner_dim (wan 2.2 ti2v)
+            shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
+            shift = shift.squeeze(2)
+            scale = scale.squeeze(2)
+        else:
+            # batch_size, inner_dim
+            shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)

         # Move the shift and scale tensors to the same device as hidden_states.
         # When using multi-GPU inference via accelerate these will be on the
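Taken together, these changes let a per-token timestep drive the AdaLN modulation in Wan 2.2 TI2V: a timestep of shape (batch_size, seq_len) is flattened for the sinusoidal projection, the projected embedding is reshaped back to (batch_size, seq_len, 6, inner_dim), and each block broadcasts its scale_shift_table over the sequence dimension. A shape-only walkthrough with stand-in tensors (random features replace the real projection modules; batch size 1 mirrors the unflatten(0, (1, seq_len)) call above):

import torch

seq_len, inner_dim = 75, 3072  # illustrative; 3072 = 24 heads * 128 head_dim for TI2V-5B

timestep = torch.randint(0, 1000, (1, seq_len))              # (batch, seq_len), one value per token
ts_seq_len = timestep.shape[1]
timestep = timestep.flatten()                                 # (batch * seq_len,)

# Stand-in for the condition embedder's projected timestep output.
timestep_proj = torch.randn(1 * ts_seq_len, 6 * inner_dim)
timestep_proj = timestep_proj.unflatten(0, (1, ts_seq_len))   # (batch, seq_len, 6 * inner_dim)
timestep_proj = timestep_proj.unflatten(2, (6, -1))           # (batch, seq_len, 6, inner_dim)

# Inside each transformer block: broadcast the learned table over the sequence dimension.
scale_shift_table = torch.randn(1, 6, inner_dim)
mods = (scale_shift_table.unsqueeze(0) + timestep_proj.float()).chunk(6, dim=2)
shift_msa = mods[0].squeeze(2)                                # (batch, seq_len, inner_dim)
print(shift_msa.shape)                                        # torch.Size([1, 75, 3072])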
