huggingface · tolgacangoz · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025
diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md
@@ -40,6 +40,7 @@ The following Wan models are supported in Diffusers:
 - [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
 - [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
 - [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
+- [Wan 2.2 S2V 14B](https://huggingface.co/Wan-AI/Wan2.2-S2V-14B-Diffusers)
 
 > [!TIP]
 > Click on the Wan models in the right sidebar for more examples of video generation.
@@ -95,15 +96,15 @@ pipeline = WanPipeline.from_pretrained(
 pipeline.to("cuda")
 
 prompt = """
-The camera rushes from far to near in a low-angle shot, 
-revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in 
-for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground. 
-Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic 
+The camera rushes from far to near in a low-angle shot,
+revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
+for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
+Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
 shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
 """
 negative_prompt = """
-Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, 
-low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, 
+Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
+low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
 misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
 """
 
@@ -150,15 +151,15 @@ pipeline.transformer = torch.compile(
 )
 
 prompt = """
-The camera rushes from far to near in a low-angle shot, 
-revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in 
-for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground. 
-Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic 
+The camera rushes from far to near in a low-angle shot,
+revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
+for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
+Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
 shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
 """
 negative_prompt = """
-Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, 
-low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, 
+Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
+low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
 misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
 """
 
@@ -236,6 +237,129 @@ export_to_video(output, "output.mp4", fps=16)
 </hfoption>
 </hfoptions>
 
+
+### Wan-S2V: Audio-Driven Cinematic Video Generation
+
+[Wan-S2V](https://huggingface.co/papers/2508.18621) by the Wan Team.
+
+*Current state-of-the-art (SOTA) methods for audio-driven character animation demonstrate promising performance for scenarios primarily involving speech and singing. However, they often fall short in more complex film and television productions, which demand sophisticated elements such as nuanced character interactions, realistic body movements, and dynamic camera work. To address this long-standing challenge of achieving film-level character animation, we propose an audio-driven model, which we refer to as Wan-S2V, built upon Wan. Our model achieves significantly enhanced expressiveness and fidelity in cinematic contexts compared to existing approaches. We conducted extensive experiments, benchmarking our method against cutting-edge models such as Hunyuan-Avatar and Omnihuman. The experimental results consistently demonstrate that our approach significantly outperforms these existing solutions. Additionally, we explore the versatility of our method through its applications in long-form video generation and precise video lip-sync editing.*
+
+The project page: https://humanaigc.github.io/wan-s2v-webpage/
+
+This model was contributed by [M. Tolga Cangöz](https://github.com/tolgacangoz).
+
+The example below demonstrates how to use the speech-to-video pipeline to generate a video from a text description, a starting frame, an audio clip, and optionally a pose video.
+
+<hfoptions id="S2V usage">
+<hfoption id="usage">
+
+```python
+import numpy as np, math
+import torch
+from diffusers import AutoencoderKLWan, WanSpeechToVideoPipeline
+from diffusers.utils import export_to_merged_video_audio, load_image, load_audio, load_video, export_to_video
+from transformers import Wav2Vec2ForCTC
+import requests
+from PIL import Image
+from io import BytesIO
+
+
+model_id = "Wan-AI/Wan2.2-S2V-14B-Diffusers"
+audio_encoder = Wav2Vec2ForCTC.from_pretrained(model_id, subfolder="audio_encoder", dtype=torch.float32)
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+pipe = WanSpeechToVideoPipeline.from_pretrained(
+    model_id, vae=vae, audio_encoder=audio_encoder, torch_dtype=torch.bfloat16
+)
+pipe.to("cuda")
+
+headers = {"User-Agent": "Mozilla/5.0"}
+url = "https://upload.wikimedia.org/wikipedia/commons/4/46/Albert_Einstein_sticks_his_tongue.jpg"
+resp = requests.get(url, headers=headers, timeout=30)
+image = Image.open(BytesIO(resp.content))
+
+audio, sampling_rate = load_audio("https://github.com/Wan-Video/Wan2.2/raw/refs/heads/main/examples/Five%20Hundred%20Miles.MP3")
+#pose_video_path_or_url = "https://github.com/Wan-Video/Wan2.2/raw/refs/heads/main/examples/pose.mp4"
+
+def get_size_less_than_area(height,
+                            width,
+                            target_area=1024 * 704,
+                            divisor=64):
+    if height * width <= target_area:
+        # If the original image area is already less than or equal to the target,
+        # no resizing is needed—just padding. Still need to ensure that the padded area doesn't exceed the target.
+        max_upper_area = target_area
+        min_scale = 0.1
+        max_scale = 1.0
+    else:
+        # Resize to fit within the target area and then pad to multiples of `divisor`
+        max_upper_area = target_area  # Maximum allowed total pixel count after padding
+        d = divisor - 1
+        b = d * (height + width)
+        a = height * width
+        c = d**2 - max_upper_area
+
+        # Calculate scale boundaries using quadratic equation
+        min_scale = (-b + math.sqrt(b**2 - 2 * a * c)) / (2 * a)  # Scale when maximum padding is applied
+        max_scale = math.sqrt(max_upper_area / (height * width))  # Scale without any padding
+
+    # We want to choose the largest possible scale such that the final padded area does not exceed max_upper_area
+    # Use binary search-like iteration to find this scale
+    find_it = False
+    for i in range(100):
+        scale = max_scale - (max_scale - min_scale) * i / 100
+        new_height, new_width = int(height * scale), int(width * scale)
+
+        # Pad to make dimensions divisible by 64
+        pad_height = (64 - new_height % 64) % 64
+        pad_width = (64 - new_width % 64) % 64
+        pad_top = pad_height // 2
+        pad_bottom = pad_height - pad_top
+        pad_left = pad_width // 2
+        pad_right = pad_width - pad_left
+
+        padded_height, padded_width = new_height + pad_height, new_width + pad_width
+
+        if padded_height * padded_width <= max_upper_area:
+            find_it = True
+            break
+
+    if find_it:
+        return padded_height, padded_width
+    else:
+        # Fallback: calculate target dimensions based on aspect ratio and divisor alignment
+        aspect_ratio = width / height
+        target_width = int(
+            (target_area * aspect_ratio)**0.5 // divisor * divisor)
+        target_height = int(
+            (target_area / aspect_ratio)**0.5 // divisor * divisor)
+
+        # Ensure the result is not larger than the original resolution
+        if target_width >= width or target_height >= height:
+            target_width = int(width // divisor * divisor)
+            target_height = int(height // divisor * divisor)
+
+        return target_height, target_width
+
+height, width = get_size_less_than_area(first_frame.height, first_frame.width, 480*832)
+
+prompt = "Einstein singing a song."
+
+output = pipe(
+    prompt=prompt, image=image, audio=audio, sampling_rate=sampling_rate,
+    height=height, width=width, num_frames_per_chunk=80,
+    #pose_video_path_or_url=pose_video_path_or_url,
+).frames[0]
+export_to_video(output, "output.mp4", fps=16)
+
+# Lastly, we need to merge the video and audio into a new video, with the duration set to
+# the shorter of the two and overwrite the original video file.
+export_to_merged_video_audio("output.mp4", "audio.mp3")
+```
+
+</hfoption>
+</hfoptions>
+
+
 ### Any-to-Video Controllable Generation
 
 Wan VACE supports various generation techniques which achieve controllable video generation. Some of the capabilities include:
@@ -281,10 +405,10 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
 
   # use "steamboat willie style" to trigger the LoRA
   prompt = """
-  steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot, 
-  revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in 
-  for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground. 
-  Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic 
+  steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
+  revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
+  for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
+  Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
   shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
   """
 
@@ -353,6 +477,12 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
   - all
   - __call__
 
+## WanSpeechToVideoPipeline
+
+[[autodoc]] WanSpeechToVideoPipeline
+  - all
+  - __call__
+
 ## WanVideoToVideoPipeline
 
 [[autodoc]] WanVideoToVideoPipeline
@@ -361,4 +491,4 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
 
 ## WanPipelineOutput
 
-[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
+[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
diff --git a/scripts/convert_wan_to_diffusers.py b/scripts/convert_wan_to_diffusers.py
@@ -6,13 +6,22 @@
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download, snapshot_download
 from safetensors.torch import load_file
-from transformers import AutoProcessor, AutoTokenizer, CLIPVisionModelWithProjection, UMT5EncoderModel
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    CLIPVisionModelWithProjection,
+    UMT5EncoderModel,
+    Wav2Vec2ForCTC,
+    Wav2Vec2Processor,
+)
 
 from diffusers import (
     AutoencoderKLWan,
     UniPCMultistepScheduler,
     WanImageToVideoPipeline,
     WanPipeline,
+    WanS2VTransformer3DModel,
+    WanSpeechToVideoPipeline,
     WanTransformer3DModel,
     WanVACEPipeline,
     WanVACETransformer3DModel,
@@ -105,8 +114,59 @@
     "after_proj": "proj_out",
 }
 
+S2V_TRANSFORMER_KEYS_RENAME_DICT = {  # key renames selected for model_type "Wan2.2-S2V-14B"
+    "time_embedding.0": "condition_embedder.time_embedder.linear_1",
+    "time_embedding.2": "condition_embedder.time_embedder.linear_2",
+    "text_embedding.0": "condition_embedder.text_embedder.linear_1",
+    "text_embedding.2": "condition_embedder.text_embedder.linear_2",
+    "time_projection.1": "condition_embedder.time_proj",
+    "head.modulation": "scale_shift_table",
+    "head.head": "proj_out",
+    "modulation": "scale_shift_table",
+    "ffn.0": "ffn.net.0.proj",
+    "ffn.2": "ffn.net.2",
+    # Hack: swap the names of norm2 and norm3 via a temporary placeholder key
+    # The original model calls the norms in the following order: norm1, norm3, norm2
+    # We convert it to: norm1, norm2, norm3
+    "norm2": "norm__placeholder",
+    "norm3": "norm2",
+    "norm__placeholder": "norm3",
+    # Attention component mappings (self_attn -> attn1, cross_attn -> attn2)
+    "self_attn.q": "attn1.to_q",
+    "self_attn.k": "attn1.to_k",
+    "self_attn.v": "attn1.to_v",
+    "self_attn.o": "attn1.to_out.0",
+    "self_attn.norm_q": "attn1.norm_q",
+    "self_attn.norm_k": "attn1.norm_k",
+    "cross_attn.q": "attn2.to_q",
+    "cross_attn.k": "attn2.to_k",
+    "cross_attn.v": "attn2.to_v",
+    "cross_attn.o": "attn2.to_out.0",
+    "cross_attn.norm_q": "attn2.norm_q",
+    "cross_attn.norm_k": "attn2.norm_k",
+    "attn2.to_k_img": "attn2.add_k_proj",
+    "attn2.to_v_img": "attn2.add_v_proj",
+    "attn2.norm_k_img": "attn2.norm_added_k",
+    # S2V-specific audio component mappings (note: source keys are spelled "casual", target keys "causal")
+    "casual_audio_encoder.encoder.conv2.conv": "condition_embedder.causal_audio_encoder.encoder.conv2.conv.conv",
+    "casual_audio_encoder.encoder.conv3.conv": "condition_embedder.causal_audio_encoder.encoder.conv3.conv.conv",
+    "casual_audio_encoder.weights": "condition_embedder.causal_audio_encoder.weighted_avg.weights",
+    # Pose condition encoder mappings
+    "cond_encoder.weight": "condition_embedder.pose_embedder.weight",
+    "cond_encoder.bias": "condition_embedder.pose_embedder.bias",
+    "trainable_cond_mask": "trainable_condition_mask",
+    "patch_embedding": "motion_in.patch_embedding",
+    # Audio injector attention mappings for injector indices 0-11: q/k/v/o -> to_q/to_k/to_v/to_out.0
+    **{
+        f"audio_injector.injector.{i}.{src}": f"audio_injector.injector.{i}.{dst}"
+        for i in range(12)
+        for src, dst in [("q", "to_q"), ("k", "to_k"), ("v", "to_v"), ("o", "to_out.0")]
+    },
+}
+
 TRANSFORMER_SPECIAL_KEYS_REMAP = {}
 VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
+S2V_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
 
 
 def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
@@ -364,6 +424,36 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
         }
         RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
         SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
+    elif model_type == "Wan2.2-S2V-14B":
+        config = {
+            "model_id": "Wan-AI/Wan2.2-S2V-14B",
+            "diffusers_config": {
+                "added_kv_proj_dim": None,
+                "attention_head_dim": 128,
+                "cross_attn_norm": True,
+                "eps": 1e-06,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "in_channels": 16,
+                "num_attention_heads": 40,
+                "num_layers": 40,
+                "out_channels": 16,
+                "patch_size": [1, 2, 2],
+                "qk_norm": "rms_norm_across_heads",
+                "text_dim": 4096,
+                "audio_dim": 1024,
+                "audio_inject_layers": [0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39],
+                "enable_adain": True,
+                "adain_mode": "attn_norm",
+                "pose_dim": 16,
+                "enable_framepack": True,
+                "framepack_drop_mode": "padd",
+                "add_last_motion": True,
+                "zero_timestep": True,
+            },
+        }
+        RENAME_DICT = S2V_TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = S2V_TRANSFORMER_SPECIAL_KEYS_REMAP
     return config, RENAME_DICT, SPECIAL_KEYS_REMAP
 
 
@@ -380,7 +470,9 @@ def convert_transformer(model_type: str, stage: str = None):
     original_state_dict = load_sharded_safetensors(model_dir)
 
     with init_empty_weights():
-        if "VACE" not in model_type:
+        if "S2V" in model_type:
+            transformer = WanS2VTransformer3DModel.from_config(diffusers_config)
+        elif "VACE" not in model_type:
             transformer = WanTransformer3DModel.from_config(diffusers_config)
         else:
             transformer = WanVACETransformer3DModel.from_config(diffusers_config)
@@ -926,7 +1018,7 @@ def get_args():
 if __name__ == "__main__":
     args = get_args()
 
-    if "Wan2.2" in args.model_type and "TI2V" not in args.model_type:
+    if "Wan2.2" in args.model_type and "TI2V" not in args.model_type and "S2V" not in args.model_type:
         transformer = convert_transformer(args.model_type, stage="high_noise_model")
         transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
     else:
@@ -942,7 +1034,7 @@ def get_args():
     tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
     if "FLF2V" in args.model_type:
         flow_shift = 16.0
-    elif "TI2V" in args.model_type:
+    elif "TI2V" in args.model_type or "S2V" in args.model_type:
         flow_shift = 5.0
     else:
         flow_shift = 3.0
@@ -1016,6 +1108,22 @@ def get_args():
             vae=vae,
             scheduler=scheduler,
         )
+    elif "S2V" in args.model_type:
+        audio_encoder = Wav2Vec2ForCTC.from_pretrained(
+            "Wan-AI/Wan2.2-S2V-14B", subfolder="wav2vec2-large-xlsr-53-english"
+        )
+        audio_processor = Wav2Vec2Processor.from_pretrained(
+            "Wan-AI/Wan2.2-S2V-14B", subfolder="wav2vec2-large-xlsr-53-english"
+        )
+        pipe = WanSpeechToVideoPipeline(
+            transformer=transformer,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            vae=vae,
+            scheduler=scheduler,
+            audio_encoder=audio_encoder,
+            audio_processor=audio_processor,
+        )
     else:
         pipe = WanPipeline(
             transformer=transformer,