Skip to content

Commit e4cd847

Browse files
author
Gleb Sterkin
committed
PR review pt.2
1 parent 89668b9 commit e4cd847

File tree

12 files changed

+55
-368
lines changed

12 files changed

+55
-368
lines changed

video/wan2.1/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ Pass `--sampler euler` to use Euler sampling for step-distilled models:
9797
For text to video pipeline you can try [this 4 steps distilled model](https://huggingface.co/lightx2v/Wan2.1-Distill-Models/blob/main/wan2.1_t2v_14b_lightx2v_4step.safetensors)
9898

9999
```shell
100-
wget https://huggingface.co/lightx2v/Wan2.1-Distill-Models/blob/main/wan2.1_t2v_14b_lightx2v_4step.safetensors
100+
wget https://huggingface.co/lightx2v/Wan2.1-Distill-Models/resolve/main/wan2.1_t2v_14b_lightx2v_4step.safetensors
101101
```
102102

103103
```shell
@@ -150,5 +150,5 @@ Recommended thresholds (1.3B):
150150
|![WAN t2v 1.3B teacache=0.05](static/out_t2v_1_3b_teacache_005.gif)|![WAN t2v 1.3B teacache=0.05](static/out_t2v_1_3b_teacache_01.gif)|![WAN t2v 1.3B teacache=0.05](static/out_t2v_1_3b_teacache_025.gif)|
151151

152152
# References
153-
1. [Original WAN 2.1 implemetation](https://github.com/Wan-Video/Wan2.1)
153+
1. [Original WAN 2.1 implementation](https://github.com/Wan-Video/Wan2.1)
154154
2. [LightX2V](https://github.com/ModelTC/LightX2V)

video/wan2.1/img2video.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def quantization_predicate(name, m):
4949
)
5050
parser.add_argument(
5151
"--n-prompt",
52-
default="镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
52+
default="Text, watermarks, blurry image, JPEG artifacts",
5353
)
5454
parser.add_argument(
5555
"--teacache",

video/wan2.1/txt2video.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright © 2025 Apple Inc.
1+
# Copyright © 2026 Apple Inc.
22

33
"""Generate videos from text using Wan2.1."""
44

@@ -48,7 +48,7 @@ def quantization_predicate(name, m):
4848
)
4949
parser.add_argument(
5050
"--n-prompt",
51-
default="镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
51+
default="Text, watermarks, blurry image, JPEG artifacts",
5252
)
5353
parser.add_argument(
5454
"--teacache",

video/wan2.1/wan/layers.py

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,6 @@ def _residual_gate(x, y, gate):
2222
return x + y * gate
2323

2424

25-
class WanRMSNorm(nn.Module):
26-
def __init__(self, dim: int, eps: float = 1e-5):
27-
super().__init__()
28-
self.eps = eps
29-
self.weight = mx.ones((dim,))
30-
31-
def __call__(self, x: mx.array) -> mx.array:
32-
return mx.fast.rms_norm(x, self.weight, self.eps)
33-
34-
3525
class WanSelfAttention(nn.Module):
3626
def __init__(
3727
self,
@@ -48,10 +38,10 @@ def __init__(
4838
self.qkv = nn.Linear(dim, dim * 3)
4939
self.o = nn.Linear(dim, dim)
5040

51-
self.norm_q = WanRMSNorm(dim, eps=eps)
52-
self.norm_k = WanRMSNorm(dim, eps=eps)
41+
self.norm_q = nn.RMSNorm(dim, eps=eps)
42+
self.norm_k = nn.RMSNorm(dim, eps=eps)
5343

54-
def _attend(self, x, grid_sizes, freqs):
44+
def _attend(self, x, grid_sizes):
5545
"""Compute self-attention. Returns attn output [B, n, L, d]."""
5646
B, L, _ = x.shape
5747
n, d = self.num_heads, self.head_dim
@@ -66,17 +56,17 @@ def _attend(self, x, grid_sizes, freqs):
6656
k = k.reshape(B, L, n, d)
6757
v = v.reshape(B, L, n, d)
6858

69-
q = rope_apply(q, grid_sizes, freqs)
70-
k = rope_apply(k, grid_sizes, freqs)
59+
q = rope_apply(q, grid_sizes, self.head_dim)
60+
k = rope_apply(k, grid_sizes, self.head_dim)
7161

7262
q = q.transpose(0, 2, 1, 3)
7363
k = k.transpose(0, 2, 1, 3)
7464
v = v.transpose(0, 2, 1, 3)
7565
return mx.fast.scaled_dot_product_attention(q, k, v, scale=self.head_dim**-0.5)
7666

77-
def __call__(self, x, grid_sizes, freqs):
67+
def __call__(self, x, grid_sizes):
7868
B, L, C = x.shape
79-
attn = self._attend(x, grid_sizes, freqs)
69+
attn = self._attend(x, grid_sizes)
8070
return self.o(attn.transpose(0, 2, 1, 3).reshape(B, L, C))
8171

8272

@@ -97,8 +87,8 @@ def __init__(
9787
self.kv = nn.Linear(dim, dim * 2)
9888
self.o = nn.Linear(dim, dim)
9989

100-
self.norm_q = WanRMSNorm(dim, eps=eps)
101-
self.norm_k = WanRMSNorm(dim, eps=eps)
90+
self.norm_q = nn.RMSNorm(dim, eps=eps)
91+
self.norm_k = nn.RMSNorm(dim, eps=eps)
10292

10393
def _attend(self, x, context, context_lens):
10494
"""Compute text cross-attention. Returns (q, attn_out) both [B, n, L, d]."""
@@ -147,7 +137,7 @@ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
147137
super().__init__(dim, num_heads, eps)
148138
self.k_img = nn.Linear(dim, dim)
149139
self.v_img = nn.Linear(dim, dim)
150-
self.norm_k_img = WanRMSNorm(dim, eps=eps)
140+
self.norm_k_img = nn.RMSNorm(dim, eps=eps)
151141

152142
def __call__(self, x, context, context_lens):
153143
img_ctx_len = context.shape[1] - T5_CONTEXT_TOKEN_NUMBER
@@ -220,7 +210,6 @@ def __call__(
220210
x: mx.array,
221211
e: mx.array,
222212
grid_sizes: list,
223-
freqs: dict,
224213
context: mx.array,
225214
context_lens: Optional[mx.array],
226215
) -> mx.array:
@@ -230,7 +219,6 @@ def __call__(
230219
y = self.self_attn(
231220
mx.fast.layer_norm(x, e[0, 1], e[0, 0], self.eps),
232221
grid_sizes,
233-
freqs,
234222
)
235223
x = _residual_gate(x, y, e[:, 2])
236224

video/wan2.1/wan/model.py

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from einops import rearrange
1919

2020
from .layers import Head, WanAttentionBlock
21-
from .rope import precompute_rope_freqs
2221

2322

2423
@partial(mx.compile, shapeless=True)
@@ -101,19 +100,6 @@ def __init__(
101100
# Output head
102101
self.head = Head(dim, out_dim, patch_size, eps)
103102

104-
# Precompute RoPE frequencies (not saved in checkpoint)
105-
self._freqs = precompute_rope_freqs(
106-
max_frames=1024,
107-
max_height=1024,
108-
max_width=1024,
109-
head_dim=self.head_dim,
110-
theta=10000.0,
111-
)
112-
113-
@property
114-
def freqs(self):
115-
return self._freqs
116-
117103
def _embed_image(self, clip_fea: mx.array) -> mx.array:
118104
"""Project CLIP features through img_emb MLP."""
119105
x = self.img_emb_norm1(clip_fea)
@@ -205,7 +191,7 @@ def __call__(
205191
else:
206192
x_in = x
207193
for block in self.blocks:
208-
x = block(x, e, grid_sizes, self.freqs, context, context_lens)
194+
x = block(x, e, grid_sizes, context, context_lens)
209195
new_residual = x - x_in
210196

211197
# Output head
@@ -347,27 +333,3 @@ def _merge_qkv_weights(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
347333
merged[key] = value
348334

349335
return merged
350-
351-
352-
def create_wan_model(model_size: str = "1.3B", **kwargs) -> WanModel:
353-
configs = {
354-
"1.3B": {
355-
"dim": 1536,
356-
"ffn_dim": 8960,
357-
"freq_dim": 256,
358-
"num_heads": 12,
359-
"num_layers": 30,
360-
},
361-
"14B": {
362-
"dim": 5120,
363-
"ffn_dim": 13824,
364-
"freq_dim": 256,
365-
"num_heads": 40,
366-
"num_layers": 40,
367-
},
368-
}
369-
if model_size not in configs:
370-
raise ValueError(f"Unknown model size: {model_size}")
371-
config = configs[model_size]
372-
config.update(kwargs)
373-
return WanModel(**config)

video/wan2.1/wan/pipeline.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,8 @@ def generate_latents(
365365
else:
366366
noise_pred = noise_cond
367367

368-
# Scheduler step
368+
# Scheduler step — async_eval starts GPU work before yielding
369+
# so the caller's mx.eval(x_t) blocks for less time.
369370
x_t = sampler.step(noise_pred, t, x_t)
370371
mx.async_eval(x_t)
371372
yield x_t

video/wan2.1/wan/rope.py

Lines changed: 5 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
Uses mx.fast.rope for optimized Metal kernel.
88
"""
99

10-
from functools import partial
1110
from typing import Tuple
1211

1312
import mlx.core as mx
@@ -29,71 +28,6 @@ def get_rope_dimensions(head_dim: int) -> Tuple[int, int, int]:
2928
return frame_dim, height_dim, width_dim
3029

3130

32-
def precompute_rope_freqs(
33-
max_frames: int,
34-
max_height: int,
35-
max_width: int,
36-
head_dim: int,
37-
theta: float = 10000.0,
38-
) -> dict:
39-
"""
40-
Precompute RoPE frequencies for 3D positions.
41-
42-
Each axis gets its own frequency computation with its own dimension.
43-
"""
44-
frame_dim, height_dim, width_dim = get_rope_dimensions(head_dim)
45-
46-
dim_frame = frame_dim // 2
47-
dim_height = height_dim // 2
48-
dim_width = width_dim // 2
49-
50-
frame_inv_freq = 1.0 / (
51-
theta ** (mx.arange(0, frame_dim, 2, dtype=mx.float32) / frame_dim)
52-
)
53-
height_inv_freq = 1.0 / (
54-
theta ** (mx.arange(0, height_dim, 2, dtype=mx.float32) / height_dim)
55-
)
56-
width_inv_freq = 1.0 / (
57-
theta ** (mx.arange(0, width_dim, 2, dtype=mx.float32) / width_dim)
58-
)
59-
60-
frame_positions = mx.arange(max_frames, dtype=mx.float32)
61-
height_positions = mx.arange(max_height, dtype=mx.float32)
62-
width_positions = mx.arange(max_width, dtype=mx.float32)
63-
64-
frame_freqs = frame_positions[:, None] * frame_inv_freq[None, :]
65-
frame_cos, frame_sin = mx.cos(frame_freqs), mx.sin(frame_freqs)
66-
67-
height_freqs = height_positions[:, None] * height_inv_freq[None, :]
68-
height_cos, height_sin = mx.cos(height_freqs), mx.sin(height_freqs)
69-
70-
width_freqs = width_positions[:, None] * width_inv_freq[None, :]
71-
width_cos, width_sin = mx.cos(width_freqs), mx.sin(width_freqs)
72-
73-
return {
74-
"frame": {
75-
"cos": frame_cos,
76-
"sin": frame_sin,
77-
"dim": dim_frame,
78-
"full_dim": frame_dim,
79-
},
80-
"height": {
81-
"cos": height_cos,
82-
"sin": height_sin,
83-
"dim": dim_height,
84-
"full_dim": height_dim,
85-
},
86-
"width": {
87-
"cos": width_cos,
88-
"sin": width_sin,
89-
"dim": dim_width,
90-
"full_dim": width_dim,
91-
},
92-
"theta": theta,
93-
"head_dim": head_dim,
94-
}
95-
96-
9731
@mx.compile
9832
def _rope_3d(x, f, h, w, frame_dim, height_dim, width_dim, theta):
9933
B = x.shape[0]
@@ -129,24 +63,23 @@ def _rope_3d(x, f, h, w, frame_dim, height_dim, width_dim, theta):
12963
def rope_apply(
13064
x: mx.array,
13165
grid_sizes: list,
132-
freqs: dict,
66+
head_dim: int,
67+
theta: float = 10000.0,
13368
) -> mx.array:
13469
"""
13570
Apply 3D RoPE using mx.fast.rope with reshapes.
13671
13772
Args:
13873
x: Tensor of shape [B, L, H, D]
13974
grid_sizes: List of [frames, height, width] per batch element
140-
freqs: Precomputed frequencies from precompute_rope_freqs()
75+
head_dim: Dimension per attention head
76+
theta: RoPE base frequency
14177
14278
Returns:
14379
Rotated tensor with same shape as x
14480
"""
14581
f, h, w = grid_sizes[0]
14682

147-
theta = freqs["theta"]
148-
frame_dim = freqs["frame"]["full_dim"]
149-
height_dim = freqs["height"]["full_dim"]
150-
width_dim = freqs["width"]["full_dim"]
83+
frame_dim, height_dim, width_dim = get_rope_dimensions(head_dim)
15184

15285
return _rope_3d(x, f, h, w, frame_dim, height_dim, width_dim, theta)

video/wan2.1/wan/sampler.py

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,6 @@ def __init__(
2929
solver_order: int = 2,
3030
prediction_type: str = "flow_prediction",
3131
shift: Optional[float] = 1.0,
32-
thresholding: bool = False,
33-
dynamic_thresholding_ratio: float = 0.995,
34-
sample_max_value: float = 1.0,
3532
predict_x0: bool = True,
3633
solver_type: str = "bh2",
3734
lower_order_final: bool = True,
@@ -48,9 +45,6 @@ def __init__(
4845
self.solver_order = solver_order
4946
self.prediction_type = prediction_type
5047
self.shift = shift
51-
self.thresholding = thresholding
52-
self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
53-
self.sample_max_value = sample_max_value
5448
self.predict_x0 = predict_x0
5549
self.solver_type = solver_type
5650
self.lower_order_final = lower_order_final
@@ -111,35 +105,11 @@ def _sigma_to_alpha_sigma_t(self, sigma):
111105
return 1 - sigma, sigma
112106

113107
def convert_model_output(self, model_output, sample):
114-
sigma = self.sigmas[self.step_index]
115-
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
116-
108+
sigma_t = self.sigmas[self.step_index]
117109
if self.predict_x0:
118-
sigma_t = self.sigmas[self.step_index]
119-
x0_pred = sample - sigma_t * model_output
120-
if self.thresholding:
121-
x0_pred = self._threshold_sample(x0_pred)
122-
return x0_pred
110+
return sample - sigma_t * model_output
123111
else:
124-
sigma_t = self.sigmas[self.step_index]
125-
epsilon = sample - (1 - sigma_t) * model_output
126-
return epsilon
127-
128-
def _threshold_sample(self, sample):
129-
dtype = sample.dtype
130-
batch_size, channels, *remaining_dims = sample.shape
131-
num_elements = 1
132-
for d in remaining_dims:
133-
num_elements *= d
134-
sample = sample.reshape(batch_size, channels * num_elements)
135-
abs_sample = mx.abs(sample)
136-
sorted_abs = mx.sort(abs_sample, axis=1)
137-
quantile_idx = int(self.dynamic_thresholding_ratio * abs_sample.shape[1])
138-
s = sorted_abs[:, quantile_idx : quantile_idx + 1]
139-
s = mx.clip(s, 1.0, self.sample_max_value)
140-
sample = mx.clip(sample, -s, s) / s
141-
sample = sample.reshape(batch_size, channels, *remaining_dims)
142-
return sample.astype(dtype)
112+
return sample - (1 - sigma_t) * model_output
143113

144114
def multistep_uni_p_bh_update(self, model_output, sample, order):
145115
model_output_list = self.model_outputs
@@ -197,6 +167,8 @@ def multistep_uni_p_bh_update(self, model_output, sample, order):
197167
if order == 2:
198168
rhos_p = mx.array([0.5], dtype=x.dtype)
199169
else:
170+
# Run on CPU for numerical stability (float64 not supported on Metal GPU),
171+
# matching the reference implementation.
200172
with mx.stream(mx.cpu):
201173
rhos_p = mx.linalg.solve(R[:-1, :-1], b[:-1]).astype(x.dtype)
202174
else:
@@ -286,6 +258,8 @@ def multistep_uni_c_bh_update(
286258
if order == 1:
287259
rhos_c = mx.array([0.5], dtype=x.dtype)
288260
else:
261+
# Run on CPU for numerical stability (float64 not supported on Metal GPU),
262+
# matching the reference implementation.
289263
with mx.stream(mx.cpu):
290264
rhos_c = mx.linalg.solve(R, b).astype(x.dtype)
291265

0 commit comments

Comments (0)