Skip to content

Commit 1953e57

Browse files
author
Gleb Sterkin
committed
PR review pt.2
1 parent 89668b9 commit 1953e57

File tree

12 files changed

+132
-424
lines changed

12 files changed

+132
-424
lines changed

video/wan2.1/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ Pass `--sampler euler` to use Euler sampling for step-distilled models:
9797
For text to video pipeline you can try [this 4 steps distilled model](https://huggingface.co/lightx2v/Wan2.1-Distill-Models/blob/main/wan2.1_t2v_14b_lightx2v_4step.safetensors)
9898

9999
```shell
100-
wget https://huggingface.co/lightx2v/Wan2.1-Distill-Models/blob/main/wan2.1_t2v_14b_lightx2v_4step.safetensors
100+
wget https://huggingface.co/lightx2v/Wan2.1-Distill-Models/resolve/main/wan2.1_t2v_14b_lightx2v_4step.safetensors
101101
```
102102

103103
```shell
@@ -150,5 +150,5 @@ Recommended thresholds (1.3B):
150150
|![WAN t2v 1.3B teacache=0.05](static/out_t2v_1_3b_teacache_005.gif)|![WAN t2v 1.3B teacache=0.05](static/out_t2v_1_3b_teacache_01.gif)|![WAN t2v 1.3B teacache=0.05](static/out_t2v_1_3b_teacache_025.gif)|
151151

152152
# References
153-
1. [Original WAN 2.1 implemetation](https://github.com/Wan-Video/Wan2.1)
153+
1. [Original WAN 2.1 implementation](https://github.com/Wan-Video/Wan2.1)
154154
2. [LightX2V](https://github.com/ModelTC/LightX2V)

video/wan2.1/img2video.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def quantization_predicate(name, m):
4949
)
5050
parser.add_argument(
5151
"--n-prompt",
52-
default="镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
52+
default="Text, watermarks, blurry image, JPEG artifacts",
5353
)
5454
parser.add_argument(
5555
"--teacache",

video/wan2.1/txt2video.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright © 2025 Apple Inc.
1+
# Copyright © 2026 Apple Inc.
22

33
"""Generate videos from text using Wan2.1."""
44

@@ -48,7 +48,7 @@ def quantization_predicate(name, m):
4848
)
4949
parser.add_argument(
5050
"--n-prompt",
51-
default="镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
51+
default="Text, watermarks, blurry image, JPEG artifacts",
5252
)
5353
parser.add_argument(
5454
"--teacache",

video/wan2.1/wan/layers.py

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,6 @@ def _residual_gate(x, y, gate):
2222
return x + y * gate
2323

2424

25-
class WanRMSNorm(nn.Module):
26-
def __init__(self, dim: int, eps: float = 1e-5):
27-
super().__init__()
28-
self.eps = eps
29-
self.weight = mx.ones((dim,))
30-
31-
def __call__(self, x: mx.array) -> mx.array:
32-
return mx.fast.rms_norm(x, self.weight, self.eps)
33-
34-
3525
class WanSelfAttention(nn.Module):
3626
def __init__(
3727
self,
@@ -48,10 +38,10 @@ def __init__(
4838
self.qkv = nn.Linear(dim, dim * 3)
4939
self.o = nn.Linear(dim, dim)
5040

51-
self.norm_q = WanRMSNorm(dim, eps=eps)
52-
self.norm_k = WanRMSNorm(dim, eps=eps)
41+
self.norm_q = nn.RMSNorm(dim, eps=eps)
42+
self.norm_k = nn.RMSNorm(dim, eps=eps)
5343

54-
def _attend(self, x, grid_sizes, freqs):
44+
def _attend(self, x, grid_sizes):
5545
"""Compute self-attention. Returns attn output [B, n, L, d]."""
5646
B, L, _ = x.shape
5747
n, d = self.num_heads, self.head_dim
@@ -66,17 +56,17 @@ def _attend(self, x, grid_sizes, freqs):
6656
k = k.reshape(B, L, n, d)
6757
v = v.reshape(B, L, n, d)
6858

69-
q = rope_apply(q, grid_sizes, freqs)
70-
k = rope_apply(k, grid_sizes, freqs)
59+
q = rope_apply(q, grid_sizes, self.head_dim)
60+
k = rope_apply(k, grid_sizes, self.head_dim)
7161

7262
q = q.transpose(0, 2, 1, 3)
7363
k = k.transpose(0, 2, 1, 3)
7464
v = v.transpose(0, 2, 1, 3)
7565
return mx.fast.scaled_dot_product_attention(q, k, v, scale=self.head_dim**-0.5)
7666

77-
def __call__(self, x, grid_sizes, freqs):
67+
def __call__(self, x, grid_sizes):
7868
B, L, C = x.shape
79-
attn = self._attend(x, grid_sizes, freqs)
69+
attn = self._attend(x, grid_sizes)
8070
return self.o(attn.transpose(0, 2, 1, 3).reshape(B, L, C))
8171

8272

@@ -97,8 +87,8 @@ def __init__(
9787
self.kv = nn.Linear(dim, dim * 2)
9888
self.o = nn.Linear(dim, dim)
9989

100-
self.norm_q = WanRMSNorm(dim, eps=eps)
101-
self.norm_k = WanRMSNorm(dim, eps=eps)
90+
self.norm_q = nn.RMSNorm(dim, eps=eps)
91+
self.norm_k = nn.RMSNorm(dim, eps=eps)
10292

10393
def _attend(self, x, context, context_lens):
10494
"""Compute text cross-attention. Returns (q, attn_out) both [B, n, L, d]."""
@@ -147,7 +137,7 @@ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
147137
super().__init__(dim, num_heads, eps)
148138
self.k_img = nn.Linear(dim, dim)
149139
self.v_img = nn.Linear(dim, dim)
150-
self.norm_k_img = WanRMSNorm(dim, eps=eps)
140+
self.norm_k_img = nn.RMSNorm(dim, eps=eps)
151141

152142
def __call__(self, x, context, context_lens):
153143
img_ctx_len = context.shape[1] - T5_CONTEXT_TOKEN_NUMBER
@@ -220,7 +210,6 @@ def __call__(
220210
x: mx.array,
221211
e: mx.array,
222212
grid_sizes: list,
223-
freqs: dict,
224213
context: mx.array,
225214
context_lens: Optional[mx.array],
226215
) -> mx.array:
@@ -230,7 +219,6 @@ def __call__(
230219
y = self.self_attn(
231220
mx.fast.layer_norm(x, e[0, 1], e[0, 0], self.eps),
232221
grid_sizes,
233-
freqs,
234222
)
235223
x = _residual_gate(x, y, e[:, 2])
236224

video/wan2.1/wan/model.py

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from einops import rearrange
1919

2020
from .layers import Head, WanAttentionBlock
21-
from .rope import precompute_rope_freqs
2221

2322

2423
@partial(mx.compile, shapeless=True)
@@ -101,19 +100,6 @@ def __init__(
101100
# Output head
102101
self.head = Head(dim, out_dim, patch_size, eps)
103102

104-
# Precompute RoPE frequencies (not saved in checkpoint)
105-
self._freqs = precompute_rope_freqs(
106-
max_frames=1024,
107-
max_height=1024,
108-
max_width=1024,
109-
head_dim=self.head_dim,
110-
theta=10000.0,
111-
)
112-
113-
@property
114-
def freqs(self):
115-
return self._freqs
116-
117103
def _embed_image(self, clip_fea: mx.array) -> mx.array:
118104
"""Project CLIP features through img_emb MLP."""
119105
x = self.img_emb_norm1(clip_fea)
@@ -205,7 +191,7 @@ def __call__(
205191
else:
206192
x_in = x
207193
for block in self.blocks:
208-
x = block(x, e, grid_sizes, self.freqs, context, context_lens)
194+
x = block(x, e, grid_sizes, context, context_lens)
209195
new_residual = x - x_in
210196

211197
# Output head
@@ -347,27 +333,3 @@ def _merge_qkv_weights(weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
347333
merged[key] = value
348334

349335
return merged
350-
351-
352-
def create_wan_model(model_size: str = "1.3B", **kwargs) -> WanModel:
353-
configs = {
354-
"1.3B": {
355-
"dim": 1536,
356-
"ffn_dim": 8960,
357-
"freq_dim": 256,
358-
"num_heads": 12,
359-
"num_layers": 30,
360-
},
361-
"14B": {
362-
"dim": 5120,
363-
"ffn_dim": 13824,
364-
"freq_dim": 256,
365-
"num_heads": 40,
366-
"num_layers": 40,
367-
},
368-
}
369-
if model_size not in configs:
370-
raise ValueError(f"Unknown model size: {model_size}")
371-
config = configs[model_size]
372-
config.update(kwargs)
373-
return WanModel(**config)

0 commit comments

Comments
 (0)