Commit ea6fde6

sequence parallel + FSDP (#29)
* sequence parallel
* fix long_context_attention
* fsdp
* tp
* fix
1 parent 3888f1d commit ea6fde6

File tree

12 files changed: +549 -191 lines changed


.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.9.10
+    rev: v0.11.5
     hooks:
       # Run the linter.
       - id: ruff

diffsynth_engine/models/basic/attention.py

Lines changed: 75 additions & 12 deletions
@@ -2,6 +2,9 @@
 import torch.nn as nn
 from einops import rearrange
 from typing import Optional
+from yunchang import LongContextAttention
+from yunchang.kernels import AttnType
+
 from diffsynth_engine.utils import logging
 from diffsynth_engine.utils.flag import (
     FLASH_ATTN_3_AVAILABLE,
@@ -12,12 +15,15 @@
     SPARGE_ATTN_AVAILABLE,
 )
 
+logger = logging.get_logger(__name__)
+
+
 if FLASH_ATTN_3_AVAILABLE:
     from flash_attn_interface import flash_attn_func as flash_attn3
 if FLASH_ATTN_2_AVAILABLE:
     from flash_attn import flash_attn_func as flash_attn2
 if XFORMERS_AVAILABLE:
-    import xformers.ops.memory_efficient_attention as xformers_attn
+    from xformers.ops import memory_efficient_attention as xformers_attn
 if SDPA_AVAILABLE:
 
     def sdpa_attn(q, k, v, attn_mask=None, scale=None):
@@ -50,20 +56,28 @@ def sparge_attn(self, q, k, v, attn_mask=None, scale=None):
         return out.transpose(1, 2)
 
 
-logger = logging.get_logger(__name__)
-
-
-def eager_attn(query, key, value, attn_mask=None, scale=None):
-    scale = 1 / query.shape[-1] ** 0.5 if scale is None else scale
-    query = query * scale
-    attn = torch.matmul(query, key.transpose(-2, -1))
+def eager_attn(q, k, v, attn_mask=None, scale=None):
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    scale = 1 / q.shape[-1] ** 0.5 if scale is None else scale
+    q = q * scale
+    attn = torch.matmul(q, k.transpose(-2, -1))
     if attn_mask is not None:
         attn = attn + attn_mask
     attn = attn.softmax(-1)
-    return attn @ value
-
-
-def attention(q, k, v, attn_mask=None, attn_impl: Optional[str] = None, scale: Optional[float] = None):
+    out = attn @ v
+    return out.transpose(1, 2)
+
+
+def attention(
+    q,
+    k,
+    v,
+    attn_impl: Optional[str] = None,
+    attn_mask: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+):
     """
     q: [B, Lq, Nq, C1]
     k: [B, Lk, Nk, C1]
@@ -152,3 +166,52 @@ def forward(
         out = attention(q, k, v, attn_mask=attn_mask, attn_impl=self.attn_impl, scale=self.scale)
         out = rearrange(out, "b s n d -> b s (n d)", n=self.num_heads)
         return self.to_out(out)
+
+
+def long_context_attention(
+    q,
+    k,
+    v,
+    attn_impl: Optional[str] = None,
+    attn_mask: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+):
+    """
+    q: [B, Lq, Nq, C1]
+    k: [B, Lk, Nk, C1]
+    v: [B, Lk, Nk, C2]
+    """
+    assert attn_impl in [
+        None,
+        "auto",
+        "eager",
+        "flash_attn_2",
+        "flash_attn_3",
+        "xformers",
+        "sdpa",
+        "sage_attn",
+        "sparge_attn",
+    ]
+    if attn_impl is None or attn_impl == "auto":
+        if FLASH_ATTN_3_AVAILABLE:
+            attn_func = LongContextAttention(attn_type=AttnType.FA3)
+        elif FLASH_ATTN_2_AVAILABLE:
+            attn_func = LongContextAttention(attn_type=AttnType.FA)
+        elif SDPA_AVAILABLE:
+            attn_func = LongContextAttention(attn_type=AttnType.TORCH)
+        else:
+            raise ValueError("No available long context attention implementation")
+    else:
+        if attn_impl == "flash_attn_3":
+            attn_func = LongContextAttention(attn_type=AttnType.FA3)
+        elif attn_impl == "flash_attn_2":
+            attn_func = LongContextAttention(attn_type=AttnType.FA)
+        elif attn_impl == "sdpa":
+            attn_func = LongContextAttention(attn_type=AttnType.TORCH)
+        elif attn_impl == "sage_attn":
+            attn_func = LongContextAttention(attn_type=AttnType.SAGE_FP8)
+        elif attn_impl == "sparge_attn":
+            attn_func = LongContextAttention(attn_type=AttnType.SPARSE_SAGE)
+        else:
+            raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
+    return attn_func(q, k, v, softmax_scale=scale)
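
Note: both `attention` and the new `long_context_attention` keep the `[B, L, N, C]` layout documented in their docstrings; `long_context_attention` additionally assumes yunchang's sequence-parallel process group has been set up beforehand. A minimal single-device usage sketch of the non-parallel entry point, assuming `diffsynth_engine` is importable and the SDPA backend is flagged as available (tensor sizes are made up for illustration):

import torch

from diffsynth_engine.models.basic.attention import attention

# q/k/v follow the documented [B, L, N, C] layout; the sizes below are arbitrary.
B, L, N, C = 1, 16, 8, 64
q = torch.randn(B, L, N, C)
k = torch.randn(B, L, N, C)
v = torch.randn(B, L, N, C)

out = attention(q, k, v, attn_impl="sdpa")  # or leave attn_impl unset to let the module pick a backend
print(out.shape)  # torch.Size([1, 16, 8, 64])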

diffsynth_engine/models/wan/wan_dit.py

Lines changed: 88 additions & 23 deletions
@@ -2,19 +2,24 @@
 import json
 import torch
 import torch.nn as nn
+import torch.distributed as dist
 from typing import Tuple, Optional
 from einops import rearrange
 
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
+from diffsynth_engine.models.basic.attention import attention, long_context_attention
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import (
     WAN_DIT_1_3B_T2V_CONFIG_FILE,
     WAN_DIT_14B_I2V_CONFIG_FILE,
     WAN_DIT_14B_T2V_CONFIG_FILE,
 )
-
 from diffsynth_engine.utils.gguf import gguf_inference
-from diffsynth_engine.models.basic.attention import attention
+from diffsynth_engine.utils.parallel import (
+    get_sp_group,
+    get_sp_world_size,
+    get_sp_rank,
+)
 
 
 def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
@@ -99,7 +104,21 @@ def forward(self, x, freqs):
         q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
-        x = attention(q=rope_apply(q, freqs), k=rope_apply(k, freqs), v=v, attn_impl=self.attn_impl).flatten(2)
+        if getattr(self, "use_usp", False):
+            x = long_context_attention(
+                q=rope_apply(q, freqs),
+                k=rope_apply(k, freqs),
+                v=v,
+                attn_impl=self.attn_impl,
+            )
+        else:
+            x = attention(
+                q=rope_apply(q, freqs),
+                k=rope_apply(k, freqs),
+                v=v,
+                attn_impl=self.attn_impl,
+            )
+        x = x.flatten(2)
         return self.o(x)
 
 
@@ -259,6 +278,7 @@ def __init__(
         num_layers: int,
         has_image_input: bool,
         attn_impl: Optional[str] = None,
+        use_usp: bool = False,
         device: str = "cpu",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -301,6 +321,11 @@ def __init__(
         if has_image_input:
             self.img_emb = MLP(1280, dim, device=device, dtype=dtype)  # clip_feature_dim = 1280
 
+        if use_usp:
+            setattr(self, "use_usp", True)
+            for block in self.blocks:
+                setattr(block.self_attn, "use_usp", True)
+
     def patchify(self, x: torch.Tensor):
         x = self.patch_embedding(x)  # b c f h w -> b 4c f h/2 w/2
         grid_size = x.shape[2:]
@@ -348,15 +373,34 @@ def forward(
             .reshape(f * h * w, 1, -1)
             .to(x.device)
         )
+        if getattr(self, "use_usp", False):
+            s, p = x.size(1), get_sp_world_size()  # (sequence_length, parallelism)
+            split_size = [s // p + 1 if i < s % p else s // p for i in range(p)]
+            x = torch.split(x, split_size, dim=1)[get_sp_rank()]
+            freqs = torch.split(freqs, split_size, dim=0)[get_sp_rank()]
+
         for block in self.blocks:
             x = block(x, context, t_mod, freqs)
         x = self.head(x, t)
+
+        if getattr(self, "use_usp", False):
+            b, d = x.size(0), x.size(2)  # (batch_size, out_dim)
+            xs = [torch.zeros((b, s, d), dtype=x.dtype, device=x.device) for s in split_size]
+            dist.all_gather(xs, x, group=get_sp_group())
+            x = torch.concat(xs, dim=1)
         x = self.unpatchify(x, (f, h, w))
         return x
 
     @classmethod
     def from_state_dict(
-        cls, state_dict, device, dtype, model_type="1.3b-t2v", attn_impl: Optional[str] = None, assign=True
+        cls,
+        state_dict,
+        device,
+        dtype,
+        model_type="1.3b-t2v",
+        attn_impl: Optional[str] = None,
+        use_usp=False,
+        assign=True,
     ):
         if model_type == "1.3b-t2v":
             config = json.load(open(WAN_DIT_1_3B_T2V_CONFIG_FILE, "r"))
@@ -367,7 +411,9 @@ def from_state_dict(
         else:
             raise ValueError(f"Unsupported model type: {model_type}")
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, **config, device=device, dtype=dtype, attn_impl=attn_impl)
+            model = torch.nn.utils.skip_init(
+                cls, **config, device=device, dtype=dtype, attn_impl=attn_impl, use_usp=use_usp
+            )
             model = model.requires_grad_(False)
             model.load_state_dict(state_dict, assign=assign)
             model.to(device=device, dtype=dtype)
@@ -377,7 +423,7 @@ def get_tp_plan(self):
         from torch.distributed.tensor.parallel import (
             ColwiseParallel,
             RowwiseParallel,
-            SequenceParallel,
+            PrepareModuleInput,
             PrepareModuleOutput,
         )
         from torch.distributed.tensor import Replicate, Shard
@@ -388,45 +434,64 @@
             "time_embedding.0": ColwiseParallel(),
             "time_embedding.2": RowwiseParallel(),
             "time_projection.1": ColwiseParallel(output_layouts=Replicate()),
+            "blocks.0": PrepareModuleInput(
+                input_layouts=(Replicate(), None, None, None),
+                desired_input_layouts=(Shard(1), None, None, None),  # sequence parallel
+                use_local_output=True,
+            ),
+            "head": PrepareModuleOutput(
+                output_layouts=Shard(1),
+                desired_output_layouts=Replicate(),
+                use_local_output=True,
+            ),
         }
         for idx in range(len(self.blocks)):
             tp_plan.update(
                 {
-                    f"blocks.{idx}.norm1": SequenceParallel(use_local_output=True),
-                    f"blocks.{idx}.norm2": SequenceParallel(use_local_output=True),
-                    f"blocks.{idx}.norm3": SequenceParallel(use_local_output=True),
-                    f"blocks.{idx}.ffn.0": ColwiseParallel(),
-                    f"blocks.{idx}.ffn.2": RowwiseParallel(),
-                    f"blocks.{idx}.self_attn.q": ColwiseParallel(output_layouts=Replicate()),
-                    f"blocks.{idx}.self_attn.k": ColwiseParallel(output_layouts=Replicate()),
+                    f"blocks.{idx}.self_attn": PrepareModuleInput(
+                        input_layouts=(Shard(1), None),
+                        desired_input_layouts=(Replicate(), None),
+                    ),
+                    f"blocks.{idx}.self_attn.q": ColwiseParallel(output_layouts=Shard(1)),
+                    f"blocks.{idx}.self_attn.k": ColwiseParallel(output_layouts=Shard(1)),
                     f"blocks.{idx}.self_attn.v": ColwiseParallel(),
-                    f"blocks.{idx}.self_attn.o": RowwiseParallel(),
+                    f"blocks.{idx}.self_attn.o": RowwiseParallel(output_layouts=Shard(1)),
                     f"blocks.{idx}.self_attn.norm_q": PrepareModuleOutput(
-                        output_layouts=Replicate(),
+                        output_layouts=Shard(1),
                         desired_output_layouts=Shard(-1),
                     ),
                     f"blocks.{idx}.self_attn.norm_k": PrepareModuleOutput(
-                        output_layouts=Replicate(),
+                        output_layouts=Shard(1),
                         desired_output_layouts=Shard(-1),
                     ),
-                    f"blocks.{idx}.cross_attn.q": ColwiseParallel(output_layouts=Replicate()),
-                    f"blocks.{idx}.cross_attn.k": ColwiseParallel(output_layouts=Replicate()),
+                    f"blocks.{idx}.cross_attn": PrepareModuleInput(
+                        input_layouts=(Shard(1), None),
+                        desired_input_layouts=(Replicate(), None),
+                    ),
+                    f"blocks.{idx}.cross_attn.q": ColwiseParallel(output_layouts=Shard(1)),
+                    f"blocks.{idx}.cross_attn.k": ColwiseParallel(output_layouts=Shard(1)),
                     f"blocks.{idx}.cross_attn.v": ColwiseParallel(),
-                    f"blocks.{idx}.cross_attn.o": RowwiseParallel(),
+                    f"blocks.{idx}.cross_attn.o": RowwiseParallel(output_layouts=Shard(1)),
                     f"blocks.{idx}.cross_attn.norm_q": PrepareModuleOutput(
-                        output_layouts=Replicate(),
+                        output_layouts=Shard(1),
                         desired_output_layouts=Shard(-1),
                     ),
                     f"blocks.{idx}.cross_attn.norm_k": PrepareModuleOutput(
-                        output_layouts=Replicate(),
+                        output_layouts=Shard(1),
                         desired_output_layouts=Shard(-1),
                     ),
-                    f"blocks.{idx}.cross_attn.k_img": ColwiseParallel(output_layouts=Replicate()),
+                    f"blocks.{idx}.cross_attn.k_img": ColwiseParallel(output_layouts=Shard(1)),
                     f"blocks.{idx}.cross_attn.v_img": ColwiseParallel(),
                     f"blocks.{idx}.cross_attn.norm_k_img": PrepareModuleOutput(
-                        output_layouts=Replicate(),
+                        output_layouts=Shard(1),
                         desired_output_layouts=Shard(-1),
                     ),
+                    f"blocks.{idx}.ffn": PrepareModuleInput(
+                        input_layouts=(Shard(1),),
+                        desired_input_layouts=(Replicate(),),
+                    ),
+                    f"blocks.{idx}.ffn.0": ColwiseParallel(),
+                    f"blocks.{idx}.ffn.2": RowwiseParallel(output_layouts=Shard(1)),
                 }
            )
        return tp_plan
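
Note on the sequence-parallel path added to `forward`: the token dimension is split unevenly across the `p` sequence-parallel ranks (the first `s % p` ranks take one extra token), each rank runs the transformer blocks on its shard, and the shards are reassembled with `dist.all_gather`. A minimal single-process sketch of just that split/reassemble arithmetic, with made-up sizes:

import torch

# Made-up sizes: s tokens split across p sequence-parallel ranks.
s, p = 10, 4
split_size = [s // p + 1 if i < s % p else s // p for i in range(p)]
print(split_size)                                   # [3, 3, 2, 2] -- sums back to s

x = torch.randn(1, s, 16)                           # [batch, seq, dim]
shards = torch.split(x, split_size, dim=1)          # rank i would keep shards[i]
assert torch.equal(torch.concat(shards, dim=1), x)  # concatenating the shards restores the full sequence

In the distributed code the concatenation step is performed across processes by `dist.all_gather` into per-rank buffers sized according to `split_size`.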

diffsynth_engine/pipelines/base.py

Lines changed: 2 additions & 2 deletions
@@ -205,12 +205,12 @@ def load_models_to_device(self, load_model_names: List[str] | None = None):
         for model_name in self.model_names:
             if model_name not in load_model_names:
                 model = getattr(self, model_name)
-                if model is not None and next(model.parameters()).device != "cpu":
+                if model is not None and (p := next(model.parameters(), None)) is not None and p.device != "cpu":
                     model.to("cpu")
         # load the needed models to device
         for model_name in load_model_names:
             model = getattr(self, model_name)
-            if model is not None and next(model.parameters()).device != self.device:
+            if model is not None and (p := next(model.parameters(), None)) is not None and p.device != self.device:
                 model.to(self.device)
         # fresh the cuda cache
         torch.cuda.empty_cache()
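
This change guards against registered models that expose no parameters: `next(model.parameters())` raises `StopIteration` on an empty iterator, while `next(model.parameters(), None)` returns `None`, so the walrus-bound `p` can be checked before touching `p.device`. A tiny illustration (the bare `nn.Module()` stands in for any parameter-less model):

import torch.nn as nn

empty = nn.Module()                    # a module with no registered parameters
linear = nn.Linear(4, 4)

print(next(empty.parameters(), None))  # None -- the old next(empty.parameters()) would raise StopIteration
if (p := next(linear.parameters(), None)) is not None:
    print(p.device)                    # cpu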
