Commit cd4e102

new PipelineConfig for initialization (#123)

* fix sparge_attn in long_context_attention
* new PipelineConfig for initialization
1 parent dd2029e commit cd4e102


44 files changed (+751, -620 lines)

diffsynth_engine/__init__.py

Lines changed: 10 additions & 8 deletions
@@ -1,12 +1,14 @@
+from .configs import (
+    SDPipelineConfig,
+    SDXLPipelineConfig,
+    FluxPipelineConfig,
+    WanPipelineConfig,
+)
 from .pipelines import (
     FluxImagePipeline,
     SDXLImagePipeline,
     SDImagePipeline,
     WanVideoPipeline,
-    FluxModelConfig,
-    SDXLModelConfig,
-    SDModelConfig,
-    WanModelConfig,
     ControlNetParams,
 )
 from .models.flux import FluxControlNet, FluxIPAdapter, FluxRedux
@@ -23,6 +25,10 @@
 )
 
 __all__ = [
+    "SDPipelineConfig",
+    "SDXLPipelineConfig",
+    "FluxPipelineConfig",
+    "WanPipelineConfig",
     "FluxImagePipeline",
     "FluxControlNet",
     "FluxIPAdapter",
@@ -32,10 +38,6 @@
     "SDXLImagePipeline",
     "SDImagePipeline",
     "WanVideoPipeline",
-    "FluxModelConfig",
-    "SDXLModelConfig",
-    "SDModelConfig",
-    "WanModelConfig",
     "FluxInpaintingTool",
     "FluxOutpaintingTool",
     "FluxIPAdapterRefTool",

diffsynth_engine/configs/__init__.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+from .pipeline import (
+    BaseConfig,
+    AttentionConfig,
+    OptimizationConfig,
+    ParallelConfig,
+    SDPipelineConfig,
+    SDXLPipelineConfig,
+    FluxPipelineConfig,
+    WanPipelineConfig,
+)
+from .controlnet import ControlType
+
+__all__ = [
+    "BaseConfig",
+    "AttentionConfig",
+    "OptimizationConfig",
+    "ParallelConfig",
+    "SDPipelineConfig",
+    "SDXLPipelineConfig",
+    "FluxPipelineConfig",
+    "WanPipelineConfig",
+    "ControlType",
+]

diffsynth_engine/configs/controlnet.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+from enum import Enum
+
+
+# FLUX ControlType
+class ControlType(Enum):
+    normal = "normal"
+    bfl_control = "bfl_control"
+    bfl_fill = "bfl_fill"
+    bfl_kontext = "bfl_kontext"
+
+    def get_in_channel(self):
+        if self in [ControlType.normal, ControlType.bfl_kontext]:
+            return 64
+        elif self == ControlType.bfl_control:
+            return 128
+        elif self == ControlType.bfl_fill:
+            return 384

diffsynth_engine/configs/pipeline.py

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
+import os
+import torch
+from dataclasses import dataclass, field
+from typing import List, Tuple, Optional
+
+from diffsynth_engine.configs.controlnet import ControlType
+
+
+@dataclass
+class BaseConfig:
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    model_dtype: torch.dtype
+    batch_cfg: bool = False
+    vae_tiled: bool = False
+    vae_tile_size: int | Tuple[int, int] = 256
+    vae_tile_stride: int | Tuple[int, int] = 256
+    device: str = "cuda"
+    offload_mode: Optional[str] = None
+
+
+@dataclass
+class AttentionConfig:
+    dit_attn_impl: str = "auto"
+    # Sparge Attention
+    sparge_smooth_k: bool = True
+    sparge_cdfthreshd: float = 0.6
+    sparge_simthreshd1: float = 0.98
+    sparge_pvthreshd: float = 50.0
+
+
+@dataclass
+class OptimizationConfig:
+    use_fp8_linear: bool = False
+    use_fbcache: bool = False
+    fbcache_relative_l1_threshold: float = 0.05
+
+
+@dataclass
+class ParallelConfig:
+    parallelism: int = 1
+    use_cfg_parallel: bool = False
+    cfg_degree: Optional[int] = None
+    sp_ulysses_degree: Optional[int] = None
+    sp_ring_degree: Optional[int] = None
+    tp_degree: Optional[int] = None
+    use_fsdp: bool = False
+
+
+@dataclass
+class SDPipelineConfig(BaseConfig):
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    clip_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.float16
+    clip_dtype: torch.dtype = torch.float16
+    vae_dtype: torch.dtype = torch.float32
+
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        device: str = "cuda",
+        offload_mode: Optional[str] = None,
+    ) -> "SDPipelineConfig":
+        return cls(
+            model_path=model_path,
+            device=device,
+            offload_mode=offload_mode,
+        )
+
+
+@dataclass
+class SDXLPipelineConfig(BaseConfig):
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    clip_l_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    clip_g_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.float16
+    clip_l_dtype: torch.dtype = torch.float16
+    clip_g_dtype: torch.dtype = torch.float16
+    vae_dtype: torch.dtype = torch.float32
+
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        device: str = "cuda",
+        offload_mode: Optional[str] = None,
+    ) -> "SDXLPipelineConfig":
+        return cls(
+            model_path=model_path,
+            device=device,
+            offload_mode=offload_mode,
+        )
+
+
+@dataclass
+class FluxPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    clip_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    t5_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.bfloat16
+    clip_dtype: torch.dtype = torch.bfloat16
+    t5_dtype: torch.dtype = torch.bfloat16
+    vae_dtype: torch.dtype = torch.bfloat16
+
+    load_text_encoder: bool = True
+    control_type: ControlType = ControlType.normal
+
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        device: str = "cuda",
+        parallelism: int = 1,
+        offload_mode: Optional[str] = None,
+    ) -> "FluxPipelineConfig":
+        return cls(
+            model_path=model_path,
+            device=device,
+            parallelism=parallelism,
+            use_fsdp=True,
+            offload_mode=offload_mode,
+        )
+
+    def __post_init__(self):
+        init_parallel_config(self)
+
+
+@dataclass
+class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    t5_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.bfloat16
+    t5_dtype: torch.dtype = torch.bfloat16
+    vae_dtype: torch.dtype = torch.bfloat16
+    image_encoder_dtype: torch.dtype = torch.bfloat16
+
+    shift: Optional[float] = field(default=None, init=False)  # RecifitedFlowScheduler shift factor, set by model type
+
+    # override BaseConfig
+    vae_tiled: bool = True
+    vae_tile_size: Tuple[int, int] = (34, 34)
+    vae_tile_stride: Tuple[int, int] = (18, 16)
+
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        device: str = "cuda",
+        parallelism: int = 1,
+        offload_mode: Optional[str] = None,
+    ) -> "WanPipelineConfig":
+        return cls(
+            model_path=model_path,
+            image_encoder_path=image_encoder_path,
+            device=device,
+            parallelism=parallelism,
+            use_cfg_parallel=True,
+            use_fsdp=True,
+            offload_mode=offload_mode,
+        )
+
+    def __post_init__(self):
+        init_parallel_config(self)
+
+
+def init_parallel_config(config: FluxPipelineConfig | WanPipelineConfig):
+    assert config.parallelism in (1, 2, 4, 8), "parallelism must be 1, 2, 4 or 8"
+    config.batch_cfg = True if config.parallelism > 1 and config.use_cfg_parallel else config.batch_cfg
+
+    if config.use_cfg_parallel is True and config.cfg_degree is not None:
+        raise ValueError("use_cfg_parallel and cfg_degree should not be specified together")
+    config.cfg_degree = (2 if config.use_cfg_parallel else 1) if config.cfg_degree is None else config.cfg_degree
+
+    if config.tp_degree is not None:
+        assert config.sp_ulysses_degree is None and config.sp_ring_degree is None, (
+            "not allowed to enable sequence parallel and tensor parallel together; "
+            "either set sp_ulysses_degree=None, sp_ring_degree=None or set tp_degree=None during pipeline initialization"
+        )
+        assert config.use_fsdp is False, (
+            "not allowed to enable fully sharded data parallel and tensor parallel together; "
+            "either set use_fsdp=False or set tp_degree=None during pipeline initialization"
+        )
+        assert config.parallelism == config.cfg_degree * config.tp_degree, (
+            f"parallelism ({config.parallelism}) must be equal to cfg_degree ({config.cfg_degree}) * tp_degree ({config.tp_degree})"
+        )
+        config.sp_ulysses_degree = 1
+        config.sp_ring_degree = 1
+    elif config.sp_ulysses_degree is None and config.sp_ring_degree is None:
+        # use ulysses if not specified
+        config.sp_ulysses_degree = config.parallelism // config.cfg_degree
+        config.sp_ring_degree = 1
+        config.tp_degree = 1
+    elif config.sp_ulysses_degree is not None and config.sp_ring_degree is not None:
+        assert config.parallelism == config.cfg_degree * config.sp_ulysses_degree * config.sp_ring_degree, (
+            f"parallelism ({config.parallelism}) must be equal to cfg_degree ({config.cfg_degree}) * "
+            f"sp_ulysses_degree ({config.sp_ulysses_degree}) * sp_ring_degree ({config.sp_ring_degree})"
+        )
+        config.tp_degree = 1
+    else:
+        raise ValueError("sp_ulysses_degree and sp_ring_degree must be specified together")

diffsynth_engine/models/basic/attention.py

Lines changed: 43 additions & 4 deletions
@@ -61,12 +61,33 @@ def sage_attn(q, k, v, attn_mask=None, scale=None):
 
 if SPARGE_ATTN_AVAILABLE:
     from spas_sage_attn import spas_sage2_attn_meansim_cuda
+    from spas_sage_attn.autotune import SparseAttentionMeansim
 
-    def sparge_attn(q, k, v, attn_mask=None, scale=None):
+    def sparge_attn(
+        q,
+        k,
+        v,
+        attn_mask=None,
+        scale=None,
+        smooth_k=True,
+        simthreshd1=0.6,
+        cdfthreshd=0.98,
+        pvthreshd=50,
+    ):
         q = q.transpose(1, 2)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
-        out = spas_sage2_attn_meansim_cuda(q, k, v, attn_mask=attn_mask, scale=scale)
+        out = spas_sage2_attn_meansim_cuda(
+            q,
+            k,
+            v,
+            attn_mask=attn_mask,
+            scale=scale,
+            smooth_k=smooth_k,
+            simthreshd1=simthreshd1,
+            cdfthreshd=cdfthreshd,
+            pvthreshd=pvthreshd,
+        )
         return out.transpose(1, 2)
 
 
@@ -91,6 +112,7 @@ def attention(
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
+    **kwargs,
 ):
     """
     q: [B, Lq, Nq, C1]
@@ -133,7 +155,17 @@
     elif attn_impl == "sage_attn":
         return sage_attn(q, k, v, attn_mask=attn_mask, scale=scale)
     elif attn_impl == "sparge_attn":
-        return sparge_attn(q, k, v, attn_mask=attn_mask, scale=scale)
+        return sparge_attn(
+            q,
+            k,
+            v,
+            attn_mask=attn_mask,
+            scale=scale,
+            smooth_k=kwargs.get("sparge_smooth_k", True),
+            simthreshd1=kwargs.get("sparge_simthreshd1", 0.6),
+            cdfthreshd=kwargs.get("sparge_cdfthreshd", 0.98),
+            pvthreshd=kwargs.get("sparge_pvthreshd", 50),
+        )
     else:
         raise ValueError(f"Invalid attention implementation: {attn_impl}")
 
@@ -189,6 +221,7 @@ def long_context_attention(
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
+    **kwargs,
 ):
     """
     q: [B, Lq, Nq, C1]
@@ -226,7 +259,13 @@
     elif attn_impl == "sage_attn":
        attn_func = LongContextAttention(attn_type=AttnType.SAGE_FP8)
     elif attn_impl == "sparge_attn":
-        attn_func = LongContextAttention(attn_type=AttnType.SPARSE_SAGE)
+        attn_processor = SparseAttentionMeansim()
+        # default args from spas_sage2_attn_meansim_cuda
+        attn_processor.smooth_k = torch.tensor(kwargs.get("sparge_smooth_k", True))
+        attn_processor.simthreshd1 = torch.tensor(kwargs.get("sparge_simthreshd1", 0.6))
+        attn_processor.cdfthreshd = torch.tensor(kwargs.get("sparge_cdfthreshd", 0.98))
+        attn_processor.pvthreshd = torch.tensor(kwargs.get("sparge_pvthreshd", 50))
+        attn_func = LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)
     else:
         raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
     return attn_func(q, k, v, softmax_scale=scale)
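
For completeness, a hypothetical caller-side sketch of how the new **kwargs path is meant to be fed; the sparge_* keys match the kwargs.get(...) lookups above and the values mirror their fallback defaults, but the tensor shapes are illustrative and running it requires a CUDA device with spas_sage_attn installed:

import torch
from diffsynth_engine.models.basic.attention import attention

# Illustrative shapes following the docstring convention q: [B, Lq, Nq, C1].
q = torch.randn(1, 1024, 24, 128, dtype=torch.bfloat16, device="cuda")
k = torch.randn(1, 1024, 24, 128, dtype=torch.bfloat16, device="cuda")
v = torch.randn(1, 1024, 24, 128, dtype=torch.bfloat16, device="cuda")

# The sparge_* kwargs are forwarded to sparge_attn via the new **kwargs parameter.
out = attention(
    q, k, v,
    attn_impl="sparge_attn",
    sparge_smooth_k=True,
    sparge_simthreshd1=0.6,
    sparge_cdfthreshd=0.98,
    sparge_pvthreshd=50,
)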
