
Commit 1f969ae

Fix/import vsa (#200)
* fix cycle import vsa
* fix cycle import parallel
1 parent 0590444 commit 1f969ae

10 files changed: +71 -45 lines changed


diffsynth_engine/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -12,11 +12,13 @@
     WanStateDicts,
     QwenImageStateDicts,
     AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+    LoraConfig,
     ControlNetParams,
     ControlType,
     QwenImageControlNetParams,
     QwenImageControlType,
-    LoraConfig,
 )
 from .pipelines import (
     SDImagePipeline,
@@ -59,6 +61,9 @@
     "WanStateDicts",
     "QwenImageStateDicts",
     "AttnImpl",
+    "SpargeAttentionParams",
+    "VideoSparseAttentionParams",
+    "LoraConfig",
     "ControlNetParams",
     "ControlType",
     "QwenImageControlNetParams",
@@ -79,7 +84,6 @@
     "FluxIPAdapterRefTool",
     "FluxReplaceByControlTool",
     "FluxReduxRefTool",
-    "LoraConfig",
     "fetch_model",
     "fetch_modelscope_model",
     "register_fetch_modelscope_model",

diffsynth_engine/configs/__init__.py

Lines changed: 10 additions & 6 deletions
@@ -17,14 +17,16 @@
     WanStateDicts,
     WanS2VStateDicts,
     QwenImageStateDicts,
-    LoraConfig,
     AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+    LoraConfig,
 )
 from .controlnet import (
     ControlType,
     ControlNetParams,
-    QwenImageControlNetParams,
     QwenImageControlType,
+    QwenImageControlNetParams,
 )
 
 __all__ = [
@@ -46,10 +48,12 @@
     "WanStateDicts",
     "WanS2VStateDicts",
     "QwenImageStateDicts",
-    "QwenImageControlType",
-    "QwenImageControlNetParams",
+    "AttnImpl",
+    "SpargeAttentionParams",
+    "VideoSparseAttentionParams",
+    "LoraConfig",
     "ControlType",
     "ControlNetParams",
-    "LoraConfig",
-    "AttnImpl",
+    "QwenImageControlType",
+    "QwenImageControlNetParams",
 ]

diffsynth_engine/configs/pipeline.py

Lines changed: 0 additions & 18 deletions
@@ -5,7 +5,6 @@
 from typing import List, Dict, Tuple, Optional
 
 from diffsynth_engine.configs.controlnet import ControlType
-from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs
 
 
 @dataclass
@@ -52,23 +51,6 @@ class AttentionConfig:
     dit_attn_impl: AttnImpl = AttnImpl.AUTO
     attn_params: Optional[SpargeAttentionParams | VideoSparseAttentionParams] = None
 
-    def get_attn_kwargs(self, latents: torch.Tensor, device: str) -> Dict:
-        attn_kwargs = {"attn_impl": self.dit_attn_impl.value}
-        if isinstance(self.attn_params, SpargeAttentionParams):
-            assert self.dit_attn_impl == AttnImpl.SPARGE
-            attn_kwargs.update(
-                {
-                    "smooth_k": self.attn_params.smooth_k,
-                    "simthreshd1": self.attn_params.simthreshd1,
-                    "cdfthreshd": self.attn_params.cdfthreshd,
-                    "pvthreshd": self.attn_params.pvthreshd,
-                }
-            )
-        elif isinstance(self.attn_params, VideoSparseAttentionParams):
-            assert self.dit_attn_impl == AttnImpl.VSA
-            attn_kwargs.update(get_vsa_kwargs(latents.shape[2:], (1, 2, 2), self.attn_params.sparsity, device=device))
-        return attn_kwargs
-
 
 @dataclass
 class OptimizationConfig:

diffsynth_engine/models/basic/video_sparse_attention.py

Lines changed: 4 additions & 1 deletion
@@ -2,9 +2,12 @@
 import math
 import functools
 
-from vsa import video_sparse_attn as vsa_core
+from diffsynth_engine.utils.flag import VIDEO_SPARSE_ATTN_AVAILABLE
 from diffsynth_engine.utils.parallel import get_sp_ulysses_group, get_sp_ring_world_size
 
+if VIDEO_SPARSE_ATTN_AVAILABLE:
+    from vsa import video_sparse_attn as vsa_core
+
 VSA_TILE_SIZE = (4, 4, 4)
 
 
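
The guard above only helps if VIDEO_SPARSE_ATTN_AVAILABLE can be evaluated without importing vsa itself. diffsynth_engine/utils/flag.py is not part of this diff, so the snippet below is only a sketch of how such a flag is commonly defined, not the repository's actual implementation:

    import importlib.util

    # Probe for the optional `vsa` package without importing it, so the guarded
    # `from vsa import video_sparse_attn` above is skipped when the dependency
    # is missing instead of raising ImportError at module import time.
    VIDEO_SPARSE_ATTN_AVAILABLE = importlib.util.find_spec("vsa") is not None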

diffsynth_engine/pipelines/base.py

Lines changed: 30 additions & 2 deletions
@@ -5,7 +5,15 @@
 from typing import Dict, List, Tuple, Union, Optional
 from PIL import Image
 
-from diffsynth_engine.configs import BaseConfig, BaseStateDicts, LoraConfig
+from diffsynth_engine.configs import (
+    BaseConfig,
+    BaseStateDicts,
+    LoraConfig,
+    AttnImpl,
+    SpargeAttentionParams,
+    VideoSparseAttentionParams,
+)
+from diffsynth_engine.models.basic.video_sparse_attention import get_vsa_kwargs
 from diffsynth_engine.utils.offload import enable_sequential_cpu_offload, offload_model_to_dict, restore_model_from_dict
 from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
@@ -33,6 +41,7 @@ def __init__(
         dtype=torch.float16,
     ):
         super().__init__()
+        self.config = None
         self.vae_tiled = vae_tiled
         self.vae_tile_size = vae_tile_size
         self.vae_tile_stride = vae_tile_stride
@@ -48,7 +57,7 @@ def from_pretrained(cls, model_path_or_config: str | BaseConfig) -> "BasePipeline":
         raise NotImplementedError()
 
     @classmethod
-    def from_state_dict(cls, state_dicts: BaseStateDicts, pipeline_config: BaseConfig) -> "BasePipeline":
+    def from_state_dict(cls, state_dicts: BaseStateDicts, config: BaseConfig) -> "BasePipeline":
         raise NotImplementedError()
 
     def update_weights(self, state_dicts: BaseStateDicts) -> None:
@@ -260,6 +269,25 @@ def prepare_latents(
         )
         return init_latents, latents, sigmas, timesteps
 
+    def get_attn_kwargs(self, latents: torch.Tensor) -> Dict:
+        attn_kwargs = {"attn_impl": self.config.dit_attn_impl.value}
+        if isinstance(self.config.attn_params, SpargeAttentionParams):
+            assert self.config.dit_attn_impl == AttnImpl.SPARGE
+            attn_kwargs.update(
+                {
+                    "smooth_k": self.config.attn_params.smooth_k,
+                    "simthreshd1": self.config.attn_params.simthreshd1,
+                    "cdfthreshd": self.config.attn_params.cdfthreshd,
+                    "pvthreshd": self.config.attn_params.pvthreshd,
+                }
+            )
+        elif isinstance(self.config.attn_params, VideoSparseAttentionParams):
+            assert self.config.dit_attn_impl == AttnImpl.VSA
+            attn_kwargs.update(
+                get_vsa_kwargs(latents.shape[2:], (1, 2, 2), self.config.attn_params.sparsity, device=self.device)
+            )
+        return attn_kwargs
+
     def eval(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)
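
Relocating get_attn_kwargs from AttentionConfig onto BasePipeline lets configs/pipeline.py drop its import of get_vsa_kwargs (see the hunk in that file above), which was one leg of the import cycle named in the commit title; the pipeline already knows its device, so the explicit device argument also disappears from call sites. A minimal usage sketch under the new API, assuming a pipeline configured for video sparse attention (the helper name and sparsity value are made up for illustration):

    from diffsynth_engine import AttnImpl, VideoSparseAttentionParams

    def build_vsa_attn_kwargs(pipeline, latents):
        # hypothetical helper: switch an existing pipeline's DiT attention to VSA
        pipeline.config.dit_attn_impl = AttnImpl.VSA
        pipeline.config.attn_params = VideoSparseAttentionParams(sparsity=0.9)  # assumed constructor field
        # before this commit: pipeline.config.get_attn_kwargs(latents, pipeline.device)
        # after this commit:  the pipeline owns the method and supplies its own device
        return pipeline.get_attn_kwargs(latents)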

diffsynth_engine/pipelines/flux_image.py

Lines changed: 2 additions & 2 deletions
@@ -751,7 +751,7 @@ def predict_noise(
         latents = latents.to(self.dtype)
         self.load_models_to_device(["dit"])
 
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             hidden_states=latents,
             timestep=timestep,
@@ -886,7 +886,7 @@ def predict_multicontrolnet(
             empty_cache()
             param.model.to(self.device)
 
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
         double_block_output, single_block_output = param.model(
             hidden_states=latents,
             control_condition=control_condition,

diffsynth_engine/pipelines/qwen_image.py

Lines changed: 4 additions & 2 deletions
@@ -208,7 +208,9 @@ def from_pretrained(cls, model_path_or_config: str | QwenImagePipelineConfig) ->
         )
         if config.load_encoder:
             logger.info(f"loading state dict from {config.encoder_path} ...")
-            encoder_state_dict = cls.load_model_checkpoint(config.encoder_path, device="cpu", dtype=config.encoder_dtype)
+            encoder_state_dict = cls.load_model_checkpoint(
+                config.encoder_path, device="cpu", dtype=config.encoder_dtype
+            )
 
         state_dicts = QwenImageStateDicts(
             model=model_state_dict,
@@ -547,7 +549,7 @@ def predict_noise(
         entity_masks: Optional[List[torch.Tensor]] = None,
     ):
         self.load_models_to_device(["dit"])
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             image=latents,
             edit=image_latents,

diffsynth_engine/pipelines/wan_s2v.py

Lines changed: 1 addition & 1 deletion
@@ -394,7 +394,7 @@ def predict_noise(
         void_audio_input: torch.Tensor | None = None,
     ):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
 
         noise_pred = model(
             x=latents,

diffsynth_engine/pipelines/wan_video.py

Lines changed: 8 additions & 4 deletions
@@ -144,7 +144,7 @@ def load_loras(
         lora_list: List[Tuple[str, float]],
         fused: bool = True,
         save_original_weight: bool = False,
-        lora_converter: Optional[WanLoRAConverter] = None
+        lora_converter: Optional[WanLoRAConverter] = None,
     ):
         assert self.config.tp_degree is None or self.config.tp_degree == 1, (
             "load LoRA is not allowed when tensor parallel is enabled; "
@@ -156,11 +156,15 @@ def load_loras(
         )
         super().load_loras(lora_list, fused, save_original_weight, lora_converter)
 
-    def load_loras_low_noise(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
+    def load_loras_low_noise(
+        self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False
+    ):
         assert self.dit2 is not None, "low noise LoRA can only be applied to Wan2.2"
         self.load_loras(lora_list, fused, save_original_weight, self.low_noise_lora_converter)
 
-    def load_loras_high_noise(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
+    def load_loras_high_noise(
+        self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False
+    ):
         assert self.dit2 is not None, "high noise LoRA can only be applied to Wan2.2"
         self.load_loras(lora_list, fused, save_original_weight)
 
@@ -323,7 +327,7 @@ def predict_noise_with_cfg(
 
     def predict_noise(self, model, latents, image_clip_feature, image_y, timestep, context):
         latents = latents.to(dtype=self.config.model_dtype, device=self.device)
-        attn_kwargs = self.config.get_attn_kwargs(latents, self.device)
+        attn_kwargs = self.get_attn_kwargs(latents)
 
         noise_pred = model(
             x=latents,

diffsynth_engine/utils/parallel.py

Lines changed: 6 additions & 7 deletions
@@ -19,8 +19,6 @@
 from queue import Empty
 
 import diffsynth_engine.models.basic.attention as attention_ops
-from diffsynth_engine.models import PreTrainedModel
-from diffsynth_engine.pipelines import BasePipeline
 from diffsynth_engine.utils.platform import empty_cache
 from diffsynth_engine.utils import logging
 
@@ -300,14 +298,15 @@ def _worker_loop(
             world_size=world_size,
         )
 
-    def wrap_for_parallel(module: Union[PreTrainedModel, BasePipeline]):
-        if isinstance(module, BasePipeline):
-            for model_name in module.model_names:
-                if isinstance(submodule := getattr(module, model_name), PreTrainedModel):
+    def wrap_for_parallel(module):
+        if hasattr(module, "model_names"):
+            for model_name in getattr(module, "model_names"):
+                submodule = getattr(module, model_name)
+                if getattr(submodule, "_supports_parallelization", False):
                     setattr(module, model_name, wrap_for_parallel(submodule))
             return module
 
-        if not module._supports_parallelization:
+        if not getattr(module, "_supports_parallelization", False):
             return module
 
         if tp_degree > 1:
0 commit comments
