Commit b39dbd6

parallel wrapper for pipeline/model (#102)

* parallel wrapper for pipeline/model
* fix error msg

1 parent bb38e70 commit b39dbd6

File tree: 12 files changed (+250, -212 lines)


diffsynth_engine/models/base.py

Lines changed: 7 additions & 0 deletions
@@ -14,6 +14,7 @@ def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor

 class PreTrainedModel(nn.Module):
     converter = StateDictConverter()
+    _supports_parallelization = False

     def load_state_dict(self, state_dict: Dict[str, torch.Tensor], strict: bool = True, assign: bool = False):
         state_dict = self.converter.convert(state_dict)
@@ -55,6 +56,12 @@ def unload_loras(self):
             if isinstance(module, (LoRALinear, LoRAConv2d)):
                 module.clear()

+    def get_tp_plan(self):
+        raise NotImplementedError(f"{self.__class__.__name__} does not support TP")
+
+    def get_fsdp_modules(self):
+        raise NotImplementedError(f"{self.__class__.__name__} does not support FSDP")
+

 def split_suffix(name: str):
     suffix_list = [
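
The two hooks above define the contract a model must satisfy before the new parallel wrapper can shard it. A minimal sketch of a subclass opting in (not part of this commit; the TP-plan mapping below is a hypothetical placeholder, while the FSDP list mirrors what FluxDiT and WanDiT return further down):

# Illustrative sketch only -- not from this commit. The TP-plan keys/values are
# hypothetical placeholders; the real format is whatever
# diffsynth_engine.utils.parallel expects.
from diffsynth_engine.models.base import PreTrainedModel


class MyDiT(PreTrainedModel):
    _supports_parallelization = True  # base class defaults to False

    def get_tp_plan(self):
        # Placeholder mapping for illustration only.
        return {"blocks.*.attn": "<tensor-parallel style>"}

    def get_fsdp_modules(self):
        # Attribute names of the transformer-block containers FSDP should wrap.
        return ["blocks"]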

diffsynth_engine/models/flux/flux_dit.py

Lines changed: 31 additions & 11 deletions
@@ -18,7 +18,12 @@
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.constants import FLUX_DIT_CONFIG_FILE
-from diffsynth_engine.utils.parallel import sequence_parallel, sequence_parallel_unshard
+from diffsynth_engine.utils.parallel import (
+    cfg_parallel,
+    cfg_parallel_unshard,
+    sequence_parallel,
+    sequence_parallel_unshard,
+)
 from diffsynth_engine.utils import logging

@@ -323,12 +328,12 @@ def forward(self, x, t_emb, rope_emb, image_emb=None):

 class FluxDiT(PreTrainedModel):
     converter = FluxDiTStateDictConverter()
+    _supports_parallelization = True

     def __init__(
         self,
         in_channel: int = 64,
         attn_impl: Optional[str] = None,
-        use_usp: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -354,8 +359,6 @@ def __init__(
         self.final_norm_out = AdaLayerNorm(3072, device=device, dtype=dtype)
         self.final_proj_out = nn.Linear(3072, 64, device=device, dtype=dtype)

-        self.use_usp = use_usp
-
     def patchify(self, hidden_states):
         hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
         return hidden_states
@@ -398,6 +401,8 @@ def forward(
         **kwargs,
     ):
         h, w = hidden_states.shape[-2:]
+        if image_ids is None:
+            image_ids = self.prepare_image_ids(hidden_states)
         controlnet_double_block_output = (
             controlnet_double_block_output if controlnet_double_block_output is not None else ()
         )
@@ -406,10 +411,24 @@ def forward(
         )

         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
-        with fp8_inference(fp8_linear_enabled), gguf_inference():
-            if image_ids is None:
-                image_ids = self.prepare_image_ids(hidden_states)
-
+        with (
+            fp8_inference(fp8_linear_enabled),
+            gguf_inference(),
+            cfg_parallel(
+                (
+                    hidden_states,
+                    timestep,
+                    prompt_emb,
+                    pooled_prompt_emb,
+                    image_emb,
+                    guidance,
+                    text_ids,
+                    image_ids,
+                    *controlnet_double_block_output,
+                    *controlnet_single_block_output,
+                )
+            ),
+        ):
             # warning: keep the order of time_embedding + guidance_embedding + pooled_text_embedding
             # addition of floating point numbers does not meet commutative law
             conditioning = self.time_embedder(timestep, hidden_states.dtype)
@@ -439,7 +458,6 @@ def forward(
                    *(1 for _ in controlnet_double_block_output),
                    *(1 for _ in controlnet_single_block_output),
                ),
-                enabled=self.use_usp,
            ):
                hidden_states = self.x_embedder(hidden_states)
                prompt_emb = self.context_embedder(prompt_emb)
@@ -465,6 +483,7 @@ def forward(
                (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))

            hidden_states = self.unpatchify(hidden_states, h, w)
+            (hidden_states,) = cfg_parallel_unshard((hidden_states,))
            return hidden_states

    @classmethod
@@ -475,7 +494,6 @@ def from_state_dict(
         dtype: torch.dtype,
         in_channel: int = 64,
         attn_impl: Optional[str] = None,
-        use_usp: bool = False,
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
@@ -484,9 +502,11 @@ def from_state_dict(
                 dtype=dtype,
                 in_channel=in_channel,
                 attn_impl=attn_impl,
-                use_usp=use_usp,
             )
         model = model.requires_grad_(False)  # for loading gguf
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
+
+    def get_fsdp_modules(self):
+        return ["blocks", "single_blocks"]

diffsynth_engine/models/wan/wan_dit.py

Lines changed: 17 additions & 10 deletions
@@ -16,7 +16,12 @@
     WAN_DIT_14B_FLF2V_CONFIG_FILE,
 )
 from diffsynth_engine.utils.gguf import gguf_inference
-from diffsynth_engine.utils.parallel import sequence_parallel, sequence_parallel_unshard
+from diffsynth_engine.utils.parallel import (
+    cfg_parallel,
+    cfg_parallel_unshard,
+    sequence_parallel,
+    sequence_parallel_unshard,
+)

 T5_TOKEN_NUM = 512
 FLF_TOKEN_NUM = 257 * 2
@@ -244,6 +249,7 @@ def convert(self, state_dict):

 class WanDiT(PreTrainedModel):
     converter = WanDiTStateDictConverter()
+    _supports_parallelization = True

     def __init__(
         self,
@@ -260,7 +266,6 @@ def __init__(
         has_image_input: bool,
         flf_pos_emb: bool = False,
         attn_impl: Optional[str] = None,
-        use_usp: bool = False,
         device: str = "cpu",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -303,8 +308,6 @@ def __init__(
         if has_image_input:
             self.img_emb = MLP(1280, dim, flf_pos_emb, device=device, dtype=dtype)  # clip_feature_dim = 1280

-        self.use_usp = use_usp
-
     def patchify(self, x: torch.Tensor):
         x = self.patch_embedding(x)  # b c f h w -> b 4c f h/2 w/2
         grid_size = x.shape[2:]
@@ -331,7 +334,10 @@ def forward(
         clip_feature: Optional[torch.Tensor] = None,  # clip_vision_encoder(img)
         y: Optional[torch.Tensor] = None,  # vae_encoder(img)
     ):
-        with gguf_inference():
+        with (
+            gguf_inference(),
+            cfg_parallel((x, context, timestep, clip_feature, y)),
+        ):
             t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
             t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
             context = self.text_embedding(context)
@@ -353,12 +359,13 @@ def forward(
                .to(x.device)
            )

-            with sequence_parallel([x, freqs], seq_dims=(1, 0), enabled=self.use_usp):
+            with sequence_parallel((x, freqs), seq_dims=(1, 0)):
                for block in self.blocks:
                    x = block(x, context, t_mod, freqs)
                x = self.head(x, t)
                (x,) = sequence_parallel_unshard((x,), seq_dims=(1,), seq_lens=(f * h * w,))
            x = self.unpatchify(x, (f, h, w))
+            (x,) = cfg_parallel_unshard((x,))
            return x

    @classmethod
@@ -369,7 +376,6 @@ def from_state_dict(
         dtype,
         model_type="1.3b-t2v",
         attn_impl: Optional[str] = None,
-        use_usp=False,
         assign=True,
     ):
         if model_type == "1.3b-t2v":
@@ -383,9 +389,7 @@ def from_state_dict(
         else:
             raise ValueError(f"Unsupported model type: {model_type}")
         with no_init_weights():
-            model = torch.nn.utils.skip_init(
-                cls, **config, device=device, dtype=dtype, attn_impl=attn_impl, use_usp=use_usp
-            )
+            model = torch.nn.utils.skip_init(cls, **config, device=device, dtype=dtype, attn_impl=attn_impl)
         model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=assign)
         model.to(device=device, dtype=dtype)
@@ -467,3 +471,6 @@ def get_tp_plan(self):
             }
         )
         return tp_plan
+
+    def get_fsdp_modules(self):
+        return ["blocks"]

diffsynth_engine/pipelines/base.py

Lines changed: 7 additions & 10 deletions
@@ -31,7 +31,7 @@ def __init__(
         vae_tiled: bool = False,
         vae_tile_size: int = -1,
         vae_tile_stride: int = -1,
-        device="cuda:0",
+        device="cuda",
         dtype=torch.float16,
     ):
         super().__init__()
@@ -47,15 +47,15 @@
     def from_pretrained(
         cls,
         model_path_or_config: str | os.PathLike | ModelConfig,
-        device: str = "cuda:0",
+        device: str = "cuda",
         dtype: torch.dtype = torch.float16,
         offload_mode: str | None = None,
     ) -> "BasePipeline":
         raise NotImplementedError()

     @classmethod
     def from_state_dict(
-        cls, state_dict: Dict[str, torch.Tensor], device: str = "cuda:0", dtype: torch.dtype = torch.float16
+        cls, state_dict: Dict[str, torch.Tensor], device: str = "cuda", dtype: torch.dtype = torch.float16
     ) -> "BasePipeline":
         raise NotImplementedError()

@@ -269,21 +269,18 @@ def enable_cpu_offload(self, offload_mode: str):
            logger.warning("must set an non cpu device for pipeline before calling enable_cpu_offload")
            return
        if offload_mode == "cpu_offload":
-            self.enable_model_cpu_offload()
+            self._enable_model_cpu_offload()
        elif offload_mode == "sequential_cpu_offload":
-            self.enable_sequential_cpu_offload()
+            self._enable_sequential_cpu_offload()

-    def enable_model_cpu_offload(self):
+    def _enable_model_cpu_offload(self):
        for model_name in self.model_names:
            model = getattr(self, model_name)
            if model is not None:
                model.to("cpu")
        self.offload_mode = "cpu_offload"

-    def enable_sequential_cpu_offload(self):
-        if self.device == "cpu" or self.device == "mps":
-            logger.warning("must set an non cpu device for pipeline before calling enable_sequential_cpu_offload")
-            return
+    def _enable_sequential_cpu_offload(self):
        for model_name in self.model_names:
            model = getattr(self, model_name)
            if model is not None:
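
On the pipeline side, the public surface is now enable_cpu_offload(offload_mode), with the per-mode methods demoted to private helpers, and the default device string drops the explicit index ("cuda" instead of "cuda:0"), presumably so a per-rank device can be chosen by the wrapper. A usage sketch against the base signatures above; MyPipeline and the checkpoint path are placeholders for a concrete BasePipeline subclass and a real model:

# Usage sketch; MyPipeline and the path below are placeholders, not real names.
import torch

pipe = MyPipeline.from_pretrained(
    "path/to/model.safetensors",   # placeholder checkpoint
    device="cuda",                 # new default; was "cuda:0"
    dtype=torch.float16,
    offload_mode="cpu_offload",    # or "sequential_cpu_offload", or None
)

# Equivalent after construction; dispatches to the private helpers:
pipe.enable_cpu_offload("sequential_cpu_offload")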
