support sd1.5 controlnet

Glaceon-Hyy · Glaceon-Hyy · commit d75f65a805b5 · 2025-07-03T10:37:29.000+08:00
diff --git a/diffsynth_engine/__init__.py b/diffsynth_engine/__init__.py
@@ -10,6 +10,7 @@
     ControlNetParams,
 )
 from .models.flux import FluxControlNet, FluxIPAdapter, FluxRedux
+from .models.sd import SDControlNet
 from .utils.download import fetch_model, fetch_modelscope_model, fetch_civitai_model
 from .utils.video import load_video, save_video
 from .tools import (
@@ -25,6 +26,7 @@
     "FluxControlNet",
     "FluxIPAdapter",
     "FluxRedux",
+    "SDControlNet",
     "SDXLImagePipeline",
     "SDImagePipeline",
     "WanVideoPipeline",
diff --git a/diffsynth_engine/models/sd/__init__.py b/diffsynth_engine/models/sd/__init__.py
@@ -1,12 +1,14 @@
 from .sd_text_encoder import SDTextEncoder, config as sd_text_encoder_config
 from .sd_unet import SDUNet, config as sd_unet_config
 from .sd_vae import SDVAEDecoder, SDVAEEncoder
+from .sd_controlnet import SDControlNet
 
 __all__ = [
     "SDTextEncoder",
     "SDUNet",
     "SDVAEDecoder",
     "SDVAEEncoder",
+    "SDControlNet",
     "sd_text_encoder_config",
     "sd_unet_config",
 ]
diff --git a/diffsynth_engine/models/sd/sd_controlnet.py b/diffsynth_engine/models/sd/sd_controlnet.py
diff --git a/diffsynth_engine/models/sd/sd_unet.py b/diffsynth_engine/models/sd/sd_unet.py
@@ -264,7 +264,7 @@ def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float16):
         self.conv_act = nn.SiLU()
         self.conv_out = nn.Conv2d(320, 4, kernel_size=3, padding=1, device=device, dtype=dtype)
 
-    def forward(self, x, timestep, context, **kwargs):
+    def forward(self, x, timestep, context, controlnet_res_stack=None, **kwargs):
         # 1. time
         time_emb = self.time_embedding(timestep, dtype=x.dtype)
 
@@ -273,10 +273,18 @@ def forward(self, x, timestep, context, **kwargs):
         text_emb = context
         res_stack = [hidden_states]
 
+        controlnet_insert_block_id = 30
+
         # 3. blocks
         for i, block in enumerate(self.blocks):
+            # 3.1 UNet
             hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
 
+            # 3.2 Controlnet
+            if i == controlnet_insert_block_id and controlnet_res_stack is not None:
+                hidden_states += controlnet_res_stack.pop()
+                res_stack = [res + controlnet_res for res, controlnet_res in zip(res_stack, controlnet_res_stack)]
+
         # 4. output
         hidden_states = self.conv_norm_out(hidden_states)
         hidden_states = self.conv_act(hidden_states)
diff --git a/diffsynth_engine/pipelines/controlnet_helper.py b/diffsynth_engine/pipelines/controlnet_helper.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+from typing import Dict, List, Tuple, Union, Optional
+from PIL import Image
+from dataclasses import dataclass
+
+ImageType = Union[Image.Image, torch.Tensor, List[Image.Image], List[torch.Tensor]]
+
+@dataclass
+class ControlNetParams:  # settings for one ControlNet applied during sampling
+    scale: float  # multiplier applied to the ControlNet's output residuals
+    image: ImageType  # conditioning image(s): PIL image(s) or tensor(s)
+    model: Optional[nn.Module] = None  # module that is called to produce the residuals
+    mask: Optional[ImageType] = None  # optional mask; presumably consumed only by some pipelines -- confirm
+    control_start: float = 0  # fraction of the total steps at which control begins (compared inclusively)
+    control_end: float = 1  # fraction of the total steps at which control ends (compared inclusively)
+
+def accumulate(result, new_item):  # element-wise running sum of indexable sequences
+    if result is None:  # first contribution: adopt it as the accumulator (no copy is made)
+        return new_item
+    for i, item in enumerate(new_item):
+        result[i] += item  # updates result in place; result must have at least len(new_item) entries
+    return result
diff --git a/diffsynth_engine/pipelines/flux_image.py b/diffsynth_engine/pipelines/flux_image.py
@@ -22,6 +22,7 @@
 )
 from diffsynth_engine.models.basic.lora import LoRAContext
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
+from diffsynth_engine.pipelines.controlnet_helper import ControlNetParams, accumulate
 from diffsynth_engine.tokenizers import CLIPTokenizer, T5TokenizerFast
 from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
 from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
@@ -415,17 +416,6 @@ def calculate_shift(
     return mu
 
 
-def accumulate(result, new_item):
-    if result is None:
-        return new_item
-    for i, item in enumerate(new_item):
-        result[i] += item
-    return result
-
-
-ImageType = Union[Image.Image, torch.Tensor, List[Image.Image], List[torch.Tensor]]
-
-
 class ControlType(Enum):
     normal = "normal"
     bfl_control = "bfl_control"
@@ -439,17 +429,6 @@ def get_in_channel(self):
         elif self == ControlType.bfl_fill:
             return 384
 
-
-@dataclass
-class ControlNetParams:
-    scale: float
-    image: ImageType
-    model: Optional[nn.Module] = None
-    mask: Optional[ImageType] = None
-    control_start: float = 0
-    control_end: float = 1
-
-
 @dataclass
 class FluxModelConfig:
     dit_path: str | os.PathLike
diff --git a/diffsynth_engine/pipelines/sd_image.py b/diffsynth_engine/pipelines/sd_image.py
@@ -4,14 +4,15 @@
 import numpy as np
 from einops import repeat
 from dataclasses import dataclass
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, List
 from tqdm import tqdm
 from PIL import Image, ImageOps
 
 from diffsynth_engine.models.base import split_suffix
 from diffsynth_engine.models.basic.lora import LoRAContext
 from diffsynth_engine.models.sd import SDTextEncoder, SDVAEDecoder, SDVAEEncoder, SDUNet, sd_unet_config
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
+from diffsynth_engine.pipelines.controlnet_helper import ControlNetParams, accumulate
 from diffsynth_engine.tokenizers import CLIPTokenizer
 from diffsynth_engine.algorithm.noise_scheduler import ScaledLinearScheduler
 from diffsynth_engine.algorithm.sampler import EulerSampler
@@ -259,37 +260,100 @@ def encode_prompt(self, prompt, clip_skip):
         prompt_emb = self.text_encoder(input_ids, clip_skip=clip_skip)
         return prompt_emb
 
+    def preprocess_control_image(self, image: Image.Image, mode="RGB") -> torch.Tensor:  # PIL image -> float tensor
+        image = image.convert(mode)
+        image_array = np.array(image, dtype=np.float32)
+        if len(image_array.shape) == 2:  # 2-D array (no channel axis): append one so permute below works
+            image_array = image_array[:, :, np.newaxis]
+        image = torch.Tensor(image_array / 255).permute(2, 0, 1).unsqueeze(0)  # divide by 255, HWC -> 1xCxHxW
+        return image
+
+    def prepare_controlnet_params(self, controlnet_params: List[ControlNetParams], h, w):
+        """Return new params whose `image` is a tensor on the pipeline device/dtype; `h` and `w` are unused."""
+        results = []
+        for param in controlnet_params:
+            condition = self.preprocess_control_image(param.image).to(device=self.device, dtype=self.dtype)
+            # Copy the step window too; otherwise control_start/control_end silently reset to 0 and 1.
+            prepared = ControlNetParams(
+                model=param.model, scale=param.scale, image=condition,
+                control_start=param.control_start, control_end=param.control_end,
+            )
+            results.append(prepared)
+        return results
+
+    def predict_multicontrolnet(
+        self,
+        latents: torch.Tensor,
+        timestep: torch.Tensor,
+        prompt_emb: torch.Tensor,
+        controlnet_params: List[ControlNetParams],
+        current_step: int,
+        total_step: int,
+    ):  # returns the summed, scaled residuals of every active ControlNet, or None if none is active
+        controlnet_res_stack = None
+        if len(controlnet_params) > 0:
+            self.load_models_to_device([])  # NOTE(review): presumably offloads the pipeline's own models -- confirm the UNet is restored before it runs
+        for param in controlnet_params:
+            current_scale = param.scale
+            if not (
+                current_step >= param.control_start * total_step and current_step <= param.control_end * total_step
+            ):
+                # current_step is outside this ControlNet's [control_start, control_end] window,
+                # so skip this ControlNet
+                continue
+            if self.offload_mode is not None:  # offloading enabled: bring the ControlNet onto the device for this call
+                empty_cache()
+                param.model.to(self.device)
+            controlnet_res = param.model(
+                latents,
+                timestep,
+                prompt_emb,
+                param.image  # NOTE(review): built with batch size 1 while latents may be doubled for batched CFG -- confirm broadcasting
+            )
+            controlnet_res = [res * current_scale for res in controlnet_res]  # apply this ControlNet's scale
+            if self.offload_mode is not None:
+                empty_cache()
+                param.model.to("cpu")  # move the ControlNet back off the device
+            controlnet_res_stack = accumulate(controlnet_res_stack, controlnet_res)  # sum across ControlNets
+        return controlnet_res_stack
+
     def predict_noise_with_cfg(
         self,
         latents: torch.Tensor,
         timestep: torch.Tensor,
         positive_prompt_emb: torch.Tensor,
         negative_prompt_emb: torch.Tensor,
+        controlnet_params: List[ControlNetParams],  # ControlNets to apply; forwarded unchanged to predict_noise
+        current_step: int,  # compared against control_start/control_end * total_step
+        total_step: int,  # step count that scales the control window
         cfg_scale: float,
         batch_cfg: bool = True,
     ):
         if cfg_scale <= 1.0:
-            return self.predict_noise(latents, timestep, positive_prompt_emb)
+            return self.predict_noise(latents, timestep, positive_prompt_emb, controlnet_params, current_step, total_step)
         if not batch_cfg:
             # cfg by predict noise one by one
-            positive_noise_pred = self.predict_noise(latents, timestep, positive_prompt_emb)
-            negative_noise_pred = self.predict_noise(latents, timestep, negative_prompt_emb)
+            positive_noise_pred = self.predict_noise(latents, timestep, positive_prompt_emb, controlnet_params, current_step, total_step)
+            negative_noise_pred = self.predict_noise(latents, timestep, negative_prompt_emb, controlnet_params, current_step, total_step)
             noise_pred = negative_noise_pred + cfg_scale * (positive_noise_pred - negative_noise_pred)
             return noise_pred
         else:
             # cfg by predict noise in one batch
             prompt_emb = torch.cat([positive_prompt_emb, negative_prompt_emb], dim=0)
             latents = torch.cat([latents, latents], dim=0)
             timestep = torch.cat([timestep, timestep], dim=0)
-            positive_noise_pred, negative_noise_pred = self.predict_noise(latents, timestep, prompt_emb).chunk(2)
+            positive_noise_pred, negative_noise_pred = self.predict_noise(latents, timestep, prompt_emb, controlnet_params, current_step, total_step).chunk(2)
             noise_pred = negative_noise_pred + cfg_scale * (positive_noise_pred - negative_noise_pred)
             return noise_pred
 
-    def predict_noise(self, latents, timestep, prompt_emb):
+    def predict_noise(self, latents, timestep, prompt_emb, controlnet_params, current_step, total_step):
+        controlnet_res_stack = self.predict_multicontrolnet(latents, timestep, prompt_emb, controlnet_params, current_step, total_step)
+        # controlnet_res_stack is None when no ControlNet is active; the UNet then skips the residual injection
         noise_pred = self.unet(
             x=latents,
             timestep=timestep,
             context=prompt_emb,
+            controlnet_res_stack=controlnet_res_stack,
             device=self.device,
         )
         return noise_pred
@@ -329,8 +393,12 @@ def __call__(
         width: int = 1024,
         num_inference_steps: int = 20,
         seed: int | None = None,
+        controlnet_params: List[ControlNetParams] | ControlNetParams = [],
         progress_callback: Optional[Callable] = None,  # def progress_callback(current, total, status)
     ):
+        if not isinstance(controlnet_params, list):
+            controlnet_params = [controlnet_params]
+
         if input_image is not None:
             width, height = input_image.size
         self.validate_image_size(height, width, minimum=64, multiple_of=8)
@@ -345,6 +413,9 @@ def __call__(
         # Initialize sampler
         self.sampler.initialize(init_latents=init_latents, timesteps=timesteps, sigmas=sigmas, mask=mask)
 
+        # ControlNet
+        controlnet_params = self.prepare_controlnet_params(controlnet_params, h=height, w=width)
+
         # Encode prompts
         self.load_models_to_device(["text_encoder"])
         positive_prompt_emb = self.encode_prompt(prompt, clip_skip=clip_skip)
@@ -361,6 +432,9 @@ def __call__(
                 positive_prompt_emb=positive_prompt_emb,
                 negative_prompt_emb=negative_prompt_emb,
                 cfg_scale=cfg_scale,
+                controlnet_params=controlnet_params,
+                current_step=i,
+                total_step=len(timesteps),                
                 batch_cfg=self.batch_cfg,
             )
             # Denoise
diff --git a/diffsynth_engine/utils/flag.py b/diffsynth_engine/utils/flag.py
@@ -19,7 +19,7 @@
 else:
     logger.info("Flash attention 2 is not available")
 
-XFORMERS_AVAILABLE = importlib.util.find_spec("xformers") is not None
+XFORMERS_AVAILABLE = importlib.util.find_spec("xformers") is not None  # True when the xformers package can be located
 if XFORMERS_AVAILABLE:
     logger.info("XFormers is available")
 else:
diff --git a/tests/test_pipelines/test_sd_controlnet.py b/tests/test_pipelines/test_sd_controlnet.py
@@ -0,0 +1,41 @@
+import unittest
+
+from tests.common.test_case import ImageTestCase
+from diffsynth_engine import SDImagePipeline, SDControlNet, ControlNetParams, fetch_model
+import torch
+
+
+class TestSDControlNet(ImageTestCase):  # image test: SD pipeline output guided by a canny ControlNet
+    @classmethod
+    def setUpClass(cls):  # build the base pipeline once and share it across the tests in this class
+        model_path = fetch_model(
+            "muse/v1-5-pruned-emaonly", revision="20240118200020", path="v1-5-pruned-emaonly.safetensors"
+        )
+        cls.pipe = SDImagePipeline.from_pretrained(model_path)
+
+    def test_canny(self):  # generate at the control image's own size and compare with a stored reference
+        canny_image = self.get_input_image("canny.png")
+        controlnet = SDControlNet.from_pretrained(
+            fetch_model("lllyasviel/sd-controlnet-canny", path="diffusion_pytorch_model.safetensors"),
+            device="cuda:0",
+            dtype=torch.float16,
+        )
+        output_image = self.pipe(
+            prompt="A young girl stands gracefully at the edge of a serene beach, her long, flowing hair gently tousled by the sea breeze. She wears a soft, pastel-colored dress that complements the tranquil blues and greens of the coastal scenery. The golden hues of the setting sun cast a warm glow on her face, highlighting her serene expression. The background features a vast, azure ocean with gentle waves lapping at the shore, surrounded by distant cliffs and a clear, cloudless sky. The composition emphasizes the girl's serene presence amidst the natural beauty, with a balanced blend of warm and cool tones.",
+            height=canny_image.height,
+            width=canny_image.width,
+            num_inference_steps=30,
+            seed=42,  # fixed seed so the output can be compared with a reference image
+            controlnet_params=ControlNetParams(
+                model=controlnet,
+                scale=1.0,
+                control_end=1.0,
+                image=canny_image,
+            ),
+        )
+        # TODO: replace with an SD 1.5 canny reference image; the path below points into the flux/ directory
+        self.assertImageEqualAndSaveFailed(output_image, "flux/flux_union_pro_canny.png", threshold=0.7)
+
+        
+if __name__ == "__main__":
+    unittest.main()