Commit f9a925f

init for black-forest flux control tools, canny/depth/fill (#71)
* init for bfl control
* add missing requirements
* code purify

1 parent f3cf5ff · commit f9a925f

File tree: 17 files changed, +285 −16 lines


README.md

Lines changed: 7 additions & 0 deletions
@@ -77,6 +77,13 @@ If you have any questions or feedback, please scan the QR code below, or send em
 <img src="assets/dingtalk.png" alt="dingtalk" width="400" />
 </div>
 
+## Contributing
+We welcome contributions to DiffSynth-Engine. After installing from source, we recommend that developers set up the development environment with the following command:
+```bash
+pip install -e '.[dev]'
+```
+TODO: Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) for more details.
+
 ## License
 This project is licensed under the Apache License 2.0. See the LICENSE file for details.

diffsynth_engine/models/flux/flux_dit.py

Lines changed: 5 additions & 1 deletion
@@ -322,6 +322,7 @@ class FluxDiT(PreTrainedModel):
 
     def __init__(
         self,
+        in_channel: int = 64,
         attn_impl: Optional[str] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
@@ -336,7 +337,8 @@ def __init__(
             nn.Linear(3072, 3072, device=device, dtype=dtype),
         )
         self.context_embedder = nn.Linear(4096, 3072, device=device, dtype=dtype)
-        self.x_embedder = nn.Linear(64, 3072, device=device, dtype=dtype)
+        # normal flux has 64 channels, bfl canny and depth have 128 channels, bfl fill has 384 channels, bfl redux has 64 channels
+        self.x_embedder = nn.Linear(in_channel, 3072, device=device, dtype=dtype)
 
         self.blocks = nn.ModuleList(
             [FluxDoubleTransformerBlock(3072, 24, attn_impl=attn_impl, device=device, dtype=dtype) for _ in range(19)]
@@ -430,13 +432,15 @@ def from_state_dict(
         state_dict: Dict[str, torch.Tensor],
         device: str,
         dtype: torch.dtype,
+        in_channel: int = 64,
         attn_impl: Optional[str] = None,
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
                 cls,
                 device=device,
                 dtype=dtype,
+                in_channel=in_channel,
                 attn_impl=attn_impl,
             )
         model = model.requires_grad_(False)  # for loading gguf
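
The channel counts in the new comment line up with FLUX's patchified latent layout. A rough sanity check, assuming (not shown in this diff) a 16-channel FLUX VAE latent and 2×2 patchification; the 8×8 mask packing matches the `rearrange` added in `flux_image.py` below:

```python
# Sanity check for the in_channel values above.
# Assumptions (not in this diff): 16-channel FLUX VAE latent, 2x2 patchify.
LATENT_C = 16
PATCH = 2

normal = LATENT_C * PATCH**2                  # 64: noisy latent only
bfl_control = 2 * LATENT_C * PATCH**2         # 128: noisy latent + canny/depth latent
bfl_fill = (2 * LATENT_C + 8 * 8) * PATCH**2  # 384: + masked-image latent + 8x8-packed mask

assert (normal, bfl_control, bfl_fill) == (64, 128, 384)
```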

diffsynth_engine/pipelines/base.py

Lines changed: 1 addition & 1 deletion
@@ -4,10 +4,10 @@
 from typing import Dict, List, Tuple
 from PIL import Image
 from dataclasses import dataclass
-from diffsynth_engine.utils.loader import load_file
 from diffsynth_engine.utils.offload import enable_sequential_cpu_offload
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
 from diffsynth_engine.utils import logging
+from diffsynth_engine.utils.loader import load_file
 from diffsynth_engine.utils.platform import empty_cache
 
 logger = logging.get_logger(__name__)

diffsynth_engine/pipelines/flux_image.py

Lines changed: 60 additions & 12 deletions
@@ -1,3 +1,4 @@
+from enum import Enum
 import re
 import os
 import torch
@@ -27,6 +28,8 @@
 from diffsynth_engine.utils.download import fetch_model
 from diffsynth_engine.utils.platform import empty_cache
 
+from einops import rearrange
+
 logger = logging.get_logger(__name__)
 
 
@@ -244,11 +247,25 @@ def accumulate(result, new_item):
 ImageType = Union[Image.Image, torch.Tensor, List[Image.Image], List[torch.Tensor]]
 
 
+class ControlType(Enum):
+    normal = "normal"
+    bfl_control = "bfl_control"
+    bfl_fill = "bfl_fill"
+
+    def get_in_channel(self):
+        if self == ControlType.normal:
+            return 64
+        elif self == ControlType.bfl_control:
+            return 128
+        elif self == ControlType.bfl_fill:
+            return 384
+
+
 @dataclass
 class ControlNetParams:
-    model: nn.Module
     scale: float
     image: ImageType
+    model: Optional[nn.Module] = None
     mask: Optional[ImageType] = None
     control_start: float = 0
     control_end: float = 1
@@ -287,6 +304,7 @@ def __init__(
         vae_tiled: bool = False,
         vae_tile_size: int = 256,
         vae_tile_stride: int = 256,
+        control_type: ControlType = ControlType.normal,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -312,6 +330,7 @@
         self.batch_cfg = batch_cfg
         self.ip_adapter = None
         self.redux = None
+        self.control_type = control_type
         self.model_names = [
             "text_encoder_1",
             "text_encoder_2",
@@ -324,6 +343,7 @@
     def from_pretrained(
         cls,
         model_path_or_config: str | os.PathLike | FluxModelConfig,
+        control_type: ControlType = ControlType.normal,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
         offload_mode: str | None = None,
@@ -364,7 +384,11 @@ def from_pretrained(
         tokenizer_2 = T5TokenizerFast.from_pretrained(FLUX_TOKENIZER_2_CONF_PATH)
         with LoRAContext():
             dit = FluxDiT.from_state_dict(
-                dit_state_dict, device=init_device, dtype=model_config.dit_dtype, attn_impl=model_config.dit_attn_impl
+                dit_state_dict,
+                device=init_device,
+                dtype=model_config.dit_dtype,
+                in_channel=control_type.get_in_channel(),
+                attn_impl=model_config.dit_attn_impl,
             )
         if load_text_encoder:
             text_encoder_1 = FluxTextEncoder1.from_state_dict(
@@ -386,6 +410,7 @@
             vae_decoder=vae_decoder,
             vae_encoder=vae_encoder,
             load_text_encoder=load_text_encoder,
+            control_type=control_type,
             device=device,
             dtype=dtype,
         )
@@ -535,6 +560,12 @@ def predict_noise(
         current_step: int,
         total_step: int,
     ):
+        if self.control_type != ControlType.normal:
+            controlnet_param = controlnet_params[0]
+            latents = torch.cat((latents, controlnet_param.image * controlnet_param.scale), dim=1)
+            latents = latents.to(self.dtype)
+            controlnet_params = []
+
         double_block_output, single_block_output = self.predict_multicontrolnet(
             latents=latents,
             timestep=timestep,
@@ -547,7 +578,9 @@ def predict_noise(
             current_step=current_step,
             total_step=total_step,
         )
+
         self.load_models_to_device(["dit"])
+
         noise_pred = self.dit(
             hidden_states=latents,
             timestep=timestep,
@@ -600,16 +633,28 @@ def prepare_masked_latent(self, image: Image.Image, mask: Image.Image | None, he
             image = self.preprocess_image(image).to(device=self.device, dtype=self.dtype)
             latent = self.encode_image(image)
         else:
-            image = image.resize((width, height))
-            mask = mask.resize((width, height))
-            image = self.preprocess_image(image).to(device=self.device, dtype=self.dtype)
-            mask = self.preprocess_mask(mask).to(device=self.device, dtype=self.dtype)
-            masked_image = image.clone()
-            masked_image[(mask > 0.5).repeat(1, 3, 1, 1)] = -1
-            latent = self.encode_image(masked_image)
-            mask = torch.nn.functional.interpolate(mask, size=(latent.shape[2], latent.shape[3]))
-            mask = 1 - mask
-            latent = torch.cat([latent, mask], dim=1)
+            if self.control_type == ControlType.normal:
+                image = image.resize((width, height))
+                mask = mask.resize((width, height))
+                image = self.preprocess_image(image).to(device=self.device, dtype=self.dtype)
+                mask = self.preprocess_mask(mask).to(device=self.device, dtype=self.dtype)
+                masked_image = image.clone()
+                masked_image[(mask > 0.5).repeat(1, 3, 1, 1)] = -1
+                latent = self.encode_image(masked_image)
+                mask = torch.nn.functional.interpolate(mask, size=(latent.shape[2], latent.shape[3]))
+                mask = 1 - mask
+                latent = torch.cat([latent, mask], dim=1)
+            elif self.control_type == ControlType.bfl_fill:
+                image = image.resize((width, height))
+                mask = mask.resize((width, height))
+                image = self.preprocess_image(image).to(device=self.device, dtype=self.dtype)
+                mask = self.preprocess_mask(mask).to(device=self.device, dtype=self.dtype)
+                image = image * (1 - mask)
+                image = self.encode_image(image)
+                mask = rearrange(mask, "b 1 (h ph) (w pw) -> b (ph pw) h w", ph=8, pw=8)
+                latent = torch.cat((image, mask), dim=1)
+            else:
+                raise ValueError(f"Unsupported mask latent prepare for controlnet type: {self.control_type}")
         return latent
 
     def prepare_controlnet_params(self, controlnet_params: List[ControlNetParams], h, w):
@@ -706,6 +751,9 @@ def __call__(
         controlnet_params: List[ControlNetParams] | ControlNetParams = [],
         progress_callback: Optional[Callable] = None,  # def progress_callback(current, total, status)
     ):
+        if self.control_type != ControlType.normal:
+            assert controlnet_params and len(controlnet_params) == 1, "bfl_controlnet must have one controlnet"
+
         if input_image is not None:
             width, height = input_image.size
         if not isinstance(controlnet_params, list):
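
Taken together, these changes thread `control_type` from `from_pretrained` down to the DiT's `x_embedder` and route the single BFL condition through latent concatenation rather than a separate ControlNet (hence `model` becoming `Optional`). A hypothetical usage sketch — the pipeline class name `FluxImagePipeline`, the processor import path, the checkpoint path, and the `prompt` keyword are assumptions not shown in this diff, and the control image is assumed to be resized and encoded by `prepare_controlnet_params` before `predict_noise` concatenates it:

```python
from PIL import Image

from diffsynth_engine.pipelines.flux_image import ControlNetParams, ControlType
# hypothetical module path; the new processor file's name is not shown in this diff
from diffsynth_engine.processor.canny import CannyProcessor

# FluxImagePipeline and the checkpoint path are placeholders
pipe = FluxImagePipeline.from_pretrained(
    "path/to/flux-canny-checkpoint.safetensors",
    control_type=ControlType.bfl_control,  # builds the DiT with in_channel=128
)

# BFL control requires exactly one ControlNetParams; `model` stays None
# because the condition is concatenated onto the latents rather than run
# through a separate ControlNet.
edges = CannyProcessor(device="cuda:0")(Image.open("input.jpg"))
result = pipe(
    prompt="a cat, studio lighting",  # assumed keyword
    controlnet_params=[ControlNetParams(scale=1.0, image=edges)],
)
```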

diffsynth_engine/processor/__init__.py

Whitespace-only changes.
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import cv2
+import numpy as np
+from PIL import Image
+
+
+class CannyProcessor:
+    def __init__(
+        self,
+        device,
+        low_threshold: int = 100,
+        high_threshold: int = 200,
+    ):
+        self.device = device
+        self.low_threshold = low_threshold
+        self.high_threshold = high_threshold
+
+    def __call__(self, image: Image.Image) -> Image.Image:
+        image = np.array(image.convert("RGB"), dtype=np.uint8)
+        output_image = cv2.Canny(image, self.low_threshold, self.high_threshold)
+        output_image = Image.fromarray(output_image)
+        return output_image
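
A minimal usage sketch for the new processor. Note that `device` is stored but unused here, since `cv2.Canny` runs on the CPU; file paths are placeholders:

```python
from PIL import Image

# assuming the CannyProcessor defined above is importable
processor = CannyProcessor(device="cpu", low_threshold=100, high_threshold=200)

edges = processor(Image.open("input.jpg"))  # single-channel ("L") edge map
edges.convert("RGB").save("canny.png")      # convert if a 3-channel control image is needed
```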
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from torchvision.transforms.functional import to_tensor, normalize, resize, to_pil_image
+
+
+from diffsynth_engine.utils.download import fetch_model
+from diffsynth_engine.utils.onnx import OnnxModel
+
+
+MODEL_ID = "muse/depth_anything_detector"
+REVISION = "20240801180053"
+MODEL_NAME = "depth_anything_detector.onnx"
+
+
+class DepthProcessor:
+    def __init__(self, device):
+        self.device = device
+        model_path = fetch_model(model_uri=MODEL_ID, revision=REVISION, path=MODEL_NAME)
+        self.model = OnnxModel(model_path, device=self.device)
+
+    def _image_preprocess(self, image: Image.Image) -> np.ndarray:
+        image = resize(image, (518, 518))
+        image = to_tensor(image)
+        image = normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        image = image.unsqueeze(0).contiguous()
+        return image.numpy()
+
+    def __call__(self, img: Image.Image) -> Image.Image:
+        image = img
+        w, h = image.size
+        image = self._image_preprocess(image)
+        depth = self.model(image)
+        depth = torch.from_numpy(depth)
+        depth: torch.Tensor = F.interpolate(depth[None], (h, w), mode="bilinear", align_corners=False)
+        depth = depth.squeeze(0).squeeze(0)
+        # clamp tensor values to [0, 255] and convert to uint8
+        depth = torch.clamp(depth, 0, 255).byte()
+        # convert to a PIL Image object
+        depth = to_pil_image(depth)
+        return depth
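
The fixed 518×518 resize matches Depth Anything's ViT input resolution (a multiple of its 14-pixel patch size), and the prediction is bilinearly resized back to the source size. A minimal usage sketch with placeholder paths; `fetch_model` downloads the ONNX weights on first use:

```python
from PIL import Image

# assuming the DepthProcessor defined above is importable
processor = DepthProcessor(device="cuda:0")

depth_map = processor(Image.open("input.jpg"))  # grayscale PIL image at the input size
depth_map.save("depth.png")
```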

diffsynth_engine/utils/onnx.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+import onnxruntime
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
+class OnnxModel:
+    def __init__(self, model_path: str, device: str = "cuda:0"):
+        self.model_path = model_path
+        if "cuda" in device:
+            self.session = onnxruntime.InferenceSession(
+                model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
+            )
+        else:
+            self.session = onnxruntime.InferenceSession(model_path, providers=["CPUExecutionProvider"])
+
+    def forward(self, *args, **kwargs):
+        inputs = {}
+        for key, value in kwargs.items():
+            inputs[key] = value
+        for i, arg in enumerate(args):
+            name = self.session.get_inputs()[i].name
+            if name in inputs:
+                raise ValueError(f"the input name [{name}] is duplicated")
+            inputs[name] = arg
+        return self.session.run(None, inputs)[0]
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
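
`forward` merges keyword inputs with positional arguments bound, in order, to the session's input names, and returns only the first output. A small sketch with a placeholder model file and input shape:

```python
import numpy as np

model = OnnxModel("model.onnx", device="cpu")  # placeholder ONNX file

x = np.zeros((1, 3, 518, 518), dtype=np.float32)
y = model(x)           # positional: bound to the session's first input name
# y = model(images=x)  # keyword form works only if an input is literally named "images"
```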

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ dependencies = [
     "torchsde",
     "pillow",
     "imageio[ffmpeg]",
-    "yunchang ; sys_platform == 'linux'"
+    "yunchang ; sys_platform == 'linux'",
+    "onnxruntime"
 ]
 
 [project.optional-dependencies]

tests/common/test_case.py

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 import unittest
 import os
+import time
 import numpy as np
 import torch
 from pathlib import Path
@@ -79,7 +80,7 @@ def assertImageEqualAndSaveFailed(self, input_image: Image.Image, expect_image_p
             self.assertImageEqual(input_image, expect_image, threshold=threshold)
         except Exception as e:
             name = expect_image_path.split("/")[-1]
-            input_image.save(f"{name}")
+            input_image.save(f"save_{time.time()}_{name}")
             raise e