
Commit c788d8e

Refactor to use ControlNetUnionInput, introduce BaseInput
1 parent 68cbdb3 commit c788d8e

File tree

8 files changed (+329, -74 lines)

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions

```diff
@@ -219,6 +219,8 @@
     title: Logging
   - local: api/outputs
     title: Outputs
+  - local: api/inputs
+    title: Inputs
   - local: api/quantization
     title: Quantization
   title: Main Classes
```

docs/source/en/api/inputs.md

Lines changed: 38 additions & 0 deletions (new file)

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Inputs

Some model inputs are subclasses of [`~utils.BaseInput`], data structures containing all the information needed by the model. The inputs can also be used as tuples or dictionaries.

For example:

```python
from diffusers.models.controlnet_union import ControlNetUnionInput

union_input = ControlNetUnionInput(
    openpose=...
)
```

When treated as a tuple, the `inputs` object includes all of its attributes, including those with `None` values.

<Tip>

To check a specific pipeline or model input, refer to its corresponding API documentation.

</Tip>

## BaseInput

[[autodoc]] utils.BaseInput
    - to_tuple
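As a quick illustration of the dictionary/tuple behavior described above, here is a minimal sketch; it assumes `BaseInput` exposes the key-based access, iteration, and item assignment that the model and pipeline loops in this commit rely on:

```python
# Minimal sketch of the access patterns described above; assumes BaseInput
# supports key access, iteration over field names, item assignment, and
# to_tuple(), as the loops in this commit suggest.
from PIL import Image

from diffusers.models.controlnet_union import ControlNetUnionInput

pose = Image.new("RGB", (1024, 1024))  # stand-in conditioning image
union_input = ControlNetUnionInput(openpose=pose)

# Dictionary-style access by field name
assert union_input["openpose"] is pose

# Item assignment, as the pipeline uses when replacing raw images
# with preprocessed tensors
union_input["openpose"] = pose

# Iterating yields the field names, in declaration order
for image_type in union_input:
    print(image_type, union_input[image_type] is not None)

# The tuple view keeps all six attributes, including the None ones
assert len(union_input.to_tuple()) == 6
```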

src/diffusers/models/controlnet_union.py

Lines changed: 70 additions & 18 deletions

```diff
@@ -11,15 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
 from transformers.activations import QuickGELUActivation as QuickGELU
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..image_processor import PipelineImageInput
 from ..loaders.single_file_model import FromOriginalModelMixin
-from ..utils import logging
+from ..utils import BaseInput, logging
 from .attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
@@ -39,6 +41,52 @@
 from .unets.unet_2d_condition import UNet2DConditionModel
 
 
+@dataclass
+class ControlNetUnionInput(BaseInput):
+    """
+    The image input of [`ControlNetUnionModel`]:
+
+    - 0: openpose
+    - 1: depth
+    - 2: hed/pidi/scribble/ted
+    - 3: canny/lineart/anime_lineart/mlsd
+    - 4: normal
+    - 5: segment
+    """
+
+    openpose: PipelineImageInput = None
+    depth: PipelineImageInput = None
+    hed: PipelineImageInput = None
+    canny: PipelineImageInput = None
+    normal: PipelineImageInput = None
+    segment: PipelineImageInput = None
+
+
+@dataclass
+class ControlNetUnionInputProMax(BaseInput):
+    """
+    The image input of [`ControlNetUnionModel`] for ProMax variants:
+
+    - 0: openpose
+    - 1: depth
+    - 2: hed/pidi/scribble/ted
+    - 3: canny/lineart/anime_lineart/mlsd
+    - 4: normal
+    - 5: segment
+    - 6: tile
+    - 7: repaint
+    """
+
+    openpose: PipelineImageInput = None
+    depth: PipelineImageInput = None
+    hed: PipelineImageInput = None
+    canny: PipelineImageInput = None
+    normal: PipelineImageInput = None
+    segment: PipelineImageInput = None
+    tile: PipelineImageInput = None
+    repaint: PipelineImageInput = None
+
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -624,7 +672,8 @@ def forward(
         sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
-        controlnet_cond_list: List[torch.Tensor],
+        controlnet_cond: Union[ControlNetUnionInput, ControlNetUnionInputProMax],
+        control_type: torch.Tensor,
         conditioning_scale: float = 1.0,
         class_labels: Optional[torch.Tensor] = None,
         timestep_cond: Optional[torch.Tensor] = None,
@@ -644,8 +693,11 @@ def forward(
                 The number of timesteps to denoise an input.
             encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states.
-            controlnet_cond_list (`List[torch.Tensor]`):
-                List of the conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            controlnet_cond (`Union[ControlNetUnionInput, ControlNetUnionInputProMax]`):
+                The conditional input tensors.
+            control_type (`torch.Tensor`):
+                A tensor of shape `(batch, num_control_type)` with values `0` or `1` depending on whether the
+                control type is used.
             conditioning_scale (`float`, defaults to `1.0`):
                 The scale factor for ControlNet outputs.
             class_labels (`torch.Tensor`, *optional*, defaults to `None`):
@@ -743,7 +795,6 @@ def forward(
             add_embeds = add_embeds.to(emb.dtype)
             aug_emb = self.add_embedding(add_embeds)
 
-        control_type = added_cond_kwargs.get("control_type")
         control_embeds = self.control_type_proj(control_type.flatten())
         control_embeds = control_embeds.reshape((t_emb.shape[0], -1))
         control_embeds = control_embeds.to(emb.dtype)
@@ -753,32 +804,33 @@ def forward(
 
         # 2. pre-process
         sample = self.conv_in(sample)
-        indices = torch.nonzero(control_type[0])
 
         inputs = []
         condition_list = []
 
-        for idx in range(indices.shape[0] + 1):
-            if idx == indices.shape[0]:
-                controlnet_cond = sample
-                feat_seq = torch.mean(controlnet_cond, dim=(2, 3))
-            else:
-                controlnet_cond = self.controlnet_cond_embedding(controlnet_cond_list[indices[idx][0]])
-                feat_seq = torch.mean(controlnet_cond, dim=(2, 3))
-                feat_seq = feat_seq + self.task_embedding[indices[idx][0]]
-
+        for idx, image_type in enumerate(controlnet_cond):
+            if controlnet_cond[image_type] is None:
+                continue
+            condition = self.controlnet_cond_embedding(controlnet_cond[image_type])
+            feat_seq = torch.mean(condition, dim=(2, 3))
+            feat_seq = feat_seq + self.task_embedding[idx]
             inputs.append(feat_seq.unsqueeze(1))
-            condition_list.append(controlnet_cond)
+            condition_list.append(condition)
+
+        condition = sample
+        feat_seq = torch.mean(condition, dim=(2, 3))
+        inputs.append(feat_seq.unsqueeze(1))
+        condition_list.append(condition)
 
         x = torch.cat(inputs, dim=1)
         for layer in self.transformer_layers:
             x = layer(x)
 
         controlnet_cond_fuser = sample * 0.0
-        for idx in range(indices.shape[0]):
+        for idx, condition in enumerate(condition_list):
             alpha = self.spatial_ch_projs(x[:, idx])
             alpha = alpha.unsqueeze(-1).unsqueeze(-1)
-            controlnet_cond_fuser += condition_list[idx] + alpha
+            controlnet_cond_fuser += condition + alpha
 
         sample = sample + controlnet_cond_fuser
```
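For orientation, a minimal sketch of how the refactored `forward` receives its new arguments; the tensor shapes and the commented-out `controlnet` call below are illustrative assumptions, not part of the commit:

```python
# Sketch of the new forward() inputs; shapes are placeholder assumptions.
import torch

from diffusers.models.controlnet_union import ControlNetUnionInput

# Structured conditioning: unused control types simply stay None.
union_input = ControlNetUnionInput(
    depth=torch.randn(2, 3, 1024, 1024),  # placeholder depth-map batch
)

# control_type flags which of the six slots are active, in field order,
# matching the task_embedding indices used inside forward().
control_type = torch.tensor(
    [[0 if cond is None else 1 for cond in union_input.to_tuple()]],
    dtype=torch.float32,
)

# The model would then be called roughly like (other arguments elided):
# down_block_res_samples, mid_block_res_sample = controlnet(
#     sample,
#     timestep,
#     encoder_hidden_states=prompt_embeds,
#     controlnet_cond=union_input,
#     control_type=control_type,
#     conditioning_scale=1.0,
# )
```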

src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py

Lines changed: 22 additions & 17 deletions

```diff
@@ -40,6 +40,7 @@
     AttnProcessor2_0,
     XFormersAttnProcessor,
 )
+from ...models.controlnet_union import ControlNetUnionInput, ControlNetUnionInputProMax
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -1184,10 +1185,7 @@ def __call__(
         prompt_2: Optional[Union[str, List[str]]] = None,
         image: PipelineImageInput = None,
         mask_image: PipelineImageInput = None,
-        control_image_list: Union[
-            PipelineImageInput,
-            List[PipelineImageInput],
-        ] = None,
+        control_image_list: Union[ControlNetUnionInput, ControlNetUnionInputProMax] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         padding_mask_crop: Optional[int] = None,
@@ -1226,8 +1224,6 @@ def __call__(
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        union_control=False,
-        union_control_type=None,
         **kwargs,
     ):
         r"""
@@ -1433,12 +1429,13 @@ def __call__(
         )
 
         # 1. Check inputs
-        for control_image in control_image_list:
-            if control_image:
+        control_type = []
+        for image_type in control_image_list:
+            if control_image_list[image_type]:
                 self.check_inputs(
                     prompt,
                     prompt_2,
-                    control_image,
+                    control_image_list[image_type],
                     mask_image,
                     strength,
                     num_inference_steps,
@@ -1458,6 +1455,11 @@ def __call__(
                     callback_on_step_end_tensor_inputs,
                     padding_mask_crop,
                 )
+                control_type.append(1)
+            else:
+                control_type.append(0)
+
+        control_type = torch.Tensor(control_type)
 
         self._guidance_scale = guidance_scale
         self._clip_skip = clip_skip
@@ -1553,10 +1555,10 @@ def denoising_value_valid(dnv):
             init_image = init_image.to(dtype=torch.float32)
 
         # 5.2 Prepare control images
-        for idx in range(len(control_image_list)):
-            if control_image_list[idx]:
+        for image_type in control_image_list:
+            if control_image_list[image_type]:
                 control_image = self.prepare_control_image(
-                    image=control_image_list[idx],
+                    image=control_image_list[image_type],
                     width=width,
                     height=height,
                     batch_size=batch_size * num_images_per_prompt,
@@ -1569,7 +1571,7 @@ def denoising_value_valid(dnv):
                     guess_mode=guess_mode,
                 )
                 height, width = control_image.shape[-2:]
-                control_image_list[idx] = control_image
+                control_image_list[image_type] = control_image
 
         # 5.3 Prepare mask
         mask = self.mask_processor.preprocess(
@@ -1709,6 +1711,11 @@ def denoising_value_valid(dnv):
             num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
             timesteps = timesteps[:num_inference_steps]
 
+        control_type = (
+            control_type.reshape(1, -1)
+            .to(device, dtype=prompt_embeds.dtype)
+            .repeat(batch_size * num_images_per_prompt * 2, 1)
+        )
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
@@ -1723,9 +1730,6 @@ def denoising_value_valid(dnv):
                 added_cond_kwargs = {
                     "text_embeds": add_text_embeds,
                     "time_ids": add_time_ids,
-                    "control_type": union_control_type.reshape(1, -1)
-                    .to(device, dtype=prompt_embeds.dtype)
-                    .repeat(batch_size * num_images_per_prompt * 2, 1),
                 }
 
                 # controlnet(s) inference
@@ -1759,7 +1763,8 @@ def denoising_value_valid(dnv):
                     control_model_input,
                     t,
                     encoder_hidden_states=controlnet_prompt_embeds,
-                    controlnet_cond_list=control_image_list,
+                    controlnet_cond=control_image_list,
+                    control_type=control_type,
                     conditioning_scale=cond_scale,
                     guess_mode=guess_mode,
                     added_cond_kwargs=controlnet_added_cond_kwargs,
```
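Putting the pipeline changes together, a caller-side sketch; the pipeline and model class names are inferred from this file's path, and the checkpoint ids are illustrative assumptions rather than anything documented in the commit:

```python
# Caller-side sketch; class and checkpoint names are assumptions inferred
# from the file path and the ControlNet-Union ecosystem, not this commit.
import torch

from diffusers.models.controlnet_union import (  # names assumed
    ControlNetUnionInput,
    ControlNetUnionModel,
)
from diffusers.pipelines.controlnet.pipeline_controlnet_union_inpaint_sd_xl import (
    StableDiffusionXLControlNetUnionInpaintPipeline,  # assumed class name
)
from diffusers.utils import load_image

controlnet = ControlNetUnionModel.from_pretrained(
    "xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16  # illustrative
)
pipe = StableDiffusionXLControlNetUnionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # illustrative base checkpoint
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to("cuda")

# Fill only the slots you condition on; the pipeline derives control_type
# (one 0/1 flag per slot) from which fields are set.
union_input = ControlNetUnionInput(
    canny=load_image("canny_edges.png"),  # hypothetical local file
)

image = pipe(
    prompt="a modern living room",
    image=load_image("room.png"),            # hypothetical
    mask_image=load_image("room_mask.png"),  # hypothetical
    control_image_list=union_input,
).images[0]
```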
