@@ -40,7 +40,7 @@
     AttnProcessor2_0,
     XFormersAttnProcessor,
 )
-from ...models.controlnet_union import ControlNetUnionInput, ControlNetUnionInputProMax
+from ...models.controlnets import ControlNetUnionInput, ControlNetUnionInputProMax
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
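Note for downstream code: the classes move from the dedicated `controlnet_union` module to the consolidated `controlnets` module. A hedged sketch of the matching import update, assuming the relative path `...models.controlnets` corresponds to the public module `diffusers.models.controlnets` (the re-export itself is not shown in this diff):

```python
# Old internal location (before this change):
# from diffusers.models.controlnet_union import ControlNetUnionInput, ControlNetUnionInputProMax

# New location after the controlnet modules were consolidated (assumed public path):
from diffusers.models.controlnets import ControlNetUnionInput, ControlNetUnionInputProMax
```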
@@ -605,6 +605,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
         extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
     def check_image(self, image, prompt, prompt_embeds):
         image_is_pil = isinstance(image, PIL.Image.Image)
         image_is_tensor = isinstance(image, torch.Tensor)
@@ -826,6 +827,7 @@ def check_inputs(
                     f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                 )
 
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
     def prepare_control_image(
         self,
         image,
@@ -860,6 +862,7 @@ def prepare_control_image(
 
         return image
 
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_inpaint_sd_xl.StableDiffusionXLControlNetInpaintPipeline.prepare_latents
     def prepare_latents(
         self,
         batch_size,
@@ -927,6 +930,7 @@ def prepare_latents(
 
         return outputs
 
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_inpaint_sd_xl.StableDiffusionXLControlNetInpaintPipeline._encode_vae_image
     def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
         dtype = image.dtype
         if self.vae.config.force_upcast:
@@ -950,6 +954,7 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
 
         return image_latents
 
+    # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_inpaint_sd_xl.StableDiffusionXLControlNetInpaintPipeline.prepare_mask_latents
     def prepare_mask_latents(
         self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
     ):
@@ -1560,7 +1565,7 @@ def denoising_value_valid(dnv):
         latents, noise = latents_outputs
 
         # 7. Prepare mask latent variables
-        mask, masked_image_latents = self.prepare_mask_latents(
+        mask, _ = self.prepare_mask_latents(
             mask,
             masked_image,
             batch_size * num_images_per_prompt,
@@ -1573,19 +1578,7 @@ def denoising_value_valid(dnv):
         )
 
         # 8. Check that sizes of mask, masked image and latents match
-        if num_channels_unet == 9:
-            # default case for runwayml/stable-diffusion-inpainting
-            num_channels_mask = mask.shape[1]
-            num_channels_masked_image = masked_image_latents.shape[1]
-            if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
-                raise ValueError(
-                    f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                    f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
-                    " `pipeline.unet` or your `mask_image` or `image` input."
-                )
-        elif num_channels_unet != 4:
+        if num_channels_unet != 4:
             raise ValueError(
                 f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
             )
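For context, the removed branch handled 9-channel inpainting UNets, which receive the mask and masked-image latents concatenated onto the latent channels; the pipeline now only accepts plain 4-channel SDXL UNets and applies the mask by blending latents after each scheduler step instead. A minimal, hedged sketch of the channel bookkeeping (tensor shapes are illustrative only):

```python
import torch

# Illustrative shapes: batch 1, 4 latent channels, a 128x128 latent grid.
latents = torch.randn(1, 4, 128, 128)
mask = torch.rand(1, 1, 128, 128)
masked_image_latents = torch.randn(1, 4, 128, 128)

# 9-channel inpainting UNet (the removed path): 4 + 1 + 4 channels concatenated.
nine_channel_input = torch.cat([latents, mask, masked_image_latents], dim=1)
assert nine_channel_input.shape[1] == 9

# 4-channel UNet (the only path kept): the model sees the latents alone, and the
# mask is applied outside the UNet via latent blending after each step.
four_channel_input = latents
assert four_channel_input.shape[1] == 4
```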
@@ -1673,7 +1666,6 @@ def denoising_value_valid(dnv):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
 
-                # concat latents, mask, masked_image_latents in the channel dimension
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
                 added_cond_kwargs = {
@@ -1730,9 +1722,6 @@ def denoising_value_valid(dnv):
                 if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
                     added_cond_kwargs["image_embeds"] = image_embeds
 
-                if num_channels_unet == 9:
-                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
-
                 # predict the noise residual
                 noise_pred = self.unet(
                     latent_model_input,
@@ -1757,20 +1746,19 @@ def denoising_value_valid(dnv):
                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
-                if num_channels_unet == 4:
-                    init_latents_proper = image_latents
-                    if self.do_classifier_free_guidance:
-                        init_mask, _ = mask.chunk(2)
-                    else:
-                        init_mask = mask
+                init_latents_proper = image_latents
+                if self.do_classifier_free_guidance:
+                    init_mask, _ = mask.chunk(2)
+                else:
+                    init_mask = mask
 
-                    if i < len(timesteps) - 1:
-                        noise_timestep = timesteps[i + 1]
-                        init_latents_proper = self.scheduler.add_noise(
-                            init_latents_proper, noise, torch.tensor([noise_timestep])
-                        )
+                if i < len(timesteps) - 1:
+                    noise_timestep = timesteps[i + 1]
+                    init_latents_proper = self.scheduler.add_noise(
+                        init_latents_proper, noise, torch.tensor([noise_timestep])
+                    )
 
-                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+                latents = (1 - init_mask) * init_latents_proper + init_mask * latents
 
                 if callback_on_step_end is not None:
                     callback_kwargs = {}
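The last hunk makes the mask-guided blending unconditional: after every scheduler step, the region outside the mask is reset to a re-noised copy of the original image latents, so only the masked area is actually repainted. A standalone, hedged sketch of that step; the function name and its arguments are illustrative stand-ins for the pipeline's internal tensors and `scheduler.add_noise`:

```python
import torch

def blend_inpainting_step(latents, image_latents, noise, mask, timesteps, i, add_noise):
    """Keep the denoiser's output inside the mask, re-impose the original image outside it.

    `add_noise(sample, noise, t)` stands in for `scheduler.add_noise`; all tensors are
    illustrative stand-ins for the pipeline's internals.
    """
    init_latents_proper = image_latents
    if i < len(timesteps) - 1:
        # Re-noise the original latents to the noise level of the next step so the
        # blend stays consistent with the scheduler's trajectory.
        noise_timestep = timesteps[i + 1]
        init_latents_proper = add_noise(init_latents_proper, noise, torch.tensor([noise_timestep]))
    # mask == 1 inside the region to repaint (keep the denoiser output),
    # mask == 0 outside it (keep the original image content).
    return (1 - mask) * init_latents_proper + mask * latents
```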