huggingface
diff --git a/examples/community/README_community_scripts.md b/examples/community/README_community_scripts.md
Lines changed: 149 additions & 27 deletions
diff --git a/src/diffusers/models/controlnets/__init__.py b/src/diffusers/models/controlnets/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/diffusers/models/controlnets/controlnet_union.py b/src/diffusers/models/controlnets/controlnet_union.py
Lines changed: 8 additions & 93 deletions
@@ -241,7 +241,45 @@ from diffusers import StableDiffusionPipeline
 from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
 from diffusers.configuration_utils import register_to_config
 import torch
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Tuple, Union
+
+
+class SDPromptSchedulingCallback(PipelineCallback):
+    @register_to_config
+    def __init__(
+        self,
+        encoded_prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        cutoff_step_ratio=None,
+        cutoff_step_index=None,
+    ):
+        super().__init__(
+            cutoff_step_ratio=cutoff_step_ratio, cutoff_step_index=cutoff_step_index
+        )
+
+    tensor_inputs = ["prompt_embeds"]
+
+    def callback_fn(
+        self, pipeline, step_index, timestep, callback_kwargs
+    ) -> Dict[str, Any]:
+        cutoff_step_ratio = self.config.cutoff_step_ratio
+        cutoff_step_index = self.config.cutoff_step_index
+        if isinstance(self.config.encoded_prompt, tuple):
+            prompt_embeds, negative_prompt_embeds = self.config.encoded_prompt
+        else:
+            prompt_embeds = self.config.encoded_prompt
+
+        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
+        cutoff_step = (
+            cutoff_step_index
+            if cutoff_step_index is not None
+            else int(pipeline.num_timesteps * cutoff_step_ratio)
+        )
+
+        if step_index == cutoff_step:
+            if pipeline.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
+        return callback_kwargs
 
 
 pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
@@ -253,28 +291,73 @@ pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
 pipeline.safety_checker = None
 pipeline.requires_safety_checker = False
 
+callback = MultiPipelineCallbacks(
+    [
+        SDPromptSchedulingCallback(
+            encoded_prompt=pipeline.encode_prompt(
+                prompt=f"prompt {index}",
+                negative_prompt=f"negative prompt {index}",
+                device=pipeline._execution_device,
+                num_images_per_prompt=1,
+                # pipeline.do_classifier_free_guidance can't be accessed until after the pipeline has run
+                do_classifier_free_guidance=True,
+            ),
+            cutoff_step_index=index,
+        ) for index in range(1, 20)
+    ]
+)
+
+image = pipeline(
+    prompt="prompt",
+    negative_prompt="negative prompt",
+    callback_on_step_end=callback,
+    callback_on_step_end_tensor_inputs=["prompt_embeds"],
+).images[0]
+torch.cuda.empty_cache()
+image.save('image.png')
+```
 
-class SDPromptScheduleCallback(PipelineCallback):
+```python
+from diffusers import StableDiffusionXLPipeline
+from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
+from diffusers.configuration_utils import register_to_config
+import torch
+from typing import Any, Dict, Tuple, Union
+
+
+class SDXLPromptSchedulingCallback(PipelineCallback):
     @register_to_config
     def __init__(
         self,
-        prompt: str,
-        negative_prompt: Optional[str] = None,
-        num_images_per_prompt: int = 1,
-        cutoff_step_ratio=1.0,
+        encoded_prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        add_text_embeds: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        add_time_ids: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        cutoff_step_ratio=None,
         cutoff_step_index=None,
     ):
         super().__init__(
             cutoff_step_ratio=cutoff_step_ratio, cutoff_step_index=cutoff_step_index
         )
 
-    tensor_inputs = ["prompt_embeds"]
+    tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
 
     def callback_fn(
         self, pipeline, step_index, timestep, callback_kwargs
     ) -> Dict[str, Any]:
         cutoff_step_ratio = self.config.cutoff_step_ratio
         cutoff_step_index = self.config.cutoff_step_index
+        if isinstance(self.config.encoded_prompt, tuple):
+            prompt_embeds, negative_prompt_embeds = self.config.encoded_prompt
+        else:
+            prompt_embeds = self.config.encoded_prompt
+        if isinstance(self.config.add_text_embeds, tuple):
+            add_text_embeds, negative_add_text_embeds = self.config.add_text_embeds
+        else:
+            add_text_embeds = self.config.add_text_embeds
+        if isinstance(self.config.add_time_ids, tuple):
+            add_time_ids, negative_add_time_ids = self.config.add_time_ids
+        else:
+            add_time_ids = self.config.add_time_ids
 
         # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
         cutoff_step = (
@@ -284,34 +367,73 @@ class SDPromptScheduleCallback(PipelineCallback):
         )
 
         if step_index == cutoff_step:
-            prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
-                prompt=self.config.prompt,
-                negative_prompt=self.config.negative_prompt,
-                device=pipeline._execution_device,
-                num_images_per_prompt=self.config.num_images_per_prompt,
-                do_classifier_free_guidance=pipeline.do_classifier_free_guidance,
-            )
             if pipeline.do_classifier_free_guidance:
                 prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+                add_text_embeds = torch.cat([negative_add_text_embeds, add_text_embeds])
+                add_time_ids = torch.cat([negative_add_time_ids, add_time_ids])
             callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
+            callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
+            callback_kwargs[self.tensor_inputs[2]] = add_time_ids
         return callback_kwargs
 
-callback = MultiPipelineCallbacks(
-    [
-        SDPromptScheduleCallback(
-            prompt="Official portrait of a smiling world war ii general, female, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
-            negative_prompt="Deformed, ugly, bad anatomy",
-            cutoff_step_ratio=0.25,
+
+pipeline: StableDiffusionXLPipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+    use_safetensors=True,
+).to("cuda")
+
+callbacks = []
+for index in range(1, 20):
+    (
+        prompt_embeds,
+        negative_prompt_embeds,
+        pooled_prompt_embeds,
+        negative_pooled_prompt_embeds,
+    ) = pipeline.encode_prompt(
+        prompt=f"prompt {index}",
+        negative_prompt=f"negative prompt {index}",
+        device=pipeline._execution_device,
+        num_images_per_prompt=1,
+        # pipeline.do_classifier_free_guidance can't be accessed until after the pipeline has run
+        do_classifier_free_guidance=True,
+    )
+    text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+    add_time_ids = pipeline._get_add_time_ids(
+        (1024, 1024),
+        (0, 0),
+        (1024, 1024),
+        dtype=prompt_embeds.dtype,
+        text_encoder_projection_dim=text_encoder_projection_dim,
+    )
+    negative_add_time_ids = pipeline._get_add_time_ids(
+        (1024, 1024),
+        (0, 0),
+        (1024, 1024),
+        dtype=prompt_embeds.dtype,
+        text_encoder_projection_dim=text_encoder_projection_dim,
+    )
+    callbacks.append(
+        SDXLPromptSchedulingCallback(
+            encoded_prompt=(prompt_embeds, negative_prompt_embeds),
+            add_text_embeds=(pooled_prompt_embeds, negative_pooled_prompt_embeds),
+            add_time_ids=(add_time_ids, negative_add_time_ids),
+            cutoff_step_index=index,
         )
-    ]
-)
+    )
+
+
+callback = MultiPipelineCallbacks(callbacks)
 
 image = pipeline(
-    prompt="Official portrait of a smiling world war ii general, male, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
-    negative_prompt="Deformed, ugly, bad anatomy",
+    prompt="prompt",
+    negative_prompt="negative prompt",
     callback_on_step_end=callback,
-    callback_on_step_end_tensor_inputs=["prompt_embeds"],
+    callback_on_step_end_tensor_inputs=[
+        "prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+    ],
 ).images[0]
-torch.cuda.empty_cache()
-image.save('image.png')
 ```
@@ -15,7 +15,7 @@
         SparseControlNetModel,
         SparseControlNetOutput,
     )
-    from .controlnet_union import ControlNetUnionInput, ControlNetUnionInputProMax, ControlNetUnionModel
+    from .controlnet_union import ControlNetUnionModel
     from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel
     from .multicontrolnet import MultiControlNetModel
 
 
@@ -11,14 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...image_processor import PipelineImageInput
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import logging
 from ..attention_processor import (
@@ -40,76 +38,6 @@
 from .controlnet import ControlNetConditioningEmbedding, ControlNetOutput, zero_module
 
 
-@dataclass
-class ControlNetUnionInput:
-    """
-    The image input of [`ControlNetUnionModel`]:
-
-    - 0: openpose
-    - 1: depth
-    - 2: hed/pidi/scribble/ted
-    - 3: canny/lineart/anime_lineart/mlsd
-    - 4: normal
-    - 5: segment
-    """
-
-    openpose: Optional[PipelineImageInput] = None
-    depth: Optional[PipelineImageInput] = None
-    hed: Optional[PipelineImageInput] = None
-    canny: Optional[PipelineImageInput] = None
-    normal: Optional[PipelineImageInput] = None
-    segment: Optional[PipelineImageInput] = None
-
-    def __len__(self) -> int:
-        return len(vars(self))
-
-    def __iter__(self):
-        return iter(vars(self))
-
-    def __getitem__(self, key):
-        return getattr(self, key)
-
-    def __setitem__(self, key, value):
-        setattr(self, key, value)
-
-
-@dataclass
-class ControlNetUnionInputProMax:
-    """
-    The image input of [`ControlNetUnionModel`]:
-
-    - 0: openpose
-    - 1: depth
-    - 2: hed/pidi/scribble/ted
-    - 3: canny/lineart/anime_lineart/mlsd
-    - 4: normal
-    - 5: segment
-    - 6: tile
-    - 7: repaint
-    """
-
-    openpose: Optional[PipelineImageInput] = None
-    depth: Optional[PipelineImageInput] = None
-    hed: Optional[PipelineImageInput] = None
-    canny: Optional[PipelineImageInput] = None
-    normal: Optional[PipelineImageInput] = None
-    segment: Optional[PipelineImageInput] = None
-    tile: Optional[PipelineImageInput] = None
-    repaint: Optional[PipelineImageInput] = None
-
-    def __len__(self) -> int:
-        return len(vars(self))
-
-    def __iter__(self):
-        return iter(vars(self))
-
-    def __getitem__(self, key):
-        return getattr(self, key)
-
-    def __setitem__(self, key, value):
-        setattr(self, key, value)
-
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -680,8 +608,9 @@ def forward(
         sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
-        controlnet_cond: Union[ControlNetUnionInput, ControlNetUnionInputProMax],
+        controlnet_cond: List[torch.Tensor],
         control_type: torch.Tensor,
+        control_type_idx: List[int],
         conditioning_scale: float = 1.0,
         class_labels: Optional[torch.Tensor] = None,
         timestep_cond: Optional[torch.Tensor] = None,
@@ -701,11 +630,13 @@ def forward(
                 The number of timesteps to denoise an input.
             encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states.
-            controlnet_cond (`Union[ControlNetUnionInput, ControlNetUnionInputProMax]`):
+            controlnet_cond (`List[torch.Tensor]`):
                 The conditional input tensors.
             control_type (`torch.Tensor`):
                 A tensor of shape `(batch, num_control_type)` with values `0` or `1` depending on whether the control
                 type is used.
+            control_type_idx (`List[int]`):
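+                The control type index of each tensor in `controlnet_cond`, in the same order. Each index selects
+                the task embedding that is added to the corresponding condition's features.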
             conditioning_scale (`float`, defaults to `1.0`):
                 The scale factor for ControlNet outputs.
             class_labels (`torch.Tensor`, *optional*, defaults to `None`):
@@ -733,20 +664,6 @@ def forward(
                 If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
                 returned where the first element is the sample tensor.
         """
-        if not isinstance(controlnet_cond, (ControlNetUnionInput, ControlNetUnionInputProMax)):
-            raise ValueError(
-                "Expected type of `controlnet_cond` to be one of `ControlNetUnionInput` or `ControlNetUnionInputProMax`"
-            )
-        if len(controlnet_cond) != self.config.num_control_type:
-            if isinstance(controlnet_cond, ControlNetUnionInput):
-                raise ValueError(
-                    f"Expected num_control_type {self.config.num_control_type}, got {len(controlnet_cond)}. Try `ControlNetUnionInputProMax`."
-                )
-            elif isinstance(controlnet_cond, ControlNetUnionInputProMax):
-                raise ValueError(
-                    f"Expected num_control_type {self.config.num_control_type}, got {len(controlnet_cond)}. Try `ControlNetUnionInput`."
-                )
-
         # check channel order
         channel_order = self.config.controlnet_conditioning_channel_order
 
@@ -830,12 +747,10 @@ def forward(
         inputs = []
         condition_list = []
 
-        for idx, image_type in enumerate(controlnet_cond):
-            if controlnet_cond[image_type] is None:
-                continue
-            condition = self.controlnet_cond_embedding(controlnet_cond[image_type])
+        for cond, control_idx in zip(controlnet_cond, control_type_idx):
+            condition = self.controlnet_cond_embedding(cond)
             feat_seq = torch.mean(condition, dim=(2, 3))
-            feat_seq = feat_seq + self.task_embedding[idx]
+            feat_seq = feat_seq + self.task_embedding[control_idx]
             inputs.append(feat_seq.unsqueeze(1))
             condition_list.append(condition)
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`SparseControlNetModel,`
`16`	`16`	`SparseControlNetOutput,`
`17`	`17`	`)`
`18`		`- from .controlnet_union import ControlNetUnionInput, ControlNetUnionInputProMax, ControlNetUnionModel`
	`18`	`+ from .controlnet_union import ControlNetUnionModel`
`19`	`19`	`from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel`
`20`	`20`	`from .multicontrolnet import MultiControlNetModel`
`21`	`21`