|
19 | 19 | import torch |
20 | 20 |
|
21 | 21 | from ...configuration_utils import FrozenDict |
| 22 | +from ...guiders import ClassifierFreeGuidance |
22 | 23 | from ...image_processor import VaeImageProcessor |
23 | | -from ...models import AutoencoderKL, ControlNetModel, ControlNetUnionModel |
| 24 | +from ...models import AutoencoderKL, ControlNetModel, ControlNetUnionModel, UNet2DConditionModel |
24 | 25 | from ...pipelines.controlnet.multicontrolnet import MultiControlNetModel |
25 | 26 | from ...schedulers import EulerDiscreteScheduler |
26 | 27 | from ...utils import logging |
@@ -266,37 +267,37 @@ def intermediate_outputs(self) -> List[str]: |
266 | 267 | OutputParam( |
267 | 268 | "prompt_embeds", |
268 | 269 | type_hint=torch.Tensor, |
269 | | - kwargs_type="guider_input_fields", |
| 270 | + kwargs_type="guider_input_fields", # already in the intermediates state, but declared here again for guider_input_fields
270 | 271 | description="text embeddings used to guide the image generation", |
271 | 272 | ), |
272 | 273 | OutputParam( |
273 | 274 | "negative_prompt_embeds", |
274 | 275 | type_hint=torch.Tensor, |
275 | | - kwargs_type="guider_input_fields", |
| 276 | + kwargs_type="guider_input_fields", # already in the intermediates state, but declared here again for guider_input_fields
276 | 277 | description="negative text embeddings used to guide the image generation", |
277 | 278 | ), |
278 | 279 | OutputParam( |
279 | 280 | "pooled_prompt_embeds", |
280 | 281 | type_hint=torch.Tensor, |
281 | | - kwargs_type="guider_input_fields", |
| 282 | + kwargs_type="guider_input_fields", # already in the intermediates state, but declared here again for guider_input_fields
282 | 283 | description="pooled text embeddings used to guide the image generation", |
283 | 284 | ), |
284 | 285 | OutputParam( |
285 | 286 | "negative_pooled_prompt_embeds", |
286 | 287 | type_hint=torch.Tensor, |
287 | | - kwargs_type="guider_input_fields", |
| 288 | + kwargs_type="guider_input_fields", # already in the intermediates state, but declared here again for guider_input_fields
288 | 289 | description="negative pooled text embeddings used to guide the image generation", |
289 | 290 | ), |
290 | 291 | OutputParam( |
291 | 292 | "ip_adapter_embeds", |
292 | 293 | type_hint=List[torch.Tensor], |
293 | | - kwargs_type="guider_input_fields", |
| 294 | + kwargs_type="guider_input_fields", # already in the intermediates state, but declared here again for guider_input_fields
294 | 295 | description="image embeddings for IP-Adapter", |
295 | 296 | ), |
296 | 297 | OutputParam( |
297 | 298 | "negative_ip_adapter_embeds", |
298 | 299 | type_hint=List[torch.Tensor], |
299 | | - kwargs_type="guider_input_fields", |
| 300 | + kwargs_type="guider_input_fields", # already in the intermediates state, but declared here again for guider_input_fields
300 | 301 | description="negative image embeddings for IP-Adapter", |
301 | 302 | ), |
302 | 303 | ] |
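Aside: `kwargs_type="guider_input_fields"` tags these outputs so they can be collected and paired for the guider. A minimal illustrative sketch of that conditional/unconditional pairing follows; the `collect_guider_inputs` helper is hypothetical, not the library's actual collection code:

```python
from typing import Dict, Tuple

import torch


def collect_guider_inputs(
    state: Dict[str, torch.Tensor],
) -> Dict[str, Tuple[torch.Tensor, torch.Tensor]]:
    # Pair each conditional tensor with its "negative_" counterpart so a
    # guider (e.g. classifier-free guidance) can run cond/uncond passes.
    pairs = {}
    for name, value in state.items():
        if name.startswith("negative_"):
            continue
        negative = state.get(f"negative_{name}")
        if negative is not None:
            pairs[name] = (value, negative)
    return pairs


fields = collect_guider_inputs(
    {
        "prompt_embeds": torch.randn(1, 77, 2048),
        "negative_prompt_embeds": torch.randn(1, 77, 2048),
    }
)
# fields["prompt_embeds"] is the (cond, uncond) pair a CFG guider consumes
```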
@@ -683,12 +684,6 @@ def intermediate_outputs(self) -> List[str]: |
683 | 684 | OutputParam( |
684 | 685 | "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process" |
685 | 686 | ), |
686 | | - OutputParam("mask", type_hint=torch.Tensor, description="The mask to use for inpainting generation"), |
687 | | - OutputParam( |
688 | | - "masked_image_latents", |
689 | | - type_hint=torch.Tensor, |
690 | | - description="The masked image latents to use for the inpainting generation (only for inpainting-specific unet)", |
691 | | - ), |
692 | 687 | OutputParam( |
693 | 688 | "noise", |
694 | 689 | type_hint=torch.Tensor, |
@@ -993,6 +988,7 @@ class StableDiffusionXLPrepareLatentsStep(PipelineBlock): |
993 | 988 | def expected_components(self) -> List[ComponentSpec]: |
994 | 989 | return [ |
995 | 990 | ComponentSpec("scheduler", EulerDiscreteScheduler), |
| 991 | + ComponentSpec("vae", AutoencoderKL), |
996 | 992 | ] |
997 | 993 |
|
998 | 994 | @property |
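Declaring `vae` as an expected component fits this step if it encodes an input image into latents. A minimal sketch of that standard encode path, assuming the usual SDXL VAE checkpoint layout (model id and tensor shapes are illustrative):

```python
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="vae"
)
# Placeholder image tensor, normalized to [-1, 1]
image = torch.randn(1, 3, 1024, 1024)

with torch.no_grad():
    # Encode to the latent space and apply the VAE scaling factor, the
    # usual way image latents are prepared for img2img/inpainting.
    latents = vae.encode(image).latent_dist.sample()
    latents = latents * vae.config.scaling_factor
```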
@@ -1105,6 +1101,18 @@ def expected_configs(self) -> List[ConfigSpec]: |
1105 | 1101 | ConfigSpec("requires_aesthetics_score", False), |
1106 | 1102 | ] |
1107 | 1103 |
|
| 1104 | + @property |
| 1105 | + def expected_components(self) -> List[ComponentSpec]: |
| 1106 | + return [ |
| 1107 | + ComponentSpec("unet", UNet2DConditionModel), |
| 1108 | + ComponentSpec( |
| 1109 | + "guider", |
| 1110 | + ClassifierFreeGuidance, |
| 1111 | + config=FrozenDict({"guidance_scale": 7.5}), |
| 1112 | + default_creation_method="from_config", |
| 1113 | + ), |
| 1114 | + ] |
| 1115 | + |
1108 | 1116 | @property |
1109 | 1117 | def description(self) -> str: |
1110 | 1118 | return "Step that prepares the additional conditioning for the image-to-image/inpainting generation process" |
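As a sanity check on the new spec: with `default_creation_method="from_config"`, the guider is built from its config rather than loaded from pretrained weights. A minimal sketch of the equivalent direct construction, assuming `ClassifierFreeGuidance` accepts `guidance_scale` in its constructor as the `FrozenDict` config suggests:

```python
from diffusers.guiders import ClassifierFreeGuidance

# Mirrors ComponentSpec(..., config=FrozenDict({"guidance_scale": 7.5}),
# default_creation_method="from_config"): no weights are loaded.
guider = ClassifierFreeGuidance(guidance_scale=7.5)
```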
@@ -1315,6 +1323,18 @@ class StableDiffusionXLPrepareAdditionalConditioningStep(PipelineBlock): |
1315 | 1323 | def description(self) -> str: |
1316 | 1324 | return "Step that prepares the additional conditioning for the text-to-image generation process" |
1317 | 1325 |
|
| 1326 | + @property |
| 1327 | + def expected_components(self) -> List[ComponentSpec]: |
| 1328 | + return [ |
| 1329 | + ComponentSpec("unet", UNet2DConditionModel), |
| 1330 | + ComponentSpec( |
| 1331 | + "guider", |
| 1332 | + ClassifierFreeGuidance, |
| 1333 | + config=FrozenDict({"guidance_scale": 7.5}), |
| 1334 | + default_creation_method="from_config", |
| 1335 | + ), |
| 1336 | + ] |
| 1337 | + |
1318 | 1338 | @property |
1319 | 1339 | def inputs(self) -> List[Tuple[str, Any]]: |
1320 | 1340 | return [ |
|