
Commit 31593e2

Commit message: update
Parent: b81bd78

1 file changed (+9 −5 lines)


src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_modular.py

Lines changed: 9 additions & 5 deletions
@@ -184,6 +184,7 @@ def expected_components(self) -> List[ComponentSpec]:
             ComponentSpec("image_encoder", CLIPVisionModelWithProjection),
             ComponentSpec("feature_extractor", CLIPImageProcessor),
             ComponentSpec("unet", UNet2DConditionModel),
+            ComponentSpec("guider", GuiderType),
         ]
 
     @property
@@ -276,7 +277,7 @@ def prepare_ip_adapter_image_embeds(
     def __call__(self, pipeline, state: PipelineState) -> PipelineState:
         data = self.get_block_state(state)
 
-        data.do_classifier_free_guidance = data.guidance_scale > 1.0
+        data.do_classifier_free_guidance = pipeline.guider.num_conditions > 1
         data.device = pipeline._execution_device
 
         data.ip_adapter_embeds = self.prepare_ip_adapter_image_embeds(
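The behavioral core of the commit is the replaced line above: blocks no longer derive the CFG flag from a `guidance_scale` input, they ask the guider component how many conditions it will run. Below is a minimal sketch of that contract with a stand-in guider; the internals are assumptions inferred from this diff, not the actual diffusers implementation.

# Stand-in guider illustrating the num_conditions contract this hunk relies
# on; NOT the actual diffusers ClassifierFreeGuidance implementation.
class ClassifierFreeGuidance:
    def __init__(self, guidance_scale: float = 5.0):
        self.guidance_scale = guidance_scale

    @property
    def num_conditions(self) -> int:
        # Two conditions (conditional + unconditional embeddings) when CFG is
        # active, one when it is effectively disabled.
        return 2 if self.guidance_scale > 1.0 else 1

guider = ClassifierFreeGuidance(guidance_scale=7.5)
do_classifier_free_guidance = guider.num_conditions > 1  # True

guider = ClassifierFreeGuidance(guidance_scale=1.0)
do_classifier_free_guidance = guider.num_conditions > 1  # False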
@@ -315,7 +316,7 @@ def expected_components(self) -> List[ComponentSpec]:
             ComponentSpec("text_encoder_2", CLIPTextModelWithProjection),
             ComponentSpec("tokenizer", CLIPTokenizer),
             ComponentSpec("tokenizer_2", CLIPTokenizer),
-            ComponentSpec("guider", GuiderType, obj=ClassifierFreeGuidance()),
+            ComponentSpec("guider", GuiderType),
         ]
 
     @property
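With the default `obj=ClassifierFreeGuidance()` removed, the guider becomes a declared slot rather than a baked-in CFG instance, so any GuiderType implementation can be injected when the pipeline is assembled. A self-contained sketch of that spec shape; the `ComponentSpec` dataclass below is a stand-in inferred from its usage in this file, not the real class.

from dataclasses import dataclass
from typing import Any, Optional

# Stand-in inferred from usage: a named, typed slot for a pipeline component,
# optionally pre-filled with a default object.
@dataclass
class ComponentSpec:
    name: str
    type_hint: Any
    obj: Optional[Any] = None

class ClassifierFreeGuidance:  # stand-in guider implementation
    pass

GuiderType = ClassifierFreeGuidance  # stand-in for the GuiderType alias

# Before: the text-encoder block shipped a concrete default guider.
spec_before = ComponentSpec("guider", GuiderType, obj=ClassifierFreeGuidance())

# After: the block only declares the slot; whichever guider the user registers
# fills it at assembly time.
spec_after = ComponentSpec("guider", GuiderType)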
@@ -3490,6 +3491,11 @@ def description(self):
             "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n" + \
             "- for text-to-image generation, all you need to provide is `prompt`"
 
+# TODO(yiyi, aryan): We need another step before the text encoder that sets the `num_inference_steps` attribute on the
+# guider, so that decisions such as when to apply guidance and how many conditions to prepare can be made there.
+# Currently the Guiders always assume guidance will be applied, so negative embeddings are prepared regardless of the
+# guider's configuration.
+
 # block mapping
 TEXT2IMAGE_BLOCKS = OrderedDict([
     ("text_encoder", StableDiffusionXLTextEncoderStep),
@@ -3611,7 +3617,6 @@ def num_channels_latents(self):
     "negative_prompt": InputParam("negative_prompt", type_hint=Union[str, List[str]], description="The prompt or prompts not to guide the image generation"),
     "negative_prompt_2": InputParam("negative_prompt_2", type_hint=Union[str, List[str]], description="The negative prompt or prompts for text_encoder_2"),
     "cross_attention_kwargs": InputParam("cross_attention_kwargs", type_hint=Optional[dict], description="Kwargs dictionary passed to the AttentionProcessor"),
-    "guidance_scale": InputParam("guidance_scale", type_hint=float, default=5.0, description="Classifier-Free Diffusion Guidance scale"),
     "clip_skip": InputParam("clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder"),
     "image": InputParam("image", type_hint=PipelineImageInput, required=True, description="The image(s) to modify for img2img or inpainting"),
     "mask_image": InputParam("mask_image", type_hint=PipelineImageInput, required=True, description="Mask image for inpainting, white pixels will be repainted"),
@@ -3636,7 +3641,6 @@ def num_channels_latents(self):
     "negative_crops_coords_top_left": InputParam("negative_crops_coords_top_left", type_hint=Tuple[int, int], default=(0, 0), description="Negative conditioning crop coordinates"),
     "aesthetic_score": InputParam("aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"),
     "negative_aesthetic_score": InputParam("negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"),
-    "guidance_rescale": InputParam("guidance_rescale", type_hint=float, default=0.0, description="Guidance rescale factor to fix overexposure"),
     "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
     "guider_kwargs": InputParam("guider_kwargs", type_hint=Optional[Dict[str, Any]], description="Kwargs dictionary passed to the Guider"),
     "output_type": InputParam("output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"),
@@ -3704,4 +3708,4 @@ def num_channels_latents(self):
 
 SDXL_OUTPUTS_SCHEMA = {
     "images": OutputParam("images", type_hint=Union[Tuple[Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]]], StableDiffusionXLPipelineOutput], description="The final generated images")
-}
+}
