Skip to content

Commit 87246fc

Browse files
ZHJ19970917hlky
authored and committed
update StableDiffusion3Img2ImgPipeline.add image size validation (#10166)
* update StableDiffusion3Img2ImgPipeline.add image size validation --------- Co-authored-by: hlky <[email protected]>
1 parent 72ac0aa commit 87246fc

File tree

3 files changed

+55
-2
lines changed

3 files changed

+55
-2
lines changed

src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,8 @@ def check_inputs(
549549
prompt,
550550
prompt_2,
551551
prompt_3,
552+
height,
553+
width,
552554
strength,
553555
negative_prompt=None,
554556
negative_prompt_2=None,
@@ -560,6 +562,15 @@ def check_inputs(
560562
callback_on_step_end_tensor_inputs=None,
561563
max_sequence_length=None,
562564
):
565+
if (
566+
height % (self.vae_scale_factor * self.patch_size) != 0
567+
or width % (self.vae_scale_factor * self.patch_size) != 0
568+
):
569+
raise ValueError(
570+
f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
571+
f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
572+
)
573+
563574
if strength < 0 or strength > 1:
564575
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
565576

@@ -730,6 +741,8 @@ def __call__(
730741
prompt: Union[str, List[str]] = None,
731742
prompt_2: Optional[Union[str, List[str]]] = None,
732743
prompt_3: Optional[Union[str, List[str]]] = None,
744+
height: Optional[int] = None,
745+
width: Optional[int] = None,
733746
image: PipelineImageInput = None,
734747
strength: float = 0.6,
735748
num_inference_steps: int = 50,
@@ -860,11 +873,15 @@ def __call__(
860873
[`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
861874
`tuple`. When returning a tuple, the first element is a list with the generated images.
862875
"""
876+
height = height or self.default_sample_size * self.vae_scale_factor
877+
width = width or self.default_sample_size * self.vae_scale_factor
863878
# 1. Check inputs. Raise error if not correct
864879
self.check_inputs(
865880
prompt,
866881
prompt_2,
867882
prompt_3,
883+
height,
884+
width,
868885
strength,
869886
negative_prompt=negative_prompt,
870887
negative_prompt_2=negative_prompt_2,
@@ -933,7 +950,7 @@ def __call__(
933950
pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
934951

935952
# 3. Preprocess image
936-
image = self.image_processor.preprocess(image)
953+
image = self.image_processor.preprocess(image, height=height, width=width)
937954

938955
# 4. Prepare timesteps
939956
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)

src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ def __init__(
218218
)
219219
self.tokenizer_max_length = self.tokenizer.model_max_length
220220
self.default_sample_size = self.transformer.config.sample_size
221+
self.patch_size = (
222+
self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
223+
)
221224

222225
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
223226
def _get_t5_prompt_embeds(
@@ -531,6 +534,8 @@ def check_inputs(
531534
prompt,
532535
prompt_2,
533536
prompt_3,
537+
height,
538+
width,
534539
strength,
535540
negative_prompt=None,
536541
negative_prompt_2=None,
@@ -542,6 +547,15 @@ def check_inputs(
542547
callback_on_step_end_tensor_inputs=None,
543548
max_sequence_length=None,
544549
):
550+
if (
551+
height % (self.vae_scale_factor * self.patch_size) != 0
552+
or width % (self.vae_scale_factor * self.patch_size) != 0
553+
):
554+
raise ValueError(
555+
f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
556+
f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
557+
)
558+
545559
if strength < 0 or strength > 1:
546560
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
547561

@@ -710,6 +724,8 @@ def __call__(
710724
prompt: Union[str, List[str]] = None,
711725
prompt_2: Optional[Union[str, List[str]]] = None,
712726
prompt_3: Optional[Union[str, List[str]]] = None,
727+
height: Optional[int] = None,
728+
width: Optional[int] = None,
713729
image: PipelineImageInput = None,
714730
strength: float = 0.6,
715731
num_inference_steps: int = 50,
@@ -824,12 +840,16 @@ def __call__(
824840
[`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
825841
`tuple`. When returning a tuple, the first element is a list with the generated images.
826842
"""
843+
height = height or self.default_sample_size * self.vae_scale_factor
844+
width = width or self.default_sample_size * self.vae_scale_factor
827845

828846
# 1. Check inputs. Raise error if not correct
829847
self.check_inputs(
830848
prompt,
831849
prompt_2,
832850
prompt_3,
851+
height,
852+
width,
833853
strength,
834854
negative_prompt=negative_prompt,
835855
negative_prompt_2=negative_prompt_2,
@@ -890,7 +910,7 @@ def __call__(
890910
pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
891911

892912
# 3. Preprocess image
893-
image = self.image_processor.preprocess(image)
913+
image = self.image_processor.preprocess(image, height=height, width=width)
894914

895915
# 4. Prepare timesteps
896916
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)

src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ def __init__(
224224
)
225225
self.tokenizer_max_length = self.tokenizer.model_max_length
226226
self.default_sample_size = self.transformer.config.sample_size
227+
self.patch_size = (
228+
self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
229+
)
227230

228231
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
229232
def _get_t5_prompt_embeds(
@@ -538,6 +541,8 @@ def check_inputs(
538541
prompt,
539542
prompt_2,
540543
prompt_3,
544+
height,
545+
width,
541546
strength,
542547
negative_prompt=None,
543548
negative_prompt_2=None,
@@ -549,6 +554,15 @@ def check_inputs(
549554
callback_on_step_end_tensor_inputs=None,
550555
max_sequence_length=None,
551556
):
557+
if (
558+
height % (self.vae_scale_factor * self.patch_size) != 0
559+
or width % (self.vae_scale_factor * self.patch_size) != 0
560+
):
561+
raise ValueError(
562+
f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
563+
f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
564+
)
565+
552566
if strength < 0 or strength > 1:
553567
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
554568

@@ -953,6 +967,8 @@ def __call__(
953967
prompt,
954968
prompt_2,
955969
prompt_3,
970+
height,
971+
width,
956972
strength,
957973
negative_prompt=negative_prompt,
958974
negative_prompt_2=negative_prompt_2,

0 commit comments

Comments
 (0)