Commit cc6833c

update

1 parent 7204481 commit cc6833c

10 files changed: +122 additions, -40 deletions

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 6 additions & 4 deletions

@@ -197,7 +197,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )

@@ -386,9 +388,9 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(
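
Why the factor of 2: the VAE compresses each spatial dimension by vae_scale_factor (8 for the Flux VAE), and the transformer then folds every 2x2 latent patch into a single token, so the latent grid must have even height and width. A minimal sketch of that packing step, assuming a Flux-style latent layout (the helper below is illustrative, not the pipeline's private method):

    import torch

    vae_scale_factor = 8  # 2 ** (len(block_out_channels) - 1) for the Flux VAE

    def pack_latents(latents: torch.Tensor) -> torch.Tensor:
        # Fold each 2x2 latent patch into the channel dim, then flatten to tokens.
        b, c, h, w = latents.shape
        assert h % 2 == 0 and w % 2 == 0, "packing needs an even latent grid"
        latents = latents.view(b, c, h // 2, 2, w // 2, 2)
        return latents.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // 2) * (w // 2), c * 4)

    # 512x512 pixels -> 64x64 latents -> 32x32 patches -> 1024 tokens of 64 channels
    latents = torch.randn(1, 16, 512 // vae_scale_factor, 512 // vae_scale_factor)
    print(pack_latents(latents).shape)  # torch.Size([1, 1024, 64])

Hence the pixel dimensions must be divisible by vae_scale_factor * 2 = 16, which is exactly what the new VaeImageProcessor argument enforces.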

src/diffusers/pipelines/flux/pipeline_flux_controlnet.py

Lines changed: 10 additions & 6 deletions

@@ -218,7 +218,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )

@@ -410,9 +412,9 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(

@@ -500,8 +502,10 @@ def prepare_latents(
         generator,
         latents=None,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)

         shape = (batch_size, num_channels_latents, height, width)
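
The rounding added to prepare_latents is the latent-space counterpart of the processor change: divide by the VAE compression, then drop a row or column if the result is odd. A short worked check (values illustrative):

    vae_scale_factor = 8

    def latent_dim(pixels: int) -> int:
        # Integer-divide by the VAE compression, then round down to an even
        # number so the 2x2 packing divides the grid cleanly.
        d = int(pixels) // vae_scale_factor
        return d - (d % 2)

    for px in (64, 72, 56):
        d = latent_dim(px)
        print(px, "px ->", d, "latents ->", d * vae_scale_factor, "px decoded")
    # 64 px -> 8 latents -> 64 px decoded
    # 72 px -> 8 latents -> 64 px decoded  (72 is not divisible by 16)
    # 56 px -> 6 latents -> 48 px decoded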

src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py

Lines changed: 10 additions & 6 deletions

@@ -230,7 +230,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )

@@ -453,9 +455,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(

@@ -551,8 +553,10 @@ def prepare_latents(
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)

         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
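
One subtlety in the divisibility check above: % and * have equal precedence in Python and associate left to right, so the parentheses around vae_scale_factor * 2 are load-bearing. A quick demonstration:

    vae_scale_factor = 8
    height = 24  # divisible by 8 but not by 16

    print(height % vae_scale_factor * 2)    # (24 % 8) * 2 == 0 -> check would wrongly pass
    print(height % (vae_scale_factor * 2))  # 24 % 16 == 8 -> warning correctly fires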

src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py

Lines changed: 15 additions & 9 deletions

@@ -233,9 +233,11 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor,
+            vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,

@@ -467,9 +469,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(

@@ -578,8 +580,10 @@ def prepare_latents(
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)

         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

@@ -624,8 +628,10 @@ def prepare_mask_latents(
         device,
         generator,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
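
Bumping the mask_processor to the same vae_scale_factor * 2 keeps the binarized mask and the image latents on the same snapped grid. A hedged sketch of that alignment (the 72x56 input and scale factor are illustrative):

    import torch
    from diffusers.image_processor import VaeImageProcessor

    vae_scale_factor = 8  # read from the VAE config in the real pipeline

    image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor * 2)
    mask_processor = VaeImageProcessor(
        vae_scale_factor=vae_scale_factor * 2, do_normalize=False, do_binarize=True
    )

    # Both processors snap 72x56 down to 64x48, so image and mask stay aligned
    # before being resized to the (even) latent grid.
    image = torch.rand(1, 3, 72, 56)
    mask = torch.rand(1, 1, 72, 56)
    print(image_processor.preprocess(image).shape)  # torch.Size([1, 3, 64, 48])
    print(mask_processor.preprocess(mask).shape)    # torch.Size([1, 1, 64, 48])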

src/diffusers/pipelines/flux/pipeline_flux_img2img.py

Lines changed: 10 additions & 6 deletions

@@ -214,7 +214,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )

@@ -436,9 +438,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(

@@ -533,8 +535,10 @@ def prepare_latents(
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)

         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
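
The unchanged context line at the end of the hunk shows what the even-grid guarantee feeds into: _prepare_latent_image_ids allocates one positional id per packed 2x2 patch, i.e. a (height // 2) x (width // 2) grid. A simplified stand-in for that helper (shapes mirror the Flux pipelines; the real method is private API):

    import torch

    def prepare_latent_image_ids(height: int, width: int) -> torch.Tensor:
        # One (modality, row, col) id triple per packed patch, flattened to a sequence.
        ids = torch.zeros(height, width, 3)
        ids[..., 1] = torch.arange(height)[:, None]
        ids[..., 2] = torch.arange(width)[None, :]
        return ids.reshape(height * width, 3)

    # 64x48 pixels -> 8x6 latents -> 4x3 patches -> 12 position ids
    print(prepare_latent_image_ids(8 // 2, 6 // 2).shape)  # torch.Size([12, 3])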

src/diffusers/pipelines/flux/pipeline_flux_inpaint.py

Lines changed: 15 additions & 9 deletions

@@ -211,9 +211,11 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor,
+            vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,

@@ -445,9 +447,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(

@@ -555,8 +557,10 @@ def prepare_latents(
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)

         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

@@ -600,8 +604,10 @@ def prepare_mask_latents(
         device,
         generator,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = int(height) // self.vae_scale_factor - ((int(height) // self.vae_scale_factor) % 2)
+        width = int(width) // self.vae_scale_factor - ((int(width) // self.vae_scale_factor) % 2)
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
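
Across all six pipelines the dimension flow is now consistent: requested pixels -> pixels snapped to a multiple of vae_scale_factor * 2 by the image processor -> an even latent grid from prepare_latents -> packed 2x2 patches. A compact trace confirming the two snapping paths agree (assuming the default scale factor of 8):

    vae_scale_factor, patch = 8, 2

    for px in (512, 72, 56):
        snapped = px - px % (vae_scale_factor * patch)  # image processor behavior
        lat = px // vae_scale_factor
        lat -= lat % patch                              # prepare_latents behavior
        assert snapped == lat * vae_scale_factor        # both paths land on the same size
        print(px, "->", snapped, "px,", lat, "latents,", lat // patch, "patches")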

tests/pipelines/controlnet_flux/test_controlnet_flux.py

Lines changed: 14 additions & 0 deletions

@@ -181,6 +181,20 @@ def test_controlnet_flux(self):
     def test_xformers_attention_forwardGenerator_pass(self):
         pass

+    def test_flux_image_output_shape(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        height_width_pairs = [(32, 32), (72, 56)]
+        for height, width in height_width_pairs:
+            expected_height = height - height % (pipe.vae_scale_factor * 2)
+            expected_width = width - width % (pipe.vae_scale_factor * 2)
+
+            inputs.update({"height": height, "width": width})
+            image = pipe(**inputs).images[0]
+            output_height, output_width, _ = image.shape
+            assert (output_height, output_width) == (expected_height, expected_width)
+

 @slow
 @require_big_gpu_with_torch_cuda
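
The new test pins down the user-visible behavior: off-grid sizes no longer raise, they warn and snap. A hedged end-to-end sketch (model id, dtype, and step count are illustrative):

    import torch
    from diffusers import FluxPipeline

    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
    ).to("cuda")

    # 72x56 is not divisible by vae_scale_factor * 2 (= 16); instead of a
    # ValueError, the pipeline now logs a warning and returns a 64x48 image.
    image = pipe("a tiny cat", height=72, width=56, num_inference_steps=4).images[0]
    print(image.size)  # (48, 64) - PIL reports (width, height)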

tests/pipelines/flux/test_pipeline_flux.py

Lines changed: 14 additions & 0 deletions

@@ -191,6 +191,20 @@ def test_fused_qkv_projections(self):
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."

+    def test_flux_image_output_shape(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        height_width_pairs = [(32, 32), (72, 56)]
+        for height, width in height_width_pairs:
+            expected_height = height - height % (pipe.vae_scale_factor * 2)
+            expected_width = width - width % (pipe.vae_scale_factor * 2)
+
+            inputs.update({"height": height, "width": width})
+            image = pipe(**inputs).images[0]
+            output_height, output_width, _ = image.shape
+            assert (output_height, output_width) == (expected_height, expected_width)
+

 @slow
 @require_big_gpu_with_torch_cuda
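
For readers outside the diffusers test harness: get_dummy_inputs builds a minimal kwargs dict that the shape test then mutates per (height, width) pair. A rough sketch of the pattern (keys and values are illustrative, not copied from the suite):

    import torch

    def get_dummy_inputs(self, device, seed=0):
        # Tiny, fast settings for CPU CI; the shape test overrides height/width
        # on each loop iteration via inputs.update(...).
        return {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": torch.Generator(device="cpu").manual_seed(seed),
            "num_inference_steps": 2,
            "guidance_scale": 5.0,
            "height": 8,
            "width": 8,
            "max_sequence_length": 48,
            "output_type": "np",
        }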

tests/pipelines/flux/test_pipeline_flux_img2img.py

Lines changed: 14 additions & 0 deletions

@@ -147,3 +147,17 @@ def test_flux_prompt_embeds(self):

         max_diff = np.abs(output_with_prompt - output_with_embeds).max()
         assert max_diff < 1e-4
+
+    def test_flux_image_output_shape(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        height_width_pairs = [(32, 32), (72, 56)]
+        for height, width in height_width_pairs:
+            expected_height = height - height % (pipe.vae_scale_factor * 2)
+            expected_width = width - width % (pipe.vae_scale_factor * 2)
+
+            inputs.update({"height": height, "width": width})
+            image = pipe(**inputs).images[0]
+            output_height, output_width, _ = image.shape
+            assert (output_height, output_width) == (expected_height, expected_width)

tests/pipelines/flux/test_pipeline_flux_inpaint.py

Lines changed: 14 additions & 0 deletions

@@ -149,3 +149,17 @@ def test_flux_inpaint_prompt_embeds(self):

         max_diff = np.abs(output_with_prompt - output_with_embeds).max()
         assert max_diff < 1e-4
+
+    def test_flux_image_output_shape(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        height_width_pairs = [(32, 32), (72, 56)]
+        for height, width in height_width_pairs:
+            expected_height = height - height % (pipe.vae_scale_factor * 2)
+            expected_width = width - width % (pipe.vae_scale_factor * 2)
+
+            inputs.update({"height": height, "width": width})
+            image = pipe(**inputs).images[0]
+            output_height, output_width, _ = image.shape
+            assert (output_height, output_width) == (expected_height, expected_width)
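
All four suites add the same test_flux_image_output_shape check, so a local run of "pytest tests/pipelines/flux -k test_flux_image_output_shape" (from a diffusers checkout; path assumed) exercises the snapping logic end to end on the dummy pipelines.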
