
Commit 4098e47

Merge branch 'main' into mochi-1-lor
2 parents 54803d3 + f6f7afa commit 4098e47

36 files changed: +376 −112 lines

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 14 additions & 8 deletions
@@ -197,7 +197,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be
+        # divisible by the patch size. So the vae scale factor is multiplied by the patch size to account for this.
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
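To make the doubled scale factor concrete, here is a minimal sketch of the divisibility rule it encodes; the numbers and the `snap` helper are illustrative assumptions, not pipeline code:

# With an 8x-compressing VAE and a 2x2 patch size, image dimensions
# must be divisible by 8 * 2 = 16 for the packed latent grid to be whole.
vae_scale_factor = 8
patch_size = 2
multiple = vae_scale_factor * patch_size  # 16

def snap(dim: int) -> int:
    # Hypothetical helper: floor a requested dimension to the nearest
    # valid multiple, mirroring the resize behavior the warning below describes.
    return (dim // multiple) * multiple

print(snap(1000))  # 992  (divisible by 8, but not by 16)
print(snap(1024))  # 1024 (already valid)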
@@ -386,9 +388,9 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(
@@ -451,8 +453,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
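As context for the reshape above, here is a self-contained sketch of the 2x2 packing round trip; the shapes are assumed for illustration, and the pack step mirrors the pipeline's `_pack_latents`:

import torch

batch_size, channels, height, width = 1, 16, 64, 64  # latent-space sizes (assumed)
latents = torch.randn(batch_size, channels, height, width)

# pack: fold each 2x2 spatial patch into the channel dimension
packed = latents.view(batch_size, channels, height // 2, 2, width // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5)
packed = packed.reshape(batch_size, (height // 2) * (width // 2), channels * 4)
print(packed.shape)  # torch.Size([1, 1024, 64])

# unpack: the inverse, as in _unpack_latents above
b, num_patches, packed_channels = packed.shape
unpacked = packed.view(b, height // 2, width // 2, packed_channels // 4, 2, 2)
unpacked = unpacked.permute(0, 3, 1, 4, 2, 5)
unpacked = unpacked.reshape(b, packed_channels // 4, height, width)
assert torch.equal(unpacked, latents)  # exact round trip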
@@ -501,8 +505,10 @@ def prepare_latents(
         generator,
         latents=None,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))

         shape = (batch_size, num_channels_latents, height, width)
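The flooring arithmetic above is easiest to see with numbers; the values below are assumptions for illustration:

vae_scale_factor = 8

# 1024 is divisible by 16, so nothing is lost: latent side = 128.
print(2 * (int(1024) // (vae_scale_factor * 2)))  # 128

# 1000 is not, so the latent side floors to 124 (i.e. the image is
# effectively resized to 992), which is what check_inputs now warns about.
print(2 * (int(1000) // (vae_scale_factor * 2)))  # 124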

src/diffusers/pipelines/flux/pipeline_flux_controlnet.py

Lines changed: 14 additions & 8 deletions
@@ -218,7 +218,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be
+        # divisible by the patch size. So the vae scale factor is multiplied by the patch size to account for this.
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
@@ -410,9 +412,9 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(
@@ -478,8 +480,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
@@ -500,8 +504,10 @@ def prepare_latents(
         generator,
         latents=None,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))

         shape = (batch_size, num_channels_latents, height, width)

src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py

Lines changed: 14 additions & 10 deletions
@@ -230,7 +230,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be
+        # divisible by the patch size. So the vae scale factor is multiplied by the patch size to account for this.
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
@@ -453,9 +455,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(
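One detail worth noting about this check: the parentheses around `self.vae_scale_factor * 2` are load-bearing. In Python, `%` and `*` share precedence and associate left to right, so without them the expression computes `(height % vae_scale_factor) * 2` instead. A quick sketch with assumed values:

vae_scale_factor = 8
height = 1000

# Without parentheses: (1000 % 8) * 2 == 0, so the bad size slips past the check.
print(height % vae_scale_factor * 2 != 0)    # False
# With parentheses: 1000 % 16 == 8, so the warning fires as intended.
print(height % (vae_scale_factor * 2) != 0)  # True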
@@ -521,8 +523,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
@@ -551,9 +555,10 @@ def prepare_latents(
             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
         )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

@@ -873,7 +878,6 @@ def __call__(
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)

         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-
         latents, latent_image_ids = self.prepare_latents(
             init_image,
             latent_timestep,

src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py

Lines changed: 19 additions & 13 deletions
@@ -233,9 +233,11 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be
+        # divisible by the patch size. So the vae scale factor is multiplied by the patch size to account for this.
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor,
+            vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,
@@ -467,9 +469,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(
@@ -548,8 +550,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
@@ -578,9 +582,10 @@ def prepare_latents(
             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
         )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

@@ -624,8 +629,10 @@ def prepare_mask_latents(
         device,
         generator,
     ):
-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
@@ -663,7 +670,6 @@ def prepare_mask_latents(

         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
-
         masked_image_latents = self._pack_latents(
             masked_image_latents,
             batch_size,
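To see what `prepare_mask_latents` is doing end to end, here is a hedged sketch of the mask path with assumed sizes; it is not the pipeline code, but it follows the same resize-then-pack pattern shown in the hunks above:

import torch
import torch.nn.functional as F

vae_scale_factor = 8
height, width = 1024, 1024  # assumed image size

# same flooring as above: the mask is brought down to the latent grid
latent_height = 2 * (int(height) // (vae_scale_factor * 2))  # 128
latent_width = 2 * (int(width) // (vae_scale_factor * 2))    # 128

mask = torch.ones(1, 1, height, width)
mask = F.interpolate(mask, size=(latent_height, latent_width))  # (1, 1, 128, 128)

# packing mirrors _pack_latents: (B, C, H, W) -> (B, H/2 * W/2, C * 4)
packed = mask.view(1, 1, latent_height // 2, 2, latent_width // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5)
packed = packed.reshape(1, (latent_height // 2) * (latent_width // 2), 4)
print(packed.shape)  # torch.Size([1, 4096, 4])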

src/diffusers/pipelines/flux/pipeline_flux_img2img.py

Lines changed: 14 additions & 9 deletions
@@ -214,7 +214,9 @@ def __init__(
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be
+        # divisible by the patch size. So the vae scale factor is multiplied by the patch size to account for this.
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
@@ -437,9 +439,9 @@ def check_inputs(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )

         if callback_on_step_end_tensor_inputs is not None and not all(
@@ -505,8 +507,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape

-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))

         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
@@ -534,9 +538,10 @@ def prepare_latents(
             f" size of {batch_size}. Make sure the batch size matches the length of the generators."
         )

-        height = int(height) // self.vae_scale_factor
-        width = int(width) // self.vae_scale_factor
-
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
         shape = (batch_size, num_channels_latents, height, width)
         latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
