Commit 5ee4d3c

update
1 parent 02e6aaf commit 5ee4d3c

6 files changed: +27 −17 lines changed

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 2 additions & 2 deletions
@@ -455,8 +455,8 @@ def _unpack_latents(latents, height, width, vae_scale_factor):
 
         # VAE applies 8x compression on images but we must also account for packing which requires
         # latent height and width to be divisible by 2.
-        height = int(height) // vae_scale_factor - ((int(height) // vae_scale_factor) % 2)
-        width = int(width) // vae_scale_factor - ((int(width) // vae_scale_factor) % 2)
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
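Note that in pipeline_flux.py the replaced expression already rounded the latent size down to the nearest even number, so this hunk is a consistency rewrite rather than a behavior change: for non-negative integers, q - (q % 2) == 2 * (q // 2), and (h // v) // 2 == h // (v * 2). A quick standalone check (a sketch, not part of the commit):

# Exhaustively confirm the old and new formulations agree for
# non-negative integer heights and typical scale factors.
for h in range(4096):
    for v in (8, 16):
        q = h // v
        old = q - (q % 2)         # previous expression: round down to even
        new = 2 * (h // (v * 2))  # new expression
        assert old == new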

src/diffusers/pipelines/flux/pipeline_flux_controlnet.py

Lines changed: 5 additions & 2 deletions
@@ -480,8 +480,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)

@@ -506,6 +508,7 @@ def prepare_latents(
         # latent height and width to be divisible by 2.
         height = 2 * (int(height) // (self.vae_scale_factor * 2))
         width = 2 * (int(width) // (self.vae_scale_factor * 2))
+
         shape = (batch_size, num_channels_latents, height, width)
 
         if latents is not None:
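The controlnet pipeline (like the remaining pipelines below) previously divided by vae_scale_factor alone, so a requested size that is not a multiple of 2 * vae_scale_factor produced an odd latent dimension that no longer matches the grid _pack_latents built. A minimal standalone reproduction of the failure, with hypothetical sizes and the unpack steps inlined rather than imported:

import torch

vae_scale_factor = 16
batch_size, channels = 1, 64        # packed latents carry 4x the latent channels
req_height, req_width = 1008, 1024  # 1008 is not a multiple of 32

# prepare_latents allocates an even latent grid: 62 x 64, i.e. 31 x 32 patches of 2x2
lat_h = 2 * (req_height // (vae_scale_factor * 2))  # 62
lat_w = 2 * (req_width // (vae_scale_factor * 2))   # 64
packed = torch.randn(batch_size, (lat_h // 2) * (lat_w // 2), channels)

# Old computation: 1008 // 16 == 63, an odd size that disagrees with the packed grid
bad_h = req_height // vae_scale_factor
x = packed.view(batch_size, bad_h // 2, lat_w // 2, channels // 4, 2, 2)
x = x.permute(0, 3, 1, 4, 2, 5)
# raises RuntimeError: the packed tensor only holds a 62-row grid, not 63
x = x.reshape(batch_size, channels // 4, bad_h, lat_w)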

src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py

Lines changed: 4 additions & 2 deletions
@@ -523,8 +523,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)

src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py

Lines changed: 7 additions & 7 deletions
@@ -550,8 +550,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
@@ -629,10 +631,9 @@ def prepare_mask_latents(
     ):
         # VAE applies 8x compression on images but we must also account for packing which requires
         # latent height and width to be divisible by 2.
-        height = 2 * (int(height) // self.vae_scale_factor * 2)
-        width = 2 * (
-            int(width) // self.vae_scale_factor * 2
-        )  # resize the mask to latents shape as we concatenate the mask to the latents
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+        # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
         mask = torch.nn.functional.interpolate(mask, size=(height, width))
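The lines removed here also carried an operator-precedence bug: // and * associate left to right, so 2 * (int(height) // self.vae_scale_factor * 2) evaluates as 2 * ((height // vae_scale_factor) * 2), four times the intended latent size, and the mask would be interpolated to the wrong resolution. A quick illustration:

# Python evaluates // and * left to right, so the old expression quadrupled
# the latent height instead of rounding it down to an even number:
height, vae_scale_factor = 1024, 16
old = 2 * (int(height) // vae_scale_factor * 2)    # 2 * ((1024 // 16) * 2) == 256
new = 2 * (int(height) // (vae_scale_factor * 2))  # 2 * (1024 // 32)      == 64
print(old, new)  # 256 64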
@@ -669,7 +670,6 @@ def prepare_mask_latents(
 
         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
-
         masked_image_latents = self._pack_latents(
             masked_image_latents,
             batch_size,

src/diffusers/pipelines/flux/pipeline_flux_img2img.py

Lines changed: 5 additions & 2 deletions
@@ -409,6 +409,7 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
         image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
 
         image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
         return image_latents
 
     # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps

@@ -506,8 +507,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)

src/diffusers/pipelines/flux/pipeline_flux_inpaint.py

Lines changed: 4 additions & 2 deletions
@@ -528,8 +528,10 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
 
-        height = height // vae_scale_factor
-        width = width // vae_scale_factor
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
 
         latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
         latents = latents.permute(0, 3, 1, 4, 2, 5)
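A round-trip property ties the six changes together: for any requested size, unpacking should invert _pack_latents on the latent grid that prepare_latents allocates, whether or not the requested size is a multiple of 2 * vae_scale_factor. A standalone sketch of that invariant, with the pack/unpack math inlined rather than imported from the pipelines:

import torch

def pack(latents):
    # mirrors _pack_latents: 2x2 latent patches -> a sequence of tokens
    b, c, h, w = latents.shape
    x = latents.view(b, c, h // 2, 2, w // 2, 2)
    x = x.permute(0, 2, 4, 1, 3, 5)
    return x.reshape(b, (h // 2) * (w // 2), c * 4)

def unpack(latents, height, width, vae_scale_factor):
    # mirrors the fixed _unpack_latents: tokens -> 2x2 patches -> latent grid
    b, n, c = latents.shape
    h = 2 * (int(height) // (vae_scale_factor * 2))
    w = 2 * (int(width) // (vae_scale_factor * 2))
    x = latents.view(b, h // 2, w // 2, c // 4, 2, 2)
    x = x.permute(0, 3, 1, 4, 2, 5)
    return x.reshape(b, c // 4, h, w)

vsf = 16
for req in (1024, 1008, 1000):  # multiples and non-multiples of 32
    h = 2 * (req // (vsf * 2))  # the grid prepare_latents allocates
    grid = torch.randn(1, 16, h, h)
    assert torch.equal(unpack(pack(grid), req, req, vsf), grid)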
