
Commit 6160749

Merge branch 'main' into sd3-controlnet-inpaint-ipadapter
2 parents: ef1f486 + c3478a4

40 files changed: +314 −32 lines

examples/community/rerender_a_video.py

Lines changed: 3 additions & 0 deletions
@@ -908,6 +908,9 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)

+            if XLA_AVAILABLE:
+                xm.mark_step()
+
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
         else:
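
Note: `XLA_AVAILABLE` and `xm` come from the torch_xla guard that diffusers pipelines typically place at module level; below is a minimal, self-contained sketch of that pattern (the guard itself is an assumption here, only the `xm.mark_step()` call is taken from the diff):

    # Sketch of the torch_xla guard assumed by the added lines; on non-XLA
    # machines the import fails and the helper becomes a no-op.
    try:
        import torch_xla.core.xla_model as xm

        XLA_AVAILABLE = True
    except ImportError:
        XLA_AVAILABLE = False


    def maybe_mark_step() -> None:
        # On TPU/XLA backends this cuts the lazily built graph once per
        # denoising step instead of accumulating one huge graph.
        if XLA_AVAILABLE:
            xm.mark_step()


    for _ in range(3):  # stand-in for the denoising loop
        maybe_mark_step()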

examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py

Lines changed: 1 addition & 1 deletion
@@ -765,7 +765,7 @@ def load_model_hook(models, input_dir):
         lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)

         transformer_state_dict = {
-            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
         }
         transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
         incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
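
The old filter looked for `unet.` keys, which never occur in an SD3 LoRA checkpoint, so the resume hook silently produced an empty state dict. A toy illustration with hypothetical key names:

    # Hypothetical SD3 LoRA keys; real checkpoints use the same "transformer." prefix.
    lora_state_dict = {
        "transformer.transformer_blocks.0.attn.to_q.lora_A.weight": "A",
        "text_encoder.text_model.encoder.layers.0.q_proj.lora_A.weight": "B",
    }

    # Before the fix: nothing starts with "unet.", so the dict is empty.
    before = {k.replace("transformer.", ""): v for k, v in lora_state_dict.items() if k.startswith("unet.")}

    # After the fix: transformer keys are kept and the prefix is stripped.
    after = {k.replace("transformer.", ""): v for k, v in lora_state_dict.items() if k.startswith("transformer.")}

    print(before)  # {}
    print(after)   # {'transformer_blocks.0.attn.to_q.lora_A.weight': 'A'}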

setup.py

Lines changed: 2 additions & 0 deletions
@@ -135,6 +135,7 @@
     "transformers>=4.41.2",
     "urllib3<=2.0.0",
     "black",
+    "phonemizer",
 ]

 # this is a lookup table with items like:
@@ -227,6 +228,7 @@ def run(self):
     "scipy",
     "torchvision",
     "transformers",
+    "phonemizer",
 )
 extras["torch"] = deps_list("torch", "accelerate")

src/diffusers/dependency_versions_table.py

Lines changed: 1 addition & 0 deletions
@@ -43,4 +43,5 @@
     "transformers": "transformers>=4.41.2",
     "urllib3": "urllib3<=2.0.0",
     "black": "black",
+    "phonemizer": "phonemizer",
 }
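
Both files need the new entry because the extras groups in setup.py are resolved through this lookup table; a simplified sketch of that relationship (the helper name and table shape mirror the common deps_list pattern, not the exact diffusers code):

    # Simplified pinned-dependency table and a deps_list-style helper.
    deps = {
        "transformers": "transformers>=4.41.2",
        "urllib3": "urllib3<=2.0.0",
        "black": "black",
        "phonemizer": "phonemizer",
    }


    def deps_list(*pkgs: str) -> list[str]:
        # A package referenced by an extras group but missing from the table
        # would raise KeyError, which is why both lists gain "phonemizer".
        return [deps[pkg] for pkg in pkgs]


    print(deps_list("transformers", "phonemizer"))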

src/diffusers/loaders/single_file_utils.py

Lines changed: 1 addition & 0 deletions
@@ -186,6 +186,7 @@
     "inpainting": 512,
     "inpainting_v2": 512,
     "controlnet": 512,
+    "instruct-pix2pix": 512,
     "v2": 768,
     "v1": 512,
 }

src/diffusers/models/autoencoders/autoencoder_dc.py

Lines changed: 102 additions & 2 deletions
@@ -486,6 +486,9 @@ def __init__(
         self.tile_sample_stride_height = 448
         self.tile_sample_stride_width = 448

+        self.tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        self.tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+
     def enable_tiling(
         self,
         tile_sample_min_height: Optional[int] = None,
@@ -515,6 +518,8 @@ def enable_tiling(
         self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+        self.tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        self.tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio

     def disable_tiling(self) -> None:
         r"""
@@ -606,11 +611,106 @@ def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
             return (decoded,)
         return DecoderOutput(sample=decoded)

+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
+        return b
+
     def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> torch.Tensor:
-        raise NotImplementedError("`tiled_encode` has not been implemented for AutoencoderDC.")
+        batch_size, num_channels, height, width = x.shape
+        latent_height = height // self.spatial_compression_ratio
+        latent_width = width // self.spatial_compression_ratio
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+        tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+        blend_height = tile_latent_min_height - tile_latent_stride_height
+        blend_width = tile_latent_min_width - tile_latent_stride_width
+
+        # Split x into overlapping tiles and encode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, x.shape[2], self.tile_sample_stride_height):
+            row = []
+            for j in range(0, x.shape[3], self.tile_sample_stride_width):
+                tile = x[:, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                if (
+                    tile.shape[2] % self.spatial_compression_ratio != 0
+                    or tile.shape[3] % self.spatial_compression_ratio != 0
+                ):
+                    pad_h = (self.spatial_compression_ratio - tile.shape[2]) % self.spatial_compression_ratio
+                    pad_w = (self.spatial_compression_ratio - tile.shape[3]) % self.spatial_compression_ratio
+                    tile = F.pad(tile, (0, pad_w, 0, pad_h))
+                tile = self.encoder(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_width)
+                result_row.append(tile[:, :, :tile_latent_stride_height, :tile_latent_stride_width])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        encoded = torch.cat(result_rows, dim=2)[:, :, :latent_height, :latent_width]
+
+        if not return_dict:
+            return (encoded,)
+        return EncoderOutput(latent=encoded)

     def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
-        raise NotImplementedError("`tiled_decode` has not been implemented for AutoencoderDC.")
+        batch_size, num_channels, height, width = z.shape
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+        tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+        blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
+        blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, tile_latent_stride_height):
+            row = []
+            for j in range(0, width, tile_latent_stride_width):
+                tile = z[:, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_width)
+                result_row.append(tile[:, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        decoded = torch.cat(result_rows, dim=2)
+
+        if not return_dict:
+            return (decoded,)
+        return DecoderOutput(sample=decoded)

     def forward(self, sample: torch.Tensor, return_dict: bool = True) -> torch.Tensor:
         encoded = self.encode(sample, return_dict=False)[0]
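
The seam-free stitching relies entirely on the linear cross-fade in `blend_v`/`blend_h`; a standalone sketch of the same ramp on toy tensors (plain torch, not the AutoencoderDC API):

    import torch


    def blend_h(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        # Cross-fade the last `blend_extent` columns of `a` into the first columns of `b`.
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
        return b


    left = torch.zeros(1, 1, 2, 6)   # left tile, constant 0
    right = torch.ones(1, 1, 2, 6)   # right tile, constant 1
    out = blend_h(left, right.clone(), blend_extent=4)
    print(out[0, 0, 0])  # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000, 1.0000]) - no hard seam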

src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py

Lines changed: 15 additions & 3 deletions
@@ -237,7 +237,7 @@ def disable_vae_slicing(self):
         """
         self.vae.disable_slicing()

-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
@@ -249,11 +249,23 @@ def enable_model_cpu_offload(self, gpu_id=0):
         else:
             raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

-        device = torch.device(f"cuda:{gpu_id}")
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{gpu_id or torch_device.index}")

         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+            device_mod = getattr(torch, device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

         model_sequence = [
             self.text_encoder.text_model,
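
The new signature has to reconcile the legacy integer `gpu_id` with a full `device` argument; below is that resolution logic in isolation (a sketch, with an explicit fallback to index 0 added for clarity, not the exact pipeline method):

    from typing import Optional, Union

    import torch


    def resolve_offload_device(gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda") -> torch.device:
        torch_device = torch.device(device)
        if gpu_id is not None and torch_device.index is not None:
            raise ValueError("Pass either `gpu_id` or a device with an index, not both.")
        # Legacy `gpu_id` wins when given; otherwise use the index embedded in
        # `device`, defaulting to 0 when neither specifies one.
        return torch.device(f"{torch_device.type}:{gpu_id or torch_device.index or 0}")


    print(resolve_offload_device(device="cuda:1"))  # cuda:1
    print(resolve_offload_device(gpu_id=2))         # cuda:2
    print(resolve_offload_device())                 # cuda:0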

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py

Lines changed: 2 additions & 2 deletions
@@ -404,9 +404,9 @@ def encode_prompt(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py

Lines changed: 2 additions & 2 deletions
@@ -422,9 +422,9 @@ def encode_prompt(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 22 additions & 2 deletions
@@ -665,7 +665,16 @@ def __call__(
                 instead.
             prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
-                will be used instead
+                will be used instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
+                not greater than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
+            true_cfg_scale (`float`, *optional*, defaults to 1.0):
+                When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -709,6 +718,14 @@ def __call__(
                 Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                 IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                 provided, embeddings are computed from the `ip_adapter_image` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -773,7 +790,10 @@ def __call__(
         lora_scale = (
             self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
-        do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
+        has_neg_prompt = negative_prompt is not None or (
+            negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
+        )
+        do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
         (
             prompt_embeds,
             pooled_prompt_embeds,
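
With this change, true classifier-free guidance is also enabled when the caller supplies pre-computed negative embeddings rather than a text `negative_prompt`; the gating condition in isolation (a small sketch, not the pipeline itself):

    def should_run_true_cfg(
        true_cfg_scale,
        negative_prompt=None,
        negative_prompt_embeds=None,
        negative_pooled_prompt_embeds=None,
    ) -> bool:
        has_neg_prompt = negative_prompt is not None or (
            negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
        )
        return true_cfg_scale > 1 and has_neg_prompt


    print(should_run_true_cfg(4.0, negative_prompt="blurry"))  # True
    print(should_run_true_cfg(4.0, negative_prompt_embeds=object(), negative_pooled_prompt_embeds=object()))  # True (new behavior)
    print(should_run_true_cfg(1.0, negative_prompt="blurry"))  # False: scale must exceed 1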
