
Commit 90428bc

Merge branch 'main' into sayakpaul-patch-1
2 parents: 99f86df + 9fc9c6d

24 files changed: +300 / -45 lines

docs/source/en/api/pipelines/flux.md
Lines changed: 1 addition & 1 deletion

@@ -367,7 +367,7 @@ transformer_8bit = FluxTransformer2DModel.from_pretrained(
 
 pipeline = FluxPipeline.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
-    text_encoder=text_encoder_8bit,
+    text_encoder_2=text_encoder_8bit,
     transformer=transformer_8bit,
     torch_dtype=torch.float16,
     device_map="balanced",
examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py
Lines changed: 1 addition & 1 deletion

@@ -765,7 +765,7 @@ def load_model_hook(models, input_dir):
         lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir)
 
         transformer_state_dict = {
-            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")
+            f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
         }
         transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
         incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
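This was a silent no-op rather than a crash: SD3 LoRA state dicts are keyed under `transformer.`, so the old `unet.` filter matched nothing and the hook restored an empty adapter. A self-contained illustration with a made-up key:

```python
# Hypothetical key for illustration; real SD3 LoRA keys follow this prefixing.
lora_state_dict = {"transformer.blocks.0.attn.to_q.lora_A.weight": "w"}

old = {k.replace("transformer.", ""): v for k, v in lora_state_dict.items() if k.startswith("unet.")}
new = {k.replace("transformer.", ""): v for k, v in lora_state_dict.items() if k.startswith("transformer.")}

assert old == {}  # old filter: nothing matched, weights silently dropped
assert new == {"blocks.0.attn.to_q.lora_A.weight": "w"}  # fixed filter
```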

setup.py
Lines changed: 2 additions & 0 deletions

@@ -135,6 +135,7 @@
     "transformers>=4.41.2",
     "urllib3<=2.0.0",
     "black",
+    "phonemizer",
 ]
 
 # this is a lookup table with items like:

@@ -227,6 +228,7 @@ def run(self):
     "scipy",
     "torchvision",
     "transformers",
+    "phonemizer",
 )
 extras["torch"] = deps_list("torch", "accelerate")
 
src/diffusers/dependency_versions_table.py
Lines changed: 1 addition & 0 deletions

@@ -43,4 +43,5 @@
     "transformers": "transformers>=4.41.2",
     "urllib3": "urllib3<=2.0.0",
     "black": "black",
+    "phonemizer": "phonemizer",
 }
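Why the same pin appears twice: `dependency_versions_table.py` is auto-generated from the `_deps` list in `setup.py` (regenerated with `make deps_table_update`), so the two `phonemizer` entries are expected to move in lockstep.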

src/diffusers/loaders/peft.py
Lines changed: 11 additions & 9 deletions

@@ -300,15 +300,17 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans
         try:
             inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
             incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
-        except RuntimeError as e:
-            for module in self.modules():
-                if isinstance(module, BaseTunerLayer):
-                    active_adapters = module.active_adapters
-                    for active_adapter in active_adapters:
-                        if adapter_name in active_adapter:
-                            module.delete_adapter(adapter_name)
-
-            self.peft_config.pop(adapter_name)
+        except Exception as e:
+            # In case `inject_adapter_in_model()` was unsuccessful even before injecting the `peft_config`.
+            if hasattr(self, "peft_config"):
+                for module in self.modules():
+                    if isinstance(module, BaseTunerLayer):
+                        active_adapters = module.active_adapters
+                        for active_adapter in active_adapters:
+                            if adapter_name in active_adapter:
+                                module.delete_adapter(adapter_name)
+
+                self.peft_config.pop(adapter_name)
             logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}")
             raise
 
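The two changes work together: catching `Exception` instead of only `RuntimeError` covers more of the ways `inject_adapter_in_model()` can fail, and the `hasattr(self, "peft_config")` guard prevents a secondary `AttributeError` when injection failed before `peft_config` was ever attached. A minimal sketch of the pattern (not the diffusers implementation):

```python
# Hardened rollback sketch around a hypothetical `inject` callable:
# clean up only state that actually exists, then re-raise the original error.
def load_adapter_safely(model, adapter_name, inject):
    try:
        inject(model, adapter_name)
    except Exception:
        # Injection may fail before `peft_config` is attached to the model.
        if hasattr(model, "peft_config"):
            model.peft_config.pop(adapter_name, None)
        raise
```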

src/diffusers/loaders/single_file_utils.py
Lines changed: 8 additions & 3 deletions

@@ -186,6 +186,7 @@
     "inpainting": 512,
     "inpainting_v2": 512,
     "controlnet": 512,
+    "instruct-pix2pix": 512,
     "v2": 768,
     "v1": 512,
 }

@@ -605,10 +606,14 @@ def infer_diffusers_model_type(checkpoint):
     if any(
         g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
     ):
-        if checkpoint["img_in.weight"].shape[1] == 384:
-            model_type = "flux-fill"
+        if "model.diffusion_model.img_in.weight" in checkpoint:
+            key = "model.diffusion_model.img_in.weight"
+        else:
+            key = "img_in.weight"
 
-        elif checkpoint["img_in.weight"].shape[1] == 128:
+        if checkpoint[key].shape[1] == 384:
+            model_type = "flux-fill"
+        elif checkpoint[key].shape[1] == 128:
             model_type = "flux-depth"
         else:
             model_type = "flux-dev"

src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
Lines changed: 15 additions & 3 deletions

@@ -237,7 +237,7 @@ def disable_vae_slicing(self):
         """
         self.vae.disable_slicing()
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`

@@ -249,11 +249,23 @@ def enable_model_cpu_offload(self, gpu_id=0):
         else:
             raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
 
-        device = torch.device(f"cuda:{gpu_id}")
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{gpu_id or torch_device.index}")
 
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+            device_mod = getattr(torch, device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
 
         model_sequence = [
             self.text_encoder.text_model,
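With the new signature, callers choose the accelerator either via `gpu_id` or via the `device` string (not both), and the cache-clearing step no longer hardcodes CUDA. A hedged usage sketch:

```python
# Usage sketch; "cvssp/audioldm2" is the standard AudioLDM2 checkpoint.
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
pipe.enable_model_cpu_offload(device="cuda:1")  # index carried by the device string
# pipe.enable_model_cpu_offload(gpu_id=1)       # ...or by gpu_id instead
# pipe.enable_model_cpu_offload(gpu_id=1, device="cuda:1")  # raises ValueError
```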

src/diffusers/pipelines/auto_pipeline.py
Lines changed: 4 additions & 0 deletions

@@ -68,6 +68,7 @@
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
+    SanaPAGPipeline,
     StableDiffusion3PAGImg2ImgPipeline,
     StableDiffusion3PAGPipeline,
     StableDiffusionControlNetPAGInpaintPipeline,

@@ -82,6 +83,7 @@
     StableDiffusionXLPAGPipeline,
 )
 from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
+from .sana import SanaPipeline
 from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
 from .stable_diffusion import (
     StableDiffusionImg2ImgPipeline,

@@ -121,6 +123,8 @@
         ("lcm", LatentConsistencyModelPipeline),
         ("pixart-alpha", PixArtAlphaPipeline),
         ("pixart-sigma", PixArtSigmaPipeline),
+        ("sana", SanaPipeline),
+        ("sana-pag", SanaPAGPipeline),
         ("stable-diffusion-pag", StableDiffusionPAGPipeline),
         ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGPipeline),

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
Lines changed: 2 additions & 2 deletions

@@ -404,9 +404,9 @@ def encode_prompt(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
Lines changed: 121 additions & 8 deletions

@@ -17,14 +17,16 @@
 
 import torch
 from transformers import (
+    BaseImageProcessor,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    PreTrainedModel,
     T5EncoderModel,
     T5TokenizerFast,
 )
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
 from ...models.transformers import SD3Transformer2DModel

@@ -159,7 +161,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3ControlNetInpaintingPipeline(
+    DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin
+):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):

@@ -192,13 +196,17 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
         controlnet ([`SD3ControlNetModel`] or `List[SD3ControlNetModel]` or [`SD3MultiControlNetModel`]):
-            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            Provides additional conditioning to the `transformer` during the denoising process. If you set multiple
             ControlNets as a list, the outputs from each ControlNet are added together to create one combined
             additional conditioning.
+        image_encoder (`PreTrainedModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`BaseImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
 
     def __init__(

@@ -215,6 +223,8 @@ def __init__(
         controlnet: Union[
             SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
         ],
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
     ):
         super().__init__()
 

@@ -229,6 +239,8 @@
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(

@@ -410,9 +422,9 @@ def encode_prompt(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.

@@ -775,6 +787,84 @@ def num_timesteps(self):
     def interrupt(self):
         return self._interrupt
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(

@@ -803,6 +893,8 @@ def __call__(
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,

@@ -896,6 +988,12 @@
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.

@@ -1057,7 +1155,22 @@
             ]
             controlnet_keep.append(keeps[0] if isinstance(self.controlnet, SD3ControlNetModel) else keeps)
 
-        # 7. Denoising loop
+        # 7. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 8. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
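Taken together, these changes let the inpainting ControlNet pipeline reuse the SD3 IP-Adapter stack: `SD3IPAdapterMixin` contributes `load_ip_adapter()` and `is_ip_adapter_active`, the new `__init__` arguments carry the image encoder and feature extractor, and the resulting embeddings reach the transformer through `joint_attention_kwargs`. A hedged end-to-end sketch (all repo ids and URLs are placeholders, not from this commit, and must be mutually compatible in practice):

```python
import torch
from diffusers import SD3ControlNetModel, StableDiffusion3ControlNetInpaintingPipeline
from diffusers.utils import load_image

# Placeholder ids for illustration only.
controlnet = SD3ControlNetModel.from_pretrained(
    "alimama-creative/SD3-Controlnet-Inpainting", torch_dtype=torch.float16
)
pipe = StableDiffusion3ControlNetInpaintingPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to("cuda")
pipe.load_ip_adapter("InstantX/SD3.5-Large-IP-Adapter")  # assumed adapter repo

image = pipe(
    prompt="a dog sitting on a park bench",
    control_image=load_image("https://example.com/scene.png"),    # placeholder
    control_mask=load_image("https://example.com/mask.png"),      # placeholder
    ip_adapter_image=load_image("https://example.com/style.png"), # placeholder
).images[0]
```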
