
Commit f73a38a

Merge branch 'huggingface:main' into enable_xpu

2 parents ed55b90 + dac623b

65 files changed: +3543 −2868 lines


docs/source/en/api/models/controlnet.md

Lines changed: 2 additions & 2 deletions
@@ -39,12 +39,12 @@ pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=contro
 
 ## ControlNetOutput
 
-[[autodoc]] models.controlnet.ControlNetOutput
+[[autodoc]] models.controlnets.controlnet.ControlNetOutput
 
 ## FlaxControlNetModel
 
 [[autodoc]] FlaxControlNetModel
 
 ## FlaxControlNetOutput
 
-[[autodoc]] models.controlnet_flax.FlaxControlNetOutput
+[[autodoc]] models.controlnets.controlnet_flax.FlaxControlNetOutput

docs/source/en/api/models/controlnet_sd3.md

Lines changed: 1 addition & 1 deletion
@@ -38,5 +38,5 @@ pipe = StableDiffusion3ControlNetPipeline.from_pretrained("stabilityai/stable-di
 
 ## SD3ControlNetOutput
 
-[[autodoc]] models.controlnet_sd3.SD3ControlNetOutput
+[[autodoc]] models.controlnets.controlnet_sd3.SD3ControlNetOutput
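Both documentation fixes track the same refactor: the ControlNet modules moved from flat files under `models/` into a `models/controlnets/` subpackage, so the `[[autodoc]]` paths must point at the new locations. A minimal sketch of the new module paths (assuming, as the diff implies, that the top-level public API is unchanged by the move):

```python
# New module locations after the controlnets refactor (per this commit):
from diffusers.models.controlnets.controlnet import ControlNetOutput
from diffusers.models.controlnets.controlnet_flax import FlaxControlNetOutput

# Top-level imports should be unaffected by the move:
from diffusers import ControlNetModel, FlaxControlNetModel
```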

examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py

Lines changed: 34 additions & 2 deletions
@@ -39,7 +39,7 @@
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
-from peft import LoraConfig
+from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
 from PIL import Image
 from PIL.ImageOps import exif_transpose
@@ -59,12 +59,13 @@
 )
 from diffusers.loaders import StableDiffusionLoraLoaderMixin
 from diffusers.optimization import get_scheduler
-from diffusers.training_utils import compute_snr
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr
 from diffusers.utils import (
     check_min_version,
     convert_all_state_dict_to_peft,
     convert_state_dict_to_diffusers,
     convert_state_dict_to_kohya,
+    convert_unet_state_dict_to_peft,
     is_wandb_available,
 )
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
@@ -1319,6 +1320,37 @@ def load_model_hook(models, input_dir):
             else:
                 raise ValueError(f"unexpected save model: {model.__class__}")
 
+        lora_state_dict, network_alphas = StableDiffusionPipeline.lora_state_dict(input_dir)
+
+        unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        if args.train_text_encoder:
+            # Do we need to call `scale_lora_layers()` here?
+            _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_)
+
+            _set_state_dict_into_text_encoder(
+                lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_one_
+            )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [unet_]
+            if args.train_text_encoder:
+                models.extend([text_encoder_one_])
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
         lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir)
         StableDiffusionLoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_)
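The new block restores LoRA weights through peft (`set_peft_model_state_dict`) rather than relying only on the loader mixin. For context, a minimal sketch of how such a hook is wired into Accelerate checkpointing; the registration calls are the standard pattern in these training scripts, not part of this diff:

```python
from accelerate import Accelerator

accelerator = Accelerator()

def load_model_hook(models, input_dir):
    # `models` holds the objects passed to accelerator.prepare(); the hook
    # pops them off and restores their adapter weights from `input_dir`,
    # as the diff above does with set_peft_model_state_dict.
    ...

accelerator.register_load_state_pre_hook(load_model_hook)
# Later, accelerator.load_state("checkpoint-500") invokes the hook.
```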

examples/community/matryoshka.py

Lines changed: 4 additions & 4 deletions
@@ -868,7 +868,7 @@ def forward(
         blocks = list(zip(self.resnets, self.attentions))
 
         for i, (resnet, attn) in enumerate(blocks):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -1029,7 +1029,7 @@ def forward(
 
         hidden_states = self.resnets[0](hidden_states, temb)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -1191,7 +1191,7 @@ def forward(
 
             hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -1364,7 +1364,7 @@ def forward(
 
         # Blocks
         for block in self.transformer_blocks:
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
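The recurring change in this file (and in the PixArt ControlNet script below) swaps `self.training` for `torch.is_grad_enabled()` in the gradient-checkpointing guard. The presumable rationale: checkpointing only pays off while autograd is recording, and a module can be in training mode inside a `torch.no_grad()` block. A minimal sketch of the distinction:

```python
import torch

model = torch.nn.Linear(4, 4)
model.train()

with torch.no_grad():
    print(model.training)           # True  -> old guard would still checkpoint
    print(torch.is_grad_enabled())  # False -> new guard skips checkpointing
```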

examples/dreambooth/train_dreambooth_lora_sdxl.py

Lines changed: 22 additions & 14 deletions
@@ -67,6 +67,7 @@
     convert_state_dict_to_diffusers,
     convert_state_dict_to_kohya,
     convert_unet_state_dict_to_peft,
+    is_peft_version,
     is_wandb_available,
 )
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
@@ -1183,26 +1184,33 @@ def main(args):
         text_encoder_one.gradient_checkpointing_enable()
         text_encoder_two.gradient_checkpointing_enable()
 
+    def get_lora_config(rank, use_dora, target_modules):
+        base_config = {
+            "r": rank,
+            "lora_alpha": rank,
+            "init_lora_weights": "gaussian",
+            "target_modules": target_modules,
+        }
+        if use_dora:
+            if is_peft_version("<", "0.9.0"):
+                raise ValueError(
+                    "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`."
+                )
+            else:
+                base_config["use_dora"] = True
+
+        return LoraConfig(**base_config)
+
     # now we will add new LoRA weights to the attention layers
-    unet_lora_config = LoraConfig(
-        r=args.rank,
-        use_dora=args.use_dora,
-        lora_alpha=args.rank,
-        init_lora_weights="gaussian",
-        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
-    )
+    unet_target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
+    unet_lora_config = get_lora_config(rank=args.rank, use_dora=args.use_dora, target_modules=unet_target_modules)
     unet.add_adapter(unet_lora_config)
 
     # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
     # So, instead, we monkey-patch the forward calls of its attention-blocks.
     if args.train_text_encoder:
-        text_lora_config = LoraConfig(
-            r=args.rank,
-            use_dora=args.use_dora,
-            lora_alpha=args.rank,
-            init_lora_weights="gaussian",
-            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
-        )
+        text_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
+        text_lora_config = get_lora_config(rank=args.rank, use_dora=args.use_dora, target_modules=text_target_modules)
         text_encoder_one.add_adapter(text_lora_config)
         text_encoder_two.add_adapter(text_lora_config)
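The new helper centralizes the two previously duplicated `LoraConfig` blocks and gates DoRA behind a peft version check. Expanded, a DoRA-enabled config it produces looks like this (a sketch: rank 16 stands in for `args.rank`, and `peft>=0.9.0` is assumed):

```python
from peft import LoraConfig

unet_lora_config = LoraConfig(
    r=16,  # args.rank in the script
    lora_alpha=16,
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    use_dora=True,  # only set when requested; requires peft >= 0.9.0
)
```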

examples/research_projects/pixart/controlnet_pixart_alpha.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ def forward(
 
         # 2. Blocks
         for block_index, block in enumerate(self.transformer.transformer_blocks):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
                 # rc todo: for training and gradient checkpointing
                 print("Gradient checkpointing is not supported for the controlnet transformer model, yet.")
                 exit(1)

examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py

Lines changed: 3 additions & 3 deletions
@@ -229,11 +229,11 @@ def forward(
                 In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
                 you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
             return_dict (`bool`, defaults to `True`):
-                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.controlnets.controlnet.ControlNetOutput`] instead of a plain tuple.
 
         Returns:
-            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
-                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
+            [`~models.controlnets.controlnet.ControlNetOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.controlnets.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
                 returned where the first element is the sample tensor.
         """
         # check channel order
# check channel order

src/diffusers/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -487,7 +487,7 @@
 
 
     else:
-        _import_structure["models.controlnet_flax"] = ["FlaxControlNetModel"]
+        _import_structure["models.controlnets.controlnet_flax"] = ["FlaxControlNetModel"]
         _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"]
         _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
         _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"]
@@ -914,7 +914,7 @@
     except OptionalDependencyNotAvailable:
         from .utils.dummy_flax_objects import *  # noqa F403
     else:
-        from .models.controlnet_flax import FlaxControlNetModel
+        from .models.controlnets.controlnet_flax import FlaxControlNetModel
         from .models.modeling_flax_utils import FlaxModelMixin
         from .models.unets.unet_2d_condition_flax import FlaxUNet2DConditionModel
         from .models.vae_flax import FlaxAutoencoderKL
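The `_import_structure` entries feed diffusers' lazy-import machinery, so the dotted keys must match the real module paths; that is why the refactor touches both the structure dict and the eager-import branch. Roughly how such a mapping resolves at attribute-access time (a simplified sketch using module-level `__getattr__` from PEP 562; diffusers' actual implementation is its `_LazyModule` helper):

```python
import importlib

_import_structure = {
    "models.controlnets.controlnet_flax": ["FlaxControlNetModel"],
}

def __getattr__(name):
    # Import the owning submodule the first time the name is requested.
    for module_path, exported in _import_structure.items():
        if name in exported:
            module = importlib.import_module(f".{module_path}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```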

src/diffusers/loaders/ip_adapter.py

Lines changed: 8 additions & 6 deletions
@@ -33,16 +33,14 @@
 
 
 if is_transformers_available():
-    from transformers import (
-        CLIPImageProcessor,
-        CLIPVisionModelWithProjection,
-    )
+    from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
 
     from ..models.attention_processor import (
         AttnProcessor,
         AttnProcessor2_0,
         IPAdapterAttnProcessor,
         IPAdapterAttnProcessor2_0,
+        IPAdapterXFormersAttnProcessor,
     )
 
 logger = logging.get_logger(__name__)
@@ -284,7 +282,9 @@ def set_ip_adapter_scale(self, scale):
         scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)
 
         for attn_name, attn_processor in unet.attn_processors.items():
-            if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+            if isinstance(
+                attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterXFormersAttnProcessor)
+            ):
                 if len(scale_configs) != len(attn_processor.scale):
                     raise ValueError(
                         f"Cannot assign {len(scale_configs)} scale_configs to "
@@ -342,7 +342,9 @@ def unload_ip_adapter(self):
             )
             attn_procs[name] = (
                 attn_processor_class
-                if isinstance(value, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0))
+                if isinstance(
+                    value, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterXFormersAttnProcessor)
+                )
                 else value.__class__()
             )
         self.unet.set_attn_processor(attn_procs)
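With `IPAdapterXFormersAttnProcessor` added to both `isinstance` checks, per-layer scaling and unloading now also cover pipelines running memory-efficient attention. Illustrative usage of the affected API (the model IDs are examples, not pinned by this commit):

```python
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
pipe.enable_xformers_memory_efficient_attention()  # processors become the *XFormers* variants
pipe.set_ip_adapter_scale(0.6)  # now also matches IPAdapterXFormersAttnProcessor
pipe.unload_ip_adapter()        # likewise recognizes the xFormers processors when unloading
```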

src/diffusers/loaders/unet.py

Lines changed: 9 additions & 4 deletions
@@ -765,6 +765,7 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=F
         from ..models.attention_processor import (
             IPAdapterAttnProcessor,
             IPAdapterAttnProcessor2_0,
+            IPAdapterXFormersAttnProcessor,
         )
 
         if low_cpu_mem_usage:
@@ -804,11 +805,15 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=F
             if cross_attention_dim is None or "motion_modules" in name:
                 attn_processor_class = self.attn_processors[name].__class__
                 attn_procs[name] = attn_processor_class()
-
             else:
-                attn_processor_class = (
-                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
-                )
+                if "XFormers" in str(self.attn_processors[name].__class__):
+                    attn_processor_class = IPAdapterXFormersAttnProcessor
+                else:
+                    attn_processor_class = (
+                        IPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else IPAdapterAttnProcessor
+                    )
                 num_image_text_embeds = []
                 for state_dict in state_dicts:
                     if "proj.weight" in state_dict["image_proj"]:
