
Commit cb263a7

Merge branch 'main' into ema_model
2 parents 45361d5 + c5376c5

15 files changed: +3008 −6 lines changed

examples/community/README.md

Lines changed: 92 additions & 0 deletions
@@ -73,6 +73,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
| FRESCO V2V Pipeline | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962) | [FRESCO V2V Pipeline](#fresco) | - | [Yifan Zhou](https://github.com/SingleZombie) |
| AnimateDiff IPEX Pipeline | Accelerate AnimateDiff inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [AnimateDiff on IPEX](#animatediff-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
| PIXART-α Controlnet pipeline | Implementation of the ControlNet model for PixArt-α and its Diffusers pipeline | [PIXART-α Controlnet pipeline](#pixart-α-controlnet-pipeline) | - | [Raul Ciotescu](https://github.com/raulc0399/) |
| HunyuanDiT Differential Diffusion Pipeline | Applies [Differential Diffusion](https://github.com/exx8/differential-diffusion) to [HunyuanDiT](https://github.com/huggingface/diffusers/pull/8240). | [HunyuanDiT with Differential Diffusion](#hunyuandit-with-differential-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing) | [Monjoy Choudhury](https://github.com/MnCSSJ4x) |
| [🪆Matryoshka Diffusion Models](https://huggingface.co/papers/2310.15111) | A diffusion process that denoises inputs at multiple resolutions jointly and uses a NestedUNet architecture where features and parameters for small scale inputs are nested within those of the large scales. See [original codebase](https://github.com/apple/ml-mdm). | [🪆Matryoshka Diffusion Models](#matryoshka-diffusion-models) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/pcuenq/mdm) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/1f54875fc7aeaabcf284ebde64820966/matryoshka_hf.ipynb) | [M. Tolga Cangöz](https://github.com/tolgacangoz) |

@@ -4445,3 +4446,94 @@ grid_image.save(grid_dir + "sample.png")
`pag_scale` : guidance scale of PAG (ex: 5.0)

`pag_applied_layers_index` : index of the layer to apply perturbation (ex: ['m0'])

# PIXART-α Controlnet pipeline

[Project](https://pixart-alpha.github.io/) / [GitHub](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/asset/docs/pixart_controlnet.md)

This is the implementation of the ControlNet model and the pipeline for the PixArt-α model, adapted to use the HuggingFace Diffusers library.

## Example Usage

This example uses the PixArt HED ControlNet model, converted from the ControlNet model trained by the authors of the paper.
```py
import sys
import os

import torch
import torchvision.transforms as T

from controlnet_aux import HEDdetector

from diffusers.utils import load_image

from pipeline_pixart_alpha_controlnet import PixArtAlphaControlnetPipeline

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pixart.controlnet_pixart_alpha import PixArtControlNetAdapterModel

controlnet_repo_id = "raulc0399/pixart-alpha-hed-controlnet"

weight_dtype = torch.float16
image_size = 1024

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(0)

# load the ControlNet adapter
controlnet = PixArtControlNetAdapterModel.from_pretrained(
    controlnet_repo_id,
    torch_dtype=weight_dtype,
    use_safetensors=True,
).to(device)

# load the base PixArt-α pipeline with the adapter attached
pipe = PixArtAlphaControlnetPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    controlnet=controlnet,
    torch_dtype=weight_dtype,
    use_safetensors=True,
).to(device)

images_path = "images"
control_image_file = "0_7.jpg"

prompt = "battleship in space, galaxy in background"

control_image_name = control_image_file.split('.')[0]

control_image = load_image(f"{images_path}/{control_image_file}")
print(control_image.size)
width, height = control_image.size  # PIL's Image.size is (width, height)

# HED edge detector used to produce the conditioning image
hed = HEDdetector.from_pretrained("lllyasviel/Annotators")

condition_transform = T.Compose([
    T.Lambda(lambda img: img.convert('RGB')),
    T.CenterCrop([image_size, image_size]),
])

control_image = condition_transform(control_image)
hed_edge = hed(control_image, detect_resolution=image_size, image_resolution=image_size)

hed_edge.save(f"{images_path}/{control_image_name}_hed.jpg")

# run pipeline
with torch.no_grad():
    out = pipe(
        prompt=prompt,
        image=hed_edge,
        num_inference_steps=14,
        guidance_scale=4.5,
        height=image_size,
        width=image_size,
    )

out.images[0].save(f"{images_path}/{control_image_name}_output.jpg")
```
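If you are GPU-memory constrained, a minimal variation is to enable CPU offload instead of moving everything to the device up front. This is a sketch assuming the community pipeline inherits the standard `DiffusionPipeline` offloading support, which depends on how its components are registered:

```py
# Sketch: trade speed for memory by offloading idle submodules to the CPU.
# `enable_model_cpu_offload` requires the `accelerate` package; drop the
# explicit `.to(device)` call when using it.
pipe = PixArtAlphaControlnetPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    controlnet=controlnet,
    torch_dtype=weight_dtype,
    use_safetensors=True,
)
pipe.enable_model_cpu_offload()
```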
The folder `examples/pixart` also contains a script that can be used to train new models.
Please check the script `train_controlnet_hf_diffusers.sh` for how to start the training.

examples/dreambooth/README_flux.md

Lines changed: 15 additions & 0 deletions
@@ -170,6 +170,21 @@ accelerate launch train_dreambooth_lora_flux.py \
  --push_to_hub
```

### Target Modules

When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the UNet that relate the image representations with the prompts that describe them.
More recently, SOTA text-to-image diffusion models replaced the UNet with a diffusion Transformer (DiT). With this change, we may also want to explore
applying LoRA training to different types of layers and blocks. To allow more flexibility and control over the targeted modules, we added `--lora_layers`, in which you can specify, as a comma-separated string,
the exact modules for LoRA training. Here are some examples of target modules you can provide (a short parsing sketch follows the notes below):
- for attention-only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
- to train the same modules as in the ostris ai-toolkit / replicate trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear,norm1.linear,norm.linear,proj_mlp,proj_out"`
> [!NOTE]
> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma-separated string:
> **single DiT blocks**: to target the i-th single transformer block, add the prefix `single_transformer_blocks.i`, e.g. `single_transformer_blocks.i.attn.to_k`
> **MMDiT blocks**: to target the i-th MMDiT block, add the prefix `transformer_blocks.i`, e.g. `transformer_blocks.i.attn.to_k`

> [!NOTE]
> Keep in mind that while training more layers can improve quality and expressiveness, it also increases the size of the output LoRA weights.
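For intuition, here is a minimal sketch of how such a comma-separated string can be turned into PEFT target modules; the values are hypothetical and the exact handling in `train_dreambooth_lora_flux.py` may differ:

```py
from peft import LoraConfig

# hypothetical --lora_layers value, for illustration only
lora_layers = "attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"
target_modules = [layer.strip() for layer in lora_layers.split(",")]

# PEFT matches target_modules as name suffixes, so every block's
# attn.to_k / attn.to_q / ... would be adapted here
transformer_lora_config = LoraConfig(
    r=16,  # LoRA rank; the script exposes this as --rank
    lora_alpha=16,
    init_lora_weights="gaussian",
    target_modules=target_modules,
)
# transformer.add_adapter(transformer_lora_config)
```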
### Text Encoder Training

Alongside the transformer, fine-tuning of the CLIP text encoder is also supported.

examples/dreambooth/README_sd3.md

Lines changed: 34 additions & 0 deletions
@@ -147,6 +147,40 @@ accelerate launch train_dreambooth_lora_sd3.py \
  --push_to_hub
```

### Targeting Specific Blocks & Layers

As image generation models get bigger & more powerful, more fine-tuners find that training only part of the
transformer blocks (sometimes as few as two) can be enough to get great results.
In some cases, it can be even better to keep some of the blocks/layers frozen.

For **SD3.5-Large** specifically, you may find this information useful (taken from the [Stable Diffusion 3.5 Large Fine-tuning Tutorial](https://stabilityai.notion.site/Stable-Diffusion-3-5-Large-Fine-tuning-Tutorial-11a61cdcd1968027a15bdbd7c40be8c6#12461cdcd19680788a23c650dab26b93)):
> [!NOTE]
> A commonly believed heuristic that we verified once again during the construction of the SD3.5 family of models is that later/higher layers (i.e. `30 - 37`)* impact tertiary details more heavily. Conversely, earlier layers (i.e. `12 - 24`)* influence the overall composition/primary form more.
> So, freezing other layers/targeting specific layers is a viable approach.
> `*` These suggested layers are speculative and not 100% guaranteed. The tips here are more or less a general idea for next steps.
> **Photorealism**
> In preliminary testing, we observed that freezing the last few layers of the architecture significantly improved model training when using a photorealistic dataset, preventing the detail degradation a small dataset can introduce.
> **Anatomy preservation**
> To dampen any possible degradation of anatomy, training only the attention layers and **not** the adaptive linear layers could help. For reference, below is one of the transformer blocks.

We've added `--lora_layers` and `--lora_blocks` to make the LoRA-trained modules configurable.
- with `--lora_blocks` you can specify the block numbers for training. E.g. passing
```diff
--lora_blocks "12,13,14,15,16,17,18,19,20,21,22,23,24,30,31,32,33,34,35,36,37"
```
will trigger LoRA training of transformer blocks 12-24 and 30-37. By default, all blocks are trained.
- with `--lora_layers` you can specify the types of layers you wish to train.
By default, the trained layers are
`attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,attn.to_k,attn.to_out.0,attn.to_q,attn.to_v`
If you wish to have a leaner LoRA / train more blocks over layers, you could pass
```diff
+ --lora_layers attn.to_k,attn.to_q,attn.to_v,attn.to_out.0
```
This will reduce LoRA size by roughly 50% for the same rank compared to the default (see the sketch below for how blocks and layers combine).
However, if you're after compact LoRAs, it's our impression that maintaining the default setting for `--lora_layers` and
freezing some of the early blocks is usually better.
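To make the interaction concrete, here is a rough sketch of how block indices and layer names might combine into fully qualified target modules. This is a hypothetical illustration; the script's actual parsing may differ:

```py
# hypothetical values for --lora_blocks and --lora_layers
blocks = [int(b.strip()) for b in "12,13,30".split(",")]
layers = "attn.to_k,attn.to_q,attn.to_v,attn.to_out.0".split(",")

# SD3's MMDiT blocks live under `transformer_blocks.<i>.` in the model,
# so restricting by block means prefixing each layer with its block path
target_modules = [f"transformer_blocks.{i}.{layer}" for i in blocks for layer in layers]

print(target_modules[0])  # transformer_blocks.12.attn.to_k
```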
### Text Encoder Training
Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported.
To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:

examples/dreambooth/test_dreambooth_lora_flux.py

Lines changed: 38 additions & 0 deletions
@@ -37,6 +37,7 @@ class DreamBoothLoRAFlux(ExamplesTestsAccelerate):
    instance_prompt = "photo"
    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
    script_path = "examples/dreambooth/train_dreambooth_lora_flux.py"
    transformer_layer_type = "single_transformer_blocks.0.attn.to_k"

    def test_dreambooth_lora_flux(self):
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -136,6 +137,43 @@ def test_dreambooth_lora_latent_caching(self):
            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
            self.assertTrue(starts_with_transformer)

    def test_dreambooth_lora_layers(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                {self.script_path}
                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
                --instance_data_dir {self.instance_data_dir}
                --instance_prompt {self.instance_prompt}
                --resolution 64
                --train_batch_size 1
                --gradient_accumulation_steps 1
                --max_train_steps 2
                --cache_latents
                --learning_rate 5.0e-04
                --scale_lr
                --lora_layers {self.transformer_layer_type}
                --lr_scheduler constant
                --lr_warmup_steps 0
                --output_dir {tmpdir}
                """.split()

            run_command(self._launch_args + test_args)
            # save_pretrained smoke test
            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))

            # make sure the state_dict has the correct naming in the parameters.
            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
            is_lora = all("lora" in k for k in lora_state_dict.keys())
            self.assertTrue(is_lora)

            # when not training the text encoder, all the parameters in the state dict should start
            # with `"transformer"` in their names. In this test, only params of
            # `transformer.single_transformer_blocks.0.attn.to_k` should be in the state dict.
            starts_with_transformer = all(
                key.startswith("transformer.single_transformer_blocks.0.attn.to_k") for key in lora_state_dict.keys()
            )
            self.assertTrue(starts_with_transformer)

    def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""

examples/dreambooth/test_dreambooth_lora_sd3.py

Lines changed: 71 additions & 0 deletions
@@ -38,6 +38,9 @@ class DreamBoothLoRASD3(ExamplesTestsAccelerate):
    pretrained_model_name_or_path = "hf-internal-testing/tiny-sd3-pipe"
    script_path = "examples/dreambooth/train_dreambooth_lora_sd3.py"

    transformer_block_idx = 0
    layer_type = "attn.to_k"

    def test_dreambooth_lora_sd3(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
@@ -136,6 +139,74 @@ def test_dreambooth_lora_latent_caching(self):
            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
            self.assertTrue(starts_with_transformer)

    def test_dreambooth_lora_block(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                {self.script_path}
                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
                --instance_data_dir {self.instance_data_dir}
                --instance_prompt {self.instance_prompt}
                --resolution 64
                --train_batch_size 1
                --gradient_accumulation_steps 1
                --max_train_steps 2
                --lora_blocks {self.transformer_block_idx}
                --learning_rate 5.0e-04
                --scale_lr
                --lr_scheduler constant
                --lr_warmup_steps 0
                --output_dir {tmpdir}
                """.split()

            run_command(self._launch_args + test_args)
            # save_pretrained smoke test
            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))

            # make sure the state_dict has the correct naming in the parameters.
            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
            is_lora = all("lora" in k for k in lora_state_dict.keys())
            self.assertTrue(is_lora)

            # when not training the text encoder, all the parameters in the state dict should start
            # with `"transformer"` in their names.
            # In this test, only params of transformer block 0 should be in the state dict.
            starts_with_transformer = all(
                key.startswith("transformer.transformer_blocks.0") for key in lora_state_dict.keys()
            )
            self.assertTrue(starts_with_transformer)

    def test_dreambooth_lora_layer(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                {self.script_path}
                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
                --instance_data_dir {self.instance_data_dir}
                --instance_prompt {self.instance_prompt}
                --resolution 64
                --train_batch_size 1
                --gradient_accumulation_steps 1
                --max_train_steps 2
                --lora_layers {self.layer_type}
                --learning_rate 5.0e-04
                --scale_lr
                --lr_scheduler constant
                --lr_warmup_steps 0
                --output_dir {tmpdir}
                """.split()

            run_command(self._launch_args + test_args)
            # save_pretrained smoke test
            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))

            # make sure the state_dict has the correct naming in the parameters.
            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
            is_lora = all("lora" in k for k in lora_state_dict.keys())
            self.assertTrue(is_lora)

            # in this test, only transformer params of attention layers `attn.to_k` should be in the state dict
            has_attn_to_k_only = all("attn.to_k" in key for key in lora_state_dict.keys())
            self.assertTrue(has_attn_to_k_only)

    def test_dreambooth_lora_sd3_checkpointing_checkpoints_total_limit(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""

examples/dreambooth/train_dreambooth_flux.py

Lines changed: 4 additions & 2 deletions
@@ -161,7 +161,7 @@ def log_validation(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
        f" {args.validation_prompt}."
    )
-    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)

    # run inference
@@ -1579,7 +1579,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                )

                # handle guidance
-                if transformer.config.guidance_embeds:
+                if accelerator.unwrap_model(transformer).config.guidance_embeds:
                    guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
                    guidance = guidance.expand(model_input.shape[0])
                else:
@@ -1693,6 +1693,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
            # create pipeline
            if not args.train_text_encoder:
                text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
+                text_encoder_one.to(weight_dtype)
+                text_encoder_two.to(weight_dtype)
            else:  # even when training the text encoder we're only training text encoder one
                text_encoder_two = text_encoder_cls_two.from_pretrained(
                    args.pretrained_model_name_or_path,
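A note on the `guidance_embeds` change above: once `accelerate` prepares the transformer, it may be wrapped (e.g. in `DistributedDataParallel`), and attributes such as `config` then live on the inner module. A minimal sketch of the pattern, under that assumption:

```py
# Sketch: read config attributes through the accelerate wrapper.
# Under DDP the real module sits at `transformer.module`, so accessing
# `transformer.config` directly would fail; unwrap_model handles both cases.
unwrapped = accelerator.unwrap_model(transformer)
if unwrapped.config.guidance_embeds:
    guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
    guidance = guidance.expand(model_input.shape[0])
```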
