From 51b01942e15e8630f8ba129e4b2f9346bb327a34 Mon Sep 17 00:00:00 2001 From: linoytsaban Date: Fri, 11 Oct 2024 11:06:15 +0300 Subject: [PATCH 01/10] make lora target modules configurable and change the default --- .../dreambooth/train_dreambooth_lora_flux.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_flux.py b/examples/dreambooth/train_dreambooth_lora_flux.py index fcc11386abcf..8e1f622e6abc 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux.py +++ b/examples/dreambooth/train_dreambooth_lora_flux.py @@ -554,6 +554,13 @@ def parse_args(input_args=None): "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder" ) + parser.add_argument( + "--lora_blocks", + type=str, + default=None, + help=('The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "q_proj,k_proj,v_proj,out_proj" will result in lora training of attention layers only'), + ) + parser.add_argument( "--adam_epsilon", type=float, @@ -1188,12 +1195,18 @@ def main(args): if args.train_text_encoder: text_encoder_one.gradient_checkpointing_enable() - # now we will add new LoRA weights to the attention layers + if args.lora_blocks is not None: + target_modules = [block.strip() for block in args.lora_blocks.split(",")] + else: + target_modules = ["to_k", "to_q", "to_v", "to_out.0", + "add_k_proj", "add_q_proj", "add_v_proj", "to_add_out", "ff.net.0.proj","ff.net.2", "ff_context.net.0.proj","ff_context.net.2"] + + # now we will add new LoRA weights the transformer layers transformer_lora_config = LoraConfig( r=args.rank, lora_alpha=args.rank, init_lora_weights="gaussian", - target_modules=["to_k", "to_q", "to_v", "to_out.0"], + target_modules=target_modules, ) transformer.add_adapter(transformer_lora_config) if args.train_text_encoder: From ad37cdff01c41e1b9beb55a59b94d6532c92532e Mon Sep 17 00:00:00 2001 From: Linoy Date: Fri, 11 Oct 2024 09:05:58 +0000 Subject: [PATCH 02/10] style --- .../dreambooth/train_dreambooth_lora_flux.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_flux.py b/examples/dreambooth/train_dreambooth_lora_flux.py index 8e1f622e6abc..1db05e8c71cc 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux.py +++ b/examples/dreambooth/train_dreambooth_lora_flux.py @@ -558,7 +558,9 @@ def parse_args(input_args=None): "--lora_blocks", type=str, default=None, - help=('The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "q_proj,k_proj,v_proj,out_proj" will result in lora training of attention layers only'), + help=( + 'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. 
- "q_proj,k_proj,v_proj,out_proj" will result in lora training of attention layers only' + ), ) parser.add_argument( @@ -1198,8 +1200,20 @@ def main(args): if args.lora_blocks is not None: target_modules = [block.strip() for block in args.lora_blocks.split(",")] else: - target_modules = ["to_k", "to_q", "to_v", "to_out.0", - "add_k_proj", "add_q_proj", "add_v_proj", "to_add_out", "ff.net.0.proj","ff.net.2", "ff_context.net.0.proj","ff_context.net.2"] + target_modules = [ + "to_k", + "to_q", + "to_v", + "to_out.0", + "add_k_proj", + "add_q_proj", + "add_v_proj", + "to_add_out", + "ff.net.0.proj", + "ff.net.2", + "ff_context.net.0.proj", + "ff_context.net.2", + ] # now we will add new LoRA weights the transformer layers transformer_lora_config = LoraConfig( From ff5511c1b5eaae222bcb756c33661e0be0a8fff3 Mon Sep 17 00:00:00 2001 From: linoytsaban Date: Mon, 14 Oct 2024 21:55:42 +0300 Subject: [PATCH 03/10] make lora target modules configurable and change the default --- .../dreambooth/train_dreambooth_lora_flux.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_flux.py b/examples/dreambooth/train_dreambooth_lora_flux.py index 1db05e8c71cc..7ac6717a9f23 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux.py +++ b/examples/dreambooth/train_dreambooth_lora_flux.py @@ -555,11 +555,11 @@ def parse_args(input_args=None): ) parser.add_argument( - "--lora_blocks", + "--lora_layers", type=str, default=None, help=( - 'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "q_proj,k_proj,v_proj,out_proj" will result in lora training of attention layers only' + 'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only' ), ) @@ -1197,18 +1197,18 @@ def main(args): if args.train_text_encoder: text_encoder_one.gradient_checkpointing_enable() - if args.lora_blocks is not None: - target_modules = [block.strip() for block in args.lora_blocks.split(",")] + if args.lora_layers is not None: + target_modules = [layer.strip() for layer in args.lora_layers.split(",")] else: target_modules = [ - "to_k", - "to_q", - "to_v", - "to_out.0", - "add_k_proj", - "add_q_proj", - "add_v_proj", - "to_add_out", + "attn.to_k", + "attn.to_q", + "attn.to_v", + "attn.to_out.0", + "attn.add_k_proj", + "attn.add_q_proj", + "attn.add_v_proj", + "attn.to_add_out", "ff.net.0.proj", "ff.net.2", "ff_context.net.0.proj", From faa95afdae6c419d57d7d80b386b7c6eb1550234 Mon Sep 17 00:00:00 2001 From: linoytsaban Date: Tue, 15 Oct 2024 12:02:49 +0300 Subject: [PATCH 04/10] fix bug when using prodigy and training te --- examples/dreambooth/train_dreambooth_lora_flux.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_flux.py b/examples/dreambooth/train_dreambooth_lora_flux.py index 7ac6717a9f23..c5f9a4c3e859 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux.py +++ b/examples/dreambooth/train_dreambooth_lora_flux.py @@ -1396,10 +1396,9 @@ def load_model_hook(models, input_dir): f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. " f"When using prodigy only learning_rate is used as the initial learning rate." 
             )
-            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
+            # changes the learning rate of text_encoder_parameters_one to be
             # --learning_rate
             params_to_optimize[1]["lr"] = args.learning_rate
-            params_to_optimize[2]["lr"] = args.learning_rate
 
         optimizer = optimizer_class(
             params_to_optimize,

From faa95afdae6c419d57d7d80b386b7c6eb1550234 Mon Sep 17 00:00:00 2001
From: linoytsaban
Date: Tue, 15 Oct 2024 15:13:06 +0300
Subject: [PATCH 05/10] fix mixed precision training as proposed in
 https://github.com/huggingface/diffusers/pull/9565 for full dreambooth as
 well

---
 examples/dreambooth/train_dreambooth_flux.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth_flux.py b/examples/dreambooth/train_dreambooth_flux.py
index 8e0f4e09a461..e5f66c970c59 100644
--- a/examples/dreambooth/train_dreambooth_flux.py
+++ b/examples/dreambooth/train_dreambooth_flux.py
@@ -161,7 +161,7 @@ def log_validation(
         f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
         f" {args.validation_prompt}."
     )
-    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
+    pipeline = pipeline.to(accelerator.device)
     pipeline.set_progress_bar_config(disable=True)
 
     # run inference
@@ -1580,7 +1580,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 )
 
                 # handle guidance
-                if transformer.config.guidance_embeds:
+                if accelerator.unwrap_model(transformer).config.guidance_embeds:
                     guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
                     guidance = guidance.expand(model_input.shape[0])
                 else:
@@ -1694,6 +1694,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
         # create pipeline
         if not args.train_text_encoder:
             text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
+            text_encoder_one.to(weight_dtype)
+            text_encoder_two.to(weight_dtype)
         else:  # even when training the text encoder we're only training text encoder one
             text_encoder_two = text_encoder_cls_two.from_pretrained(
                 args.pretrained_model_name_or_path,

From 73b0e0f203b957f5d3f295393dbaaab115b5c1dd Mon Sep 17 00:00:00 2001
From: linoytsaban
Date: Mon, 28 Oct 2024 15:20:29 +0200
Subject: [PATCH 06/10] add test and notes

---
 examples/dreambooth/README_flux.md               | 15 ++++++++
 .../dreambooth/test_dreambooth_lora_flux.py      | 34 +++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/examples/dreambooth/README_flux.md b/examples/dreambooth/README_flux.md
index 69dfd241395b..a724ca53b927 100644
--- a/examples/dreambooth/README_flux.md
+++ b/examples/dreambooth/README_flux.md
@@ -170,6 +170,21 @@ accelerate launch train_dreambooth_lora_flux.py \
   --push_to_hub
 ```
 
+### Target Modules
+When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the UNet that relate the image representations with the prompts that describe them.
+More recently, SOTA text-to-image diffusion models replaced the UNet with a diffusion Transformer (DiT). With this change, we may also want to explore
+applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules, we added `--lora_layers`, in which you can specify, as a comma separated string,
+the exact modules for LoRA training. Here are some examples of target modules you can provide:
+- for attention-only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
+- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
+- to train the same modules as in the ostris ai-toolkit / replicate trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear,norm1.linear,norm.linear,proj_mlp,proj_out"`
+> [!NOTE]
+> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
+> **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
+> **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k`
+> [!NOTE]
+> Keep in mind that while training more layers can improve quality and expressiveness, it also increases the size of the output LoRA weights.
+
 ### Text Encoder Training
 
 Alongside the transformer, fine-tuning of the CLIP text encoder is also supported.
diff --git a/examples/dreambooth/test_dreambooth_lora_flux.py b/examples/dreambooth/test_dreambooth_lora_flux.py
index d197c8187b87..567b89d2a860 100644
--- a/examples/dreambooth/test_dreambooth_lora_flux.py
+++ b/examples/dreambooth/test_dreambooth_lora_flux.py
@@ -136,6 +136,40 @@ def test_dreambooth_lora_latent_caching(self):
         starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
         self.assertTrue(starts_with_transformer)
 
+    def test_dreambooth_lora_layers(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --cache_latents
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lora_layers single_transformer_blocks.0.attn.to_k
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+ starts_with_transformer = all(key.startswith("transformer.single_transformer_blocks.0.attn.to_k") for key in lora_state_dict.keys()) + self.assertTrue(starts_with_transformer) + def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" From 8c18e1e5f881d0ab823e3ea4b37df353abae33b9 Mon Sep 17 00:00:00 2001 From: Linoy Date: Mon, 28 Oct 2024 13:22:35 +0000 Subject: [PATCH 07/10] style --- examples/dreambooth/test_dreambooth_lora_flux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/dreambooth/test_dreambooth_lora_flux.py b/examples/dreambooth/test_dreambooth_lora_flux.py index 567b89d2a860..f5660b4fd872 100644 --- a/examples/dreambooth/test_dreambooth_lora_flux.py +++ b/examples/dreambooth/test_dreambooth_lora_flux.py @@ -167,7 +167,9 @@ def test_dreambooth_lora_layers(self): # when not training the text encoder, all the parameters in the state dict should start # with `"transformer"` in their names. - starts_with_transformer = all(key.startswith("transformer.single_transformer_blocks.0.attn.to_k") for key in lora_state_dict.keys()) + starts_with_transformer = all( + key.startswith("transformer.single_transformer_blocks.0.attn.to_k") for key in lora_state_dict.keys() + ) self.assertTrue(starts_with_transformer) def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit(self): From 4f034b9b570623a7e3f9707baa7a2f95871989fd Mon Sep 17 00:00:00 2001 From: linoytsaban Date: Mon, 28 Oct 2024 16:10:03 +0200 Subject: [PATCH 08/10] address sayaks comments --- examples/dreambooth/test_dreambooth_lora_flux.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/dreambooth/test_dreambooth_lora_flux.py b/examples/dreambooth/test_dreambooth_lora_flux.py index f5660b4fd872..9083885ef05a 100644 --- a/examples/dreambooth/test_dreambooth_lora_flux.py +++ b/examples/dreambooth/test_dreambooth_lora_flux.py @@ -37,7 +37,7 @@ class DreamBoothLoRAFlux(ExamplesTestsAccelerate): instance_prompt = "photo" pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe" script_path = "examples/dreambooth/train_dreambooth_lora_flux.py" - + transformer_layer_type = "single_transformer_blocks.0.attn.to_k" def test_dreambooth_lora_flux(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" @@ -150,7 +150,7 @@ def test_dreambooth_lora_layers(self): --cache_latents --learning_rate 5.0e-04 --scale_lr - --lora_layers single_transformer_blocks.0.attn.to_k + --lora_layers {transformer_layer_type} --lr_scheduler constant --lr_warmup_steps 0 --output_dir {tmpdir} @@ -166,7 +166,8 @@ def test_dreambooth_lora_layers(self): self.assertTrue(is_lora) # when not training the text encoder, all the parameters in the state dict should start - # with `"transformer"` in their names. + # with `"transformer"` in their names. 
In this test, only params of
+            # transformer.single_transformer_blocks.0.attn.to_k should be in the state dict
             starts_with_transformer = all(
                 key.startswith("transformer.single_transformer_blocks.0.attn.to_k") for key in lora_state_dict.keys()
             )
             self.assertTrue(starts_with_transformer)

From 2e3a7a1d8514aa27d6c3303dc995bd8d1dadee0a Mon Sep 17 00:00:00 2001
From: Linoy
Date: Mon, 28 Oct 2024 14:11:14 +0000
Subject: [PATCH 09/10] style

---
 examples/dreambooth/test_dreambooth_lora_flux.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/dreambooth/test_dreambooth_lora_flux.py b/examples/dreambooth/test_dreambooth_lora_flux.py
index 9083885ef05a..8544b53966c9 100644
--- a/examples/dreambooth/test_dreambooth_lora_flux.py
+++ b/examples/dreambooth/test_dreambooth_lora_flux.py
@@ -38,6 +38,7 @@ class DreamBoothLoRAFlux(ExamplesTestsAccelerate):
     pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
     script_path = "examples/dreambooth/train_dreambooth_lora_flux.py"
     transformer_layer_type = "single_transformer_blocks.0.attn.to_k"
+
     def test_dreambooth_lora_flux(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             test_args = f"""

From 7c533aee63b939cb379d01522dad9024f56189e7 Mon Sep 17 00:00:00 2001
From: linoytsaban
Date: Mon, 28 Oct 2024 16:29:10 +0200
Subject: [PATCH 10/10] fix test

---
 examples/dreambooth/test_dreambooth_lora_flux.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dreambooth/test_dreambooth_lora_flux.py b/examples/dreambooth/test_dreambooth_lora_flux.py
index 8544b53966c9..a76825e29448 100644
--- a/examples/dreambooth/test_dreambooth_lora_flux.py
+++ b/examples/dreambooth/test_dreambooth_lora_flux.py
@@ -151,7 +151,7 @@ def test_dreambooth_lora_layers(self):
                 --cache_latents
                 --learning_rate 5.0e-04
                 --scale_lr
-                --lora_layers {transformer_layer_type}
+                --lora_layers {self.transformer_layer_type}
                 --lr_scheduler constant
                 --lr_warmup_steps 0
                 --output_dir {tmpdir}
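For quick reference, below is a minimal, self-contained sketch of what the `--lora_layers` handling introduced in this series does at runtime: the comma separated string is split into a list of PEFT `target_modules`, which is then passed to `LoraConfig` just as in `train_dreambooth_lora_flux.py`. The example `lora_layers` value and the rank are illustrative assumptions, not values taken from the patches.

```python
from peft import LoraConfig

# Illustrative --lora_layers value: any comma separated list of module names,
# optionally prefixed with a block path such as `single_transformer_blocks.0`.
lora_layers = "transformer_blocks.0.attn.to_k,single_transformer_blocks.0.attn.to_q,ff.net.2"

# Mirrors the parsing added in the training script: split on commas, strip whitespace.
target_modules = [layer.strip() for layer in lora_layers.split(",")]

# Build the LoRA config the same way the script does (rank 4 is an arbitrary example).
transformer_lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    init_lora_weights="gaussian",
    target_modules=target_modules,
)
print(transformer_lora_config.target_modules)
```

In the training script the resulting config is then attached with `transformer.add_adapter(transformer_lora_config)`, so only the listed modules receive LoRA weights.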