
Commit 6ce2307

Merge branch 'flux-control-lora' into flux-control-lora-training-script
2 parents: 9a83eff + 3204627

File tree: 4 files changed (+126, -29 lines)

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 5 additions & 1 deletion
@@ -673,6 +673,10 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
     inner_dim = 3072
     mlp_ratio = 4.0
 
+    for k in original_state_dict:
+        if "bias" in k and "img_in" in k:
+            print(f"{k=}")
+
     def swap_scale_shift(weight):
         shift, scale = weight.chunk(2, dim=0)
         new_weight = torch.cat([scale, shift], dim=0)

@@ -750,7 +754,7 @@ def swap_scale_shift(weight):
     for i in range(num_layers):
         block_prefix = f"transformer_blocks.{i}."
 
-        for lora_key, lora_key in zip(["lora_A", "lora_B"], ["lora_A", "lora_B"]):
+        for lora_key in ["lora_A", "lora_B"]:
             # norms
             converted_state_dict[f"{block_prefix}norm1.linear.{lora_key}.weight"] = original_state_dict.pop(
                 f"double_blocks.{i}.img_mod.lin.{lora_key}.weight"

src/diffusers/loaders/lora_pipeline.py

Lines changed: 11 additions & 20 deletions
@@ -427,7 +427,7 @@ def load_lora_into_text_encoder(
             if lora_config_kwargs["lora_bias"]:
                 if is_peft_version("<=", "0.13.2"):
                     raise ValueError(
-                        "You need `peft` 0.13.3 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
+                        "You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
                     )
             else:
                 if is_peft_version("<=", "0.13.2"):
@@ -970,7 +970,7 @@ def load_lora_into_text_encoder(
             if lora_config_kwargs["lora_bias"]:
                 if is_peft_version("<=", "0.13.2"):
                     raise ValueError(
-                        "You need `peft` 0.13.3 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
+                        "You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
                     )
             else:
                 if is_peft_version("<=", "0.13.2"):
@@ -1479,7 +1479,7 @@ def load_lora_into_text_encoder(
             if lora_config_kwargs["lora_bias"]:
                 if is_peft_version("<=", "0.13.2"):
                     raise ValueError(
-                        "You need `peft` 0.13.3 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
+                        "You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
                     )
             else:
                 if is_peft_version("<=", "0.13.2"):
@@ -2108,7 +2108,7 @@ def load_lora_into_text_encoder(
             if lora_config_kwargs["lora_bias"]:
                 if is_peft_version("<=", "0.13.2"):
                     raise ValueError(
-                        "You need `peft` 0.13.3 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
+                        "You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
                     )
             else:
                 if is_peft_version("<=", "0.13.2"):
@@ -2246,7 +2246,7 @@ def fuse_lora(
         ):
             logger.info(
                 "The provided state dict contains normalization layers in addition to LoRA layers. The normalization layers will be directly updated the state_dict of the transformer "
-                'as opposed to the LoRA layers that will co-exist separately until the "fuse_lora()" method is called. That is to say, the normalization layers will always be directly '
+                "as opposed to the LoRA layers that will co-exist separately until the 'fuse_lora()' method is called. That is to say, the normalization layers will always be directly "
                 "fused into the transformer and can only be unfused if `discard_original_layers=True` is passed."
             )
 
@@ -2318,14 +2318,13 @@ def _maybe_expand_transformer_param_shape_or_error_(
 
                 lora_A_weight_name = f"{name}.lora_A.weight"
                 lora_B_weight_name = f"{name}.lora_B.weight"
-                lora_B_bias_name = f"{name}.lora_B.bias"
-
                 if lora_A_weight_name not in state_dict.keys():
                     continue
 
                 in_features = state_dict[lora_A_weight_name].shape[1]
                 out_features = state_dict[lora_B_weight_name].shape[0]
 
+                # This means there's no need for an expansion in the params, so we simply skip.
                 if tuple(module_weight.shape) == (out_features, in_features):
                     continue
 
@@ -2349,27 +2348,19 @@ def _maybe_expand_transformer_param_shape_or_error_(
                 parent_module_name, _, current_module_name = name.rpartition(".")
                 parent_module = transformer.get_submodule(parent_module_name)
 
+                # TODO: consider initializing this under meta device for optims.
                 expanded_module = torch.nn.Linear(
                     in_features, out_features, bias=bias, device=module_weight.device, dtype=module_weight.dtype
                 )
-
+                # Only weights are expanded and biases are not.
                 new_weight = torch.zeros_like(
                     expanded_module.weight.data, device=module_weight.device, dtype=module_weight.dtype
                 )
                 slices = tuple(slice(0, dim) for dim in module_weight.shape)
                 new_weight[slices] = module_weight
                 expanded_module.weight.data.copy_(new_weight)
-
-                bias_present_for_lora_B = lora_B_bias_name in state_dict
-                if bias_present_for_lora_B:
-                    new_bias_shape = state_dict[lora_B_bias_name].shape
-                    if bias and module_bias.shape < new_bias_shape:
-                        new_bias = torch.zeros_like(
-                            expanded_module.bias.data, device=module_bias.device, dtype=module_bias.dtype
-                        )
-                        slices = tuple(slice(0, dim) for dim in module_bias.shape)
-                        new_bias[slices] = module_bias
-                        expanded_module.bias.data.copy_(new_bias)
+                if module_bias is not None:
+                    expanded_module.bias.data.copy_(module_bias)
 
                 setattr(parent_module, current_module_name, expanded_module)
 
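The hunk above simplifies how `_maybe_expand_transformer_param_shape_or_error_` handles biases: only the weight matrix is zero-padded to the expanded shape, and whatever bias the base layer already has is copied over unchanged. A standalone sketch of that behavior with hypothetical sizes (not the actual diffusers helper, which also reads shapes from the LoRA state dict):

```python
import torch

# Hypothetical module: a Linear whose in_features must grow from 4 to 8,
# as happens when a Control LoRA doubles x_embedder's input channels.
module = torch.nn.Linear(4, 16, bias=True)
module_weight = module.weight.data
module_bias = module.bias.data if module.bias is not None else None
bias = module_bias is not None

in_features, out_features = 8, 16
expanded_module = torch.nn.Linear(
    in_features, out_features, bias=bias, device=module_weight.device, dtype=module_weight.dtype
)

# Only weights are expanded (zero-padded into the larger matrix); biases are not.
new_weight = torch.zeros_like(expanded_module.weight.data)
slices = tuple(slice(0, dim) for dim in module_weight.shape)
new_weight[slices] = module_weight
expanded_module.weight.data.copy_(new_weight)
if module_bias is not None:
    expanded_module.bias.data.copy_(module_bias)

assert expanded_module.weight.shape == (16, 8)
assert torch.equal(expanded_module.bias.data, module_bias)
```
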
@@ -2551,7 +2542,7 @@ def load_lora_into_text_encoder(
             if lora_config_kwargs["lora_bias"]:
                 if is_peft_version("<=", "0.13.2"):
                     raise ValueError(
-                        "You need `peft` 0.13.3 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
+                        "You need `peft` 0.14.0 at least to use `bias` in LoRAs. Please upgrade your installation of `peft`."
                     )
             else:
                 if is_peft_version("<=", "0.13.2"):

src/diffusers/loaders/peft.py

Lines changed: 1 addition & 1 deletion
@@ -293,7 +293,7 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans
             if lora_config_kwargs["lora_bias"]:
                 if is_peft_version("<=", "0.13.2"):
                     raise ValueError(
-                        "You need `peft` 0.13.3 at least to use `lora_bias` in LoRAs. Please upgrade your installation of `peft`."
+                        "You need `peft` 0.14.0 at least to use `lora_bias` in LoRAs. Please upgrade your installation of `peft`."
                     )
             else:
                 if is_peft_version("<=", "0.13.2"):
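
Together with the lora_pipeline.py hunks, this corrects the minimum version named in the error message: `lora_bias` support landed in peft 0.14.0, while the actual check (`<= 0.13.2`) is unchanged. A hedged sketch of the guard pattern as used here (the kwargs dict is a stand-in for what diffusers derives from the LoRA state dict):

```python
from diffusers.utils import is_peft_version

# Stand-in for the kwargs diffusers builds while preparing a LoraConfig.
lora_config_kwargs = {"lora_bias": True}

if lora_config_kwargs["lora_bias"]:
    # Loading a LoRA that trains lora_B biases requires a peft release newer than 0.13.2.
    if is_peft_version("<=", "0.13.2"):
        raise ValueError(
            "You need `peft` 0.14.0 at least to use `lora_bias` in LoRAs. Please upgrade your installation of `peft`."
        )
```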

tests/lora/test_lora_layers_flux.py

Lines changed: 109 additions & 7 deletions
@@ -21,9 +21,10 @@
 import numpy as np
 import safetensors.torch
 import torch
+from PIL import Image
 from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
 
-from diffusers import FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
+from diffusers import FlowMatchEulerDiscreteScheduler, FluxControlPipeline, FluxPipeline, FluxTransformer2DModel
 from diffusers.utils import logging
 from diffusers.utils.testing_utils import (
     CaptureLogger,
@@ -159,7 +160,80 @@ def test_with_alpha_in_state_dict(self):
         )
         self.assertFalse(np.allclose(images_lora_with_alpha, images_lora, atol=1e-3, rtol=1e-3))
 
-    # flux control lora specific
+    @unittest.skip("Not supported in Flux.")
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        pass
+
+    @unittest.skip("Not supported in Flux.")
+    def test_modify_padding_mode(self):
+        pass
+
+
+class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
+    pipeline_class = FluxControlPipeline
+    scheduler_cls = FlowMatchEulerDiscreteScheduler()
+    scheduler_kwargs = {}
+    scheduler_classes = [FlowMatchEulerDiscreteScheduler]
+    transformer_kwargs = {
+        "patch_size": 1,
+        "in_channels": 8,
+        "out_channels": 4,
+        "num_layers": 1,
+        "num_single_layers": 1,
+        "attention_head_dim": 16,
+        "num_attention_heads": 2,
+        "joint_attention_dim": 32,
+        "pooled_projection_dim": 32,
+        "axes_dims_rope": [4, 4, 8],
+    }
+    transformer_cls = FluxTransformer2DModel
+    vae_kwargs = {
+        "sample_size": 32,
+        "in_channels": 3,
+        "out_channels": 3,
+        "block_out_channels": (4,),
+        "layers_per_block": 1,
+        "latent_channels": 1,
+        "norm_num_groups": 1,
+        "use_quant_conv": False,
+        "use_post_quant_conv": False,
+        "shift_factor": 0.0609,
+        "scaling_factor": 1.5035,
+    }
+    has_two_text_encoders = True
+    tokenizer_cls, tokenizer_id = CLIPTokenizer, "peft-internal-testing/tiny-clip-text-2"
+    tokenizer_2_cls, tokenizer_2_id = AutoTokenizer, "hf-internal-testing/tiny-random-t5"
+    text_encoder_cls, text_encoder_id = CLIPTextModel, "peft-internal-testing/tiny-clip-text-2"
+    text_encoder_2_cls, text_encoder_2_id = T5EncoderModel, "hf-internal-testing/tiny-random-t5"
+
+    @property
+    def output_shape(self):
+        return (1, 8, 8, 3)
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "control_image": Image.fromarray(np.random.randint(0, 255, size=(32, 32, 3), dtype="uint8")),
+            "num_inference_steps": 4,
+            "guidance_scale": 0.0,
+            "height": 8,
+            "width": 8,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
     def test_with_norm_in_state_dict(self):
         components, _, denoiser_lora_config = self.get_dummy_components(FlowMatchEulerDiscreteScheduler)
         pipe = self.pipeline_class(**components)
@@ -184,7 +258,7 @@ def test_with_norm_in_state_dict(self):
 
             with CaptureLogger(logger) as cap_logger:
                 pipe.load_lora_weights(norm_state_dict)
-            lora_load_output = pipe(**inputs, generator=torch.manual_seed(0))[0]
+                lora_load_output = pipe(**inputs, generator=torch.manual_seed(0))[0]
 
             self.assertTrue(
                 cap_logger.out.startswith(
@@ -211,18 +285,38 @@ def test_with_norm_in_state_dict(self):
             cap_logger.out.startswith("Unsupported keys found in state dict when trying to load normalization layers")
         )
 
-    # flux control lora specific
     def test_lora_parameter_expanded_shapes(self):
         components, _, _ = self.get_dummy_components(FlowMatchEulerDiscreteScheduler)
         pipe = self.pipeline_class(**components)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         _, _, inputs = self.get_dummy_inputs(with_generator=False)
+        original_out = pipe(**inputs, generator=torch.manual_seed(0))[0]
 
         logger = logging.get_logger("diffusers.loaders.lora_pipeline")
         logger.setLevel(logging.DEBUG)
 
+        # Change the transformer config to mimic a real use case.
+        num_channels_without_control = 4
+        transformer = FluxTransformer2DModel.from_config(
+            components["transformer"].config, in_channels=num_channels_without_control
+        ).to(torch_device)
+        self.assertTrue(
+            transformer.config.in_channels == num_channels_without_control,
+            f"Expected {num_channels_without_control} channels in the modified transformer but has {transformer.config.in_channels=}",
+        )
+
+        original_transformer_state_dict = pipe.transformer.state_dict()
+        x_embedder_weight = original_transformer_state_dict.pop("x_embedder.weight")
+        incompatible_keys = transformer.load_state_dict(original_transformer_state_dict, strict=False)
+        self.assertTrue(
+            "x_embedder.weight" in incompatible_keys.missing_keys,
+            "Could not find x_embedder.weight in the missing keys.",
+        )
+        transformer.x_embedder.weight.data.copy_(x_embedder_weight[..., :num_channels_without_control])
+        pipe.transformer = transformer
+
         out_features, in_features = pipe.transformer.x_embedder.weight.shape
         rank = 4
 
@@ -234,11 +328,13 @@ def test_lora_parameter_expanded_shapes(self):
         }
         with CaptureLogger(logger) as cap_logger:
             pipe.load_lora_weights(lora_state_dict, "adapter-1")
+        self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in denoiser")
 
+        lora_out = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+        self.assertFalse(np.allclose(original_out, lora_out, rtol=1e-4, atol=1e-4))
         self.assertTrue(pipe.transformer.x_embedder.weight.data.shape[1] == 2 * in_features)
         self.assertTrue(pipe.transformer.config.in_channels == 2 * in_features)
-
-        pipe.delete_adapters("adapter-1")
         self.assertTrue(cap_logger.out.startswith("Expanding the nn.Linear input/output features for module"))
 
         components, _, _ = self.get_dummy_components(FlowMatchEulerDiscreteScheduler)
@@ -256,14 +352,20 @@ def test_lora_parameter_expanded_shapes(self):
         with self.assertRaises(NotImplementedError):
             pipe.load_lora_weights(lora_state_dict, "adapter-1")
 
-    # flux control lora specific
     @require_peft_version_greater("0.13.2")
     def test_lora_B_bias(self):
        components, _, denoiser_lora_config = self.get_dummy_components(FlowMatchEulerDiscreteScheduler)
         pipe = self.pipeline_class(**components)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
+        # keep track of the bias values of the base layers to perform checks later.
+        bias_values = {}
+        for name, module in pipe.transformer.named_modules():
+            if any(k in name for k in ["to_q", "to_k", "to_v", "to_out.0"]):
+                if module.bias is not None:
+                    bias_values[name] = module.bias.data.clone()
+
         _, _, inputs = self.get_dummy_inputs(with_generator=False)
 
         logger = logging.get_logger("diffusers.loaders.lora_pipeline")
