Merge branch 'main' into dreambooth-lora-flux-exploration

linoytsaban · web-flow · commit b791e13dd039 · 2024-10-09T10:50:13.000+03:00
diff --git a/docs/source/en/tutorials/using_peft_for_inference.md b/docs/source/en/tutorials/using_peft_for_inference.md
@@ -75,6 +75,12 @@ image
 
 ![pixel-art](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_12_1.png)
 
+<Tip>
+
+By default, if the most up-to-date versions of PEFT and Transformers are detected, `low_cpu_mem_usage` is set to `True` to speed up the loading time of LoRA checkpoints. 
+
+</Tip>
+
 ## Merge adapters
 
 You can also merge different adapter checkpoints for inference to blend their styles together.
diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py
diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py
@@ -115,6 +115,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                 `default_{i}` where i is the total number of adapters being loaded.
             weight_name (`str`, *optional*, defaults to None):
                 Name of the serialized state dict file.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
+                Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
+                weights.
 
         Example:
 
@@ -142,8 +145,14 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         adapter_name = kwargs.pop("adapter_name", None)
         _pipeline = kwargs.pop("_pipeline", None)
         network_alphas = kwargs.pop("network_alphas", None)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)
         allow_pickle = False
 
+        if low_cpu_mem_usage and is_peft_version("<=", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+
         if use_safetensors is None:
             use_safetensors = True
             allow_pickle = True
@@ -209,6 +218,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                 network_alphas=network_alphas,
                 adapter_name=adapter_name,
                 _pipeline=_pipeline,
+                low_cpu_mem_usage=low_cpu_mem_usage,
             )
         else:
             raise ValueError(
@@ -268,7 +278,9 @@ def _process_custom_diffusion(self, state_dict):
 
         return attn_processors
 
-    def _process_lora(self, state_dict, unet_identifier_key, network_alphas, adapter_name, _pipeline):
+    def _process_lora(
+        self, state_dict, unet_identifier_key, network_alphas, adapter_name, _pipeline, low_cpu_mem_usage
+    ):
         # This method does the following things:
         # 1. Filters the `state_dict` with keys matching  `unet_identifier_key` when using the non-legacy
         #    format. For legacy format no filtering is applied.
@@ -335,9 +347,12 @@ def _process_lora(self, state_dict, unet_identifier_key, network_alphas, adapter
             # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks
             # otherwise loading LoRA weights will lead to an error
             is_model_cpu_offload, is_sequential_cpu_offload = self._optionally_disable_offloading(_pipeline)
+            peft_kwargs = {}
+            if is_peft_version(">=", "0.13.1"):
+                peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
 
-            inject_adapter_in_model(lora_config, self, adapter_name=adapter_name)
-            incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name)
+            inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
+            incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)
 
             if incompatible_keys is not None:
                 # check only for unexpected keys
diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py
@@ -83,14 +83,16 @@ def forward(
         hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         image_rotary_emb=None,
+        joint_attention_kwargs=None,
     ):
         residual = hidden_states
         norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
         mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
-
+        joint_attention_kwargs = joint_attention_kwargs or {}
         attn_output = self.attn(
             hidden_states=norm_hidden_states,
             image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
         )
 
         hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
@@ -161,18 +163,20 @@ def forward(
         encoder_hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         image_rotary_emb=None,
+        joint_attention_kwargs=None,
     ):
         norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
 
         norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
             encoder_hidden_states, emb=temb
         )
-
+        joint_attention_kwargs = joint_attention_kwargs or {}
         # Attention.
         attn_output, context_attn_output = self.attn(
             hidden_states=norm_hidden_states,
             encoder_hidden_states=norm_encoder_hidden_states,
             image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
         )
 
         # Process attention outputs for the `hidden_states`.
@@ -497,6 +501,7 @@ def custom_forward(*inputs):
                     encoder_hidden_states=encoder_hidden_states,
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
                 )
 
             # controlnet residual
@@ -533,6 +538,7 @@ def custom_forward(*inputs):
                     hidden_states=hidden_states,
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
                 )
 
             # controlnet residual
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py
@@ -9,16 +9,17 @@
 
 @dataclass
 class IFPipelineOutput(BaseOutput):
-    """
-    Args:
+    r"""
     Output class for Stable Diffusion pipelines.
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
             List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
             num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
-        nsfw_detected (`List[bool]`)
+        nsfw_detected (`List[bool]`):
             List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content or a watermark. `None` if safety checking could not be performed.
-        watermark_detected (`List[bool]`)
+        watermark_detected (`List[bool]`):
             List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety
             checking could not be performed.
     """
diff --git a/src/diffusers/pipelines/pag/pag_utils.py b/src/diffusers/pipelines/pag/pag_utils.py
@@ -98,7 +98,9 @@ def _get_pag_scale(self, t):
         else:
             return self.pag_scale
 
-    def _apply_perturbed_attention_guidance(self, noise_pred, do_classifier_free_guidance, guidance_scale, t):
+    def _apply_perturbed_attention_guidance(
+        self, noise_pred, do_classifier_free_guidance, guidance_scale, t, return_pred_text=False
+    ):
         r"""
         Apply perturbed attention guidance to the noise prediction.
 
@@ -107,9 +109,11 @@ def _apply_perturbed_attention_guidance(self, noise_pred, do_classifier_free_gui
             do_classifier_free_guidance (bool): Whether to apply classifier-free guidance.
             guidance_scale (float): The scale factor for the guidance term.
             t (int): The current time step.
+            return_pred_text (bool): Whether to return the text noise prediction.
 
         Returns:
-            torch.Tensor: The updated noise prediction tensor after applying perturbed attention guidance.
+            Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: The updated noise prediction tensor after applying
+            perturbed attention guidance, followed by the text noise prediction when `return_pred_text` is `True`.
         """
         pag_scale = self._get_pag_scale(t)
         if do_classifier_free_guidance:
@@ -122,6 +126,8 @@ def _apply_perturbed_attention_guidance(self, noise_pred, do_classifier_free_gui
         else:
             noise_pred_text, noise_pred_perturb = noise_pred.chunk(2)
             noise_pred = noise_pred_text + pag_scale * (noise_pred_text - noise_pred_perturb)
+        if return_pred_text:
+            return noise_pred, noise_pred_text
         return noise_pred
 
     def _prepare_perturbed_attention_guidance(self, cond, uncond, do_classifier_free_guidance):
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py b/src/diffusers/pipelines/pag/pipeline_pag_hunyuandit.py
@@ -893,8 +893,8 @@ def __call__(
 
                 # perform guidance
                 if self.do_perturbed_attention_guidance:
-                    noise_pred = self._apply_perturbed_attention_guidance(
-                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True
                     )
                 elif self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd.py b/src/diffusers/pipelines/pag/pipeline_pag_sd.py
@@ -993,8 +993,8 @@ def __call__(
 
                 # perform guidance
                 if self.do_perturbed_attention_guidance:
-                    noise_pred = self._apply_perturbed_attention_guidance(
-                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True
                     )
 
                 elif self.do_classifier_free_guidance:
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
@@ -1237,8 +1237,8 @@ def __call__(
 
                 # perform guidance
                 if self.do_perturbed_attention_guidance:
-                    noise_pred = self._apply_perturbed_attention_guidance(
-                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True
                     )
 
                 elif self.do_classifier_free_guidance:
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
@@ -1437,8 +1437,8 @@ def denoising_value_valid(dnv):
 
                 # perform guidance
                 if self.do_perturbed_attention_guidance:
-                    noise_pred = self._apply_perturbed_attention_guidance(
-                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True
                     )
                 elif self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
@@ -1649,8 +1649,8 @@ def denoising_value_valid(dnv):
 
                 # perform guidance
                 if self.do_perturbed_attention_guidance:
-                    noise_pred = self._apply_perturbed_attention_guidance(
-                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+                    noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(
+                        noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True
                     )
                 elif self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
@@ -388,6 +388,24 @@ def decorator(test_case):
     return decorator
 
 
+def require_transformers_version_greater(transformers_version):
+    """
+    Decorator marking a test that requires a transformers version strictly greater than `transformers_version`. The
+    test is skipped when transformers is not installed or its version is not greater than the given one.
+    """
+
+    def decorator(test_case):
+        correct_transformers_version = is_transformers_available() and version.parse(
+            version.parse(importlib.metadata.version("transformers")).base_version
+        ) > version.parse(transformers_version)
+        return unittest.skipUnless(
+            correct_transformers_version,
+            f"test requires transformers with the version greater than {transformers_version}",
+        )(test_case)
+
+    return decorator
+
+
 def require_accelerate_version_greater(accelerate_version):
     def decorator(test_case):
         correct_accelerate_version = is_peft_available() and version.parse(
diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py
@@ -33,8 +33,10 @@
     StableDiffusionXLPipeline,
     T2IAdapter,
 )
+from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    CaptureLogger,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
@@ -620,14 +622,18 @@ def test_integration_logits_for_dora_lora(self):
         pipeline.load_lora_weights("hf-internal-testing/dora-trained-on-kohya")
         pipeline.enable_model_cpu_offload()
 
-        images = pipeline(
-            "photo of ohwx dog",
-            num_inference_steps=10,
-            generator=torch.manual_seed(0),
-            output_type="np",
-        ).images
+        logger = logging.get_logger("diffusers.loaders.lora_pipeline")
+        logger.setLevel(30)
+        with CaptureLogger(logger) as cap_logger:
+            images = pipeline(
+                "photo of ohwx dog",
+                num_inference_steps=10,
+                generator=torch.manual_seed(0),
+                output_type="np",
+            ).images
+        assert "It seems like you are using a DoRA checkpoint" in cap_logger.out
 
         predicted_slice = images[0, -3:, -3:, -1].flatten()
-        expected_slice_scale = np.array([0.3932, 0.3742, 0.4429, 0.3737, 0.3504, 0.433, 0.3948, 0.3769, 0.4516])
+        expected_slice_scale = np.array([0.1817, 0.0697, 0.2346, 0.0900, 0.1261, 0.2279, 0.1767, 0.1991, 0.2886])
         max_diff = numpy_cosine_similarity_distance(expected_slice_scale, predicted_slice)
         assert max_diff < 1e-3
diff --git a/tests/lora/utils.py b/tests/lora/utils.py

Original file line number	Diff line number	Diff line change
`@@ -993,8 +993,8 @@ def __call__(`
`993`	`993`
`994`	`994`	`# perform guidance`
`995`	`995`	`if self.do_perturbed_attention_guidance:`
`996`		`- noise_pred = self._apply_perturbed_attention_guidance(`
`997`		`- noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t`
	`996`	`+ noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(`
	`997`	`+ noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True`
`998`	`998`	`)`
`999`	`999`
`1000`	`1000`	`elif self.do_classifier_free_guidance:`
Original file line number	Diff line number	Diff line change
`@@ -1237,8 +1237,8 @@ def __call__(`
`1237`	`1237`
`1238`	`1238`	`# perform guidance`
`1239`	`1239`	`if self.do_perturbed_attention_guidance:`
`1240`		`- noise_pred = self._apply_perturbed_attention_guidance(`
`1241`		`- noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t`
	`1240`	`+ noise_pred, noise_pred_text = self._apply_perturbed_attention_guidance(`
	`1241`	`+ noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t, True`
`1242`	`1242`	`)`
`1243`	`1243`
`1244`	`1244`	`elif self.do_classifier_free_guidance:`