Commit ddc6164

Merge branch 'main' into feat-taylorseer
2 parents d06c6bc + 1b91856 commit ddc6164

File tree: 15 files changed, +729 -96 lines changed

docs/source/en/api/pipelines/flux2.md

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ specific language governing permissions and limitations under the License.
 
 Flux.2 is the recent series of image generation models from Black Forest Labs, preceded by the [Flux.1](./flux.md) series. It is an entirely new model with a new architecture and pre-training done from scratch!
 
-Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux2-dev).
+Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux2).
 
 > [!TIP]
 > Flux2 can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more.
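For reference, a minimal sketch of the quantization path the tip above points to, using optimum-quanto as in the linked blog post. The `Flux2Pipeline` class name and the `black-forest-labs/FLUX.2-dev` checkpoint id are assumptions, not part of this diff.

```python
import torch
from diffusers import Flux2Pipeline  # assumed pipeline class for Flux.2
from optimum.quanto import freeze, qfloat8, quantize

# Assumed checkpoint id; substitute the actual Flux.2 repository.
pipe = Flux2Pipeline.from_pretrained("black-forest-labs/FLUX.2-dev", torch_dtype=torch.bfloat16)

# Quantize the transformer weights to 8-bit float: lower memory, some extra latency.
quantize(pipe.transformer, weights=qfloat8)
freeze(pipe.transformer)

pipe.enable_model_cpu_offload()  # keep idle components on CPU for further savings
image = pipe("a photo of a forest at dawn", num_inference_steps=28).images[0]
image.save("flux2_quantized.png")
```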

examples/dreambooth/README_flux2.md

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 [DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize image generation models given just a few (3~5) images of a subject/concept.
 
-The `train_dreambooth_lora_flux2.py` script shows how to implement the training procedure for [LoRAs](https://huggingface.co/blog/lora) and adapt it for [FLUX.2 [dev]](https://github.com/black-forest-labs/flux2-dev).
+The `train_dreambooth_lora_flux2.py` script shows how to implement the training procedure for [LoRAs](https://huggingface.co/blog/lora) and adapt it for [FLUX.2 [dev]](https://github.com/black-forest-labs/flux2).
 
 > [!NOTE]
 > **Memory consumption**

examples/text_to_image/train_text_to_image_lora.py

Lines changed: 61 additions & 13 deletions
@@ -37,7 +37,7 @@
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from peft import LoraConfig
-from peft.utils import get_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer
@@ -46,7 +46,12 @@
 from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import cast_training_params, compute_snr
-from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module
@@ -708,6 +713,56 @@ def collate_fn(examples):
         num_workers=args.dataloader_num_workers,
     )
 
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            unet_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"Unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionPipeline.save_lora_weights(
+                save_directory=output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                safe_serialization=True,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        # returns a tuple of state dictionary and network alphas
+        lora_state_dict, network_alphas = StableDiffusionPipeline.lora_state_dict(input_dir)
+
+        unet_state_dict = {f"{k.replace('unet.', '')}": v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            # throw warning if some unexpected keys are found and continue loading
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        # Make sure the trainable params are in float32
+        if args.mixed_precision in ["fp16"]:
+            cast_training_params([unet_], dtype=torch.float32)
+
     # Scheduler and math around the number of training steps.
     # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
     num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
@@ -732,6 +787,10 @@ def collate_fn(examples):
         unet, optimizer, train_dataloader, lr_scheduler
     )
 
+    # Register the hooks for efficient saving and loading of LoRA weights
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
@@ -906,17 +965,6 @@ def collate_fn(examples):
                         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                         accelerator.save_state(save_path)
 
-                        unwrapped_unet = unwrap_model(unet)
-                        unet_lora_state_dict = convert_state_dict_to_diffusers(
-                            get_peft_model_state_dict(unwrapped_unet)
-                        )
-
-                        StableDiffusionPipeline.save_lora_weights(
-                            save_directory=save_path,
-                            unet_lora_layers=unet_lora_state_dict,
-                            safe_serialization=True,
-                        )
-
                         logger.info(f"Saved state to {save_path}")
 
             logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
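For context, a minimal inference-side sketch (not part of this commit) of consuming the LoRA weights that `save_model_hook` now writes into each checkpoint directory via `StableDiffusionPipeline.save_lora_weights`; the base model id and the checkpoint path are illustrative.

```python
import torch
from diffusers import StableDiffusionPipeline

# Illustrative base model; use whatever --pretrained_model_name_or_path the training run used.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Each accelerator.save_state() checkpoint now contains the LoRA weights written by the hook,
# so an intermediate checkpoint directory can be loaded directly.
pipe.load_lora_weights("sd-model-finetuned-lora/checkpoint-500")

image = pipe("a watercolor portrait of a corgi", num_inference_steps=30).images[0]
image.save("lora_sample.png")
```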

src/diffusers/models/transformers/transformer_z_image.py

Lines changed: 17 additions & 3 deletions
@@ -69,7 +69,10 @@ def timestep_embedding(t, dim, max_period=10000):
 
     def forward(self, t):
         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
-        t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype))
+        weight_dtype = self.mlp[0].weight.dtype
+        if weight_dtype.is_floating_point:
+            t_freq = t_freq.to(weight_dtype)
+        t_emb = self.mlp(t_freq)
         return t_emb
 
 
@@ -126,6 +129,10 @@ def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
         dtype = query.dtype
         query, key = query.to(dtype), key.to(dtype)
 
+        # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
+        if attention_mask is not None and attention_mask.ndim == 2:
+            attention_mask = attention_mask[:, None, None, :]
+
         # Compute joint attention
         hidden_states = dispatch_attention_fn(
             query,
@@ -306,6 +313,10 @@ def __call__(self, ids: torch.Tensor):
         if self.freqs_cis is None:
             self.freqs_cis = self.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta)
             self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
+        else:
+            # Ensure freqs_cis are on the same device as ids
+            if self.freqs_cis[0].device != device:
+                self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
 
         result = []
         for i in range(len(self.axes_dims)):
@@ -317,6 +328,8 @@ def __call__(self, ids: torch.Tensor):
 class ZImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     _supports_gradient_checkpointing = True
     _no_split_modules = ["ZImageTransformerBlock"]
+    _repeated_blocks = ["ZImageTransformerBlock"]
+    _skip_layerwise_casting_patterns = ["t_embedder", "cap_embedder"]  # precision sensitive layers
 
     @register_to_config
     def __init__(
@@ -553,8 +566,6 @@ def forward(
         t = t * self.t_scale
         t = self.t_embedder(t)
 
-        adaln_input = t
-
         (
             x,
             cap_feats,
@@ -572,6 +583,9 @@ def forward(
 
         x = torch.cat(x, dim=0)
         x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x)
+
+        # Match t_embedder output dtype to x for layerwise casting compatibility
+        adaln_input = t.type_as(x)
         x[torch.cat(x_inner_pad_mask)] = self.x_pad_token
         x = list(x.split(x_item_seqlens, dim=0))
         x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split(x_item_seqlens, dim=0))
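The `_skip_layerwise_casting_patterns` and dtype changes above are what make the model compatible with layerwise casting; a minimal sketch of enabling it follows. The checkpoint id is a placeholder, and a top-level `ZImageTransformer2DModel` export is assumed.

```python
import torch
from diffusers import ZImageTransformer2DModel  # assumed top-level export

# Placeholder repo id; substitute a real Z-Image checkpoint.
transformer = ZImageTransformer2DModel.from_pretrained(
    "<z-image-checkpoint>", subfolder="transformer", torch_dtype=torch.bfloat16
)

# Store most weights in fp8 while the precision-sensitive t_embedder / cap_embedder
# (matched by _skip_layerwise_casting_patterns) stay in the compute dtype.
transformer.enable_layerwise_casting(
    storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
)
```

`_repeated_blocks` similarly marks the transformer blocks for regional compilation (`compile_repeated_blocks`) in recent diffusers releases.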

src/diffusers/pipelines/flux2/pipeline_flux2.py

Lines changed: 0 additions & 1 deletion
@@ -861,7 +861,6 @@ def __call__(
         if output_type == "latent":
             image = latents
         else:
-            torch.save({"pred": latents}, "pred_d.pt")
             latents = self._unpack_latents_with_ids(latents, latent_ids)
 
             latents_bn_mean = self.vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype)

src/diffusers/pipelines/z_image/pipeline_z_image.py

Lines changed: 14 additions & 25 deletions
@@ -165,21 +165,16 @@ def encode_prompt(
         self,
         prompt: Union[str, List[str]],
         device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-        num_images_per_prompt: int = 1,
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         prompt_embeds: Optional[List[torch.FloatTensor]] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         max_sequence_length: int = 512,
-        lora_scale: Optional[float] = None,
     ):
         prompt = [prompt] if isinstance(prompt, str) else prompt
         prompt_embeds = self._encode_prompt(
             prompt=prompt,
             device=device,
-            dtype=dtype,
-            num_images_per_prompt=num_images_per_prompt,
             prompt_embeds=prompt_embeds,
             max_sequence_length=max_sequence_length,
         )
@@ -193,8 +188,6 @@ def encode_prompt(
             negative_prompt_embeds = self._encode_prompt(
                 prompt=negative_prompt,
                 device=device,
-                dtype=dtype,
-                num_images_per_prompt=num_images_per_prompt,
                 prompt_embeds=negative_prompt_embeds,
                 max_sequence_length=max_sequence_length,
             )
@@ -206,12 +199,9 @@ def _encode_prompt(
         self,
         prompt: Union[str, List[str]],
         device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-        num_images_per_prompt: int = 1,
         prompt_embeds: Optional[List[torch.FloatTensor]] = None,
         max_sequence_length: int = 512,
     ) -> List[torch.FloatTensor]:
-        assert num_images_per_prompt == 1
         device = device or self._execution_device
 
         if prompt_embeds is not None:
@@ -417,8 +407,6 @@ def __call__(
                 f"Please adjust the width to a multiple of {vae_scale}."
             )
 
-        assert self.dtype == torch.bfloat16
-        dtype = self.dtype
         device = self._execution_device
 
         self._guidance_scale = guidance_scale
@@ -434,10 +422,6 @@ def __call__(
         else:
             batch_size = len(prompt_embeds)
 
-        lora_scale = (
-            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
-        )
-
         # If prompt_embeds is provided and prompt is None, skip encoding
         if prompt_embeds is not None and prompt is None:
             if self.do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -455,11 +439,8 @@ def __call__(
             do_classifier_free_guidance=self.do_classifier_free_guidance,
             prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
-            dtype=dtype,
             device=device,
-            num_images_per_prompt=num_images_per_prompt,
             max_sequence_length=max_sequence_length,
-            lora_scale=lora_scale,
         )
 
         # 4. Prepare latent variables
@@ -475,6 +456,14 @@ def __call__(
             generator,
             latents,
         )
+
+        # Repeat prompt_embeds for num_images_per_prompt
+        if num_images_per_prompt > 1:
+            prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
+            if self.do_classifier_free_guidance and negative_prompt_embeds:
+                negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
+
+        actual_batch_size = batch_size * num_images_per_prompt
         image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
 
         # 5. Prepare timesteps
@@ -523,12 +512,12 @@ def __call__(
                 apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
 
                 if apply_cfg:
-                    latents_typed = latents if latents.dtype == dtype else latents.to(dtype)
+                    latents_typed = latents.to(self.transformer.dtype)
                     latent_model_input = latents_typed.repeat(2, 1, 1, 1)
                     prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
                     timestep_model_input = timestep.repeat(2)
                 else:
-                    latent_model_input = latents if latents.dtype == dtype else latents.to(dtype)
+                    latent_model_input = latents.to(self.transformer.dtype)
                     prompt_embeds_model_input = prompt_embeds
                     timestep_model_input = timestep
 
@@ -543,11 +532,11 @@ def __call__(
 
                 if apply_cfg:
                     # Perform CFG
-                    pos_out = model_out_list[:batch_size]
-                    neg_out = model_out_list[batch_size:]
+                    pos_out = model_out_list[:actual_batch_size]
+                    neg_out = model_out_list[actual_batch_size:]
 
                     noise_pred = []
-                    for j in range(batch_size):
+                    for j in range(actual_batch_size):
                         pos = pos_out[j].float()
                         neg = neg_out[j].float()
 
@@ -588,11 +577,11 @@ def __call__(
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
-        latents = latents.to(dtype)
         if output_type == "latent":
             image = latents
 
         else:
+            latents = latents.to(self.vae.dtype)
            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
 
             image = self.vae.decode(latents, return_dict=False)[0]
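A minimal usage sketch of the `num_images_per_prompt` support added above; the `ZImagePipeline` class name follows the file name and the checkpoint id is a placeholder, both assumptions rather than part of this diff.

```python
import torch
from diffusers import ZImagePipeline  # assumed class name for pipeline_z_image.py

# Placeholder repo id; substitute a real Z-Image checkpoint.
pipe = ZImagePipeline.from_pretrained("<z-image-checkpoint>", torch_dtype=torch.bfloat16).to("cuda")

# Prompt embeddings are now repeated internally, and the CFG split indexes with
# actual_batch_size = batch_size * num_images_per_prompt.
images = pipe(
    "a watercolor fox in the snow",
    num_images_per_prompt=2,
    num_inference_steps=9,
    guidance_scale=0.0,
).images

for idx, image in enumerate(images):
    image.save(f"z_image_{idx}.png")
```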

src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py

Lines changed: 20 additions & 1 deletion
@@ -429,7 +429,22 @@ def multistep_dpm_solver_second_order_update(
         return x_t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -452,6 +467,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
     def _init_step_index(self, timestep):
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
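A minimal sketch exercising the newly documented lookup; `CosineDPMSolverMultistepScheduler` is the class this file defines, constructed here with its default config.

```python
from diffusers import CosineDPMSolverMultistepScheduler

scheduler = CosineDPMSolverMultistepScheduler()
scheduler.set_timesteps(num_inference_steps=20)

# index_for_timestep maps a timestep value back to its position in scheduler.timesteps.
t = scheduler.timesteps[5]
print(scheduler.index_for_timestep(t))  # expected: 5 for a strictly decreasing schedule
```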
