Merge branch 'main' into groupwise-offloading

a-r-r-o-w · web-flow · commit 274b84e9b93f · 2025-02-11T16:28:25.000+05:30
diff --git a/src/diffusers/loaders/lora_conversion_utils.py b/src/diffusers/loaders/lora_conversion_utils.py
@@ -519,7 +519,7 @@ def _convert_sd_scripts_to_ai_toolkit(sds_sd):
         remaining_keys = list(sds_sd.keys())
         te_state_dict = {}
         if remaining_keys:
-            if not all(k.startswith("lora_te1") for k in remaining_keys):
+            if not all(k.startswith("lora_te") for k in remaining_keys):
                 raise ValueError(f"Incompatible keys detected: \n\n {', '.join(remaining_keys)}")
             for key in remaining_keys:
                 if not key.endswith("lora_down.weight"):
@@ -558,6 +558,88 @@ def _convert_sd_scripts_to_ai_toolkit(sds_sd):
         new_state_dict = {**ait_sd, **te_state_dict}
         return new_state_dict
 
+    def _convert_mixture_state_dict_to_diffusers(state_dict):
+        """Convert a `lora_transformer_*` state dict (lora_down / lora_up / alpha entries) to diffusers keys.
+
+        Consumed entries are popped from `state_dict` (it is mutated); keys left over at the end raise `ValueError`.
+        Returns a new dict with `transformer.<module>.lora_A.weight` / `.lora_B.weight` entries.
+        """
+        import logging  # function-scope import so this helper does not depend on module-level names
+
+        new_state_dict = {}
+
+        def _convert(original_key, diffusers_key, state_dict, new_state_dict):
+            # Pop the down / up / alpha triplet of one module; a missing entry raises `KeyError`.
+            down_weight = state_dict.pop(f"{original_key}.lora_down.weight")
+            up_weight = state_dict.pop(f"{original_key}.lora_up.weight")
+            alpha = state_dict.pop(f"{original_key}.alpha")
+            lora_rank = down_weight.shape[0]
+
+            # Fold `alpha / rank` into the weights: while the down scale is less than half the up scale,
+            # move a factor of two from the up matrix to the down matrix.
+            scale_down = alpha / lora_rank
+            scale_up = 1.0
+            while scale_down * 2 < scale_up:
+                scale_down *= 2
+                scale_up /= 2
+
+            new_state_dict[f"{diffusers_key}.lora_A.weight"] = down_weight * scale_down
+            new_state_dict[f"{diffusers_key}.lora_B.weight"] = up_weight * scale_up
+
+        all_unique_keys = {
+            k.replace(".lora_down.weight", "").replace(".lora_up.weight", "").replace(".alpha", "") for k in state_dict
+        }
+        all_unique_keys = sorted(all_unique_keys)
+        # Explicit raise instead of `assert`, which is stripped under `python -O`.
+        if not all("lora_transformer_" in k for k in all_unique_keys):
+            raise ValueError(f"Incompatible keys detected: {all_unique_keys=}")
+
+        for k in all_unique_keys:
+            if k.startswith("lora_transformer_single_transformer_blocks_"):
+                i = int(k.split("lora_transformer_single_transformer_blocks_")[-1].split("_")[0])
+                block_key = f"single_transformer_blocks.{i}"
+            elif k.startswith("lora_transformer_transformer_blocks_"):
+                i = int(k.split("lora_transformer_transformer_blocks_")[-1].split("_")[0])
+                block_key = f"transformer_blocks.{i}"
+            else:
+                raise NotImplementedError(f"Unsupported LoRA key: {k}")
+
+            diffusers_key = block_key
+            if "attn_" in k:
+                if "_to_out_0" in k:
+                    diffusers_key += ".attn.to_out.0"
+                elif "_to_add_out" in k:
+                    diffusers_key += ".attn.to_add_out"
+                elif any(p in k for p in ("to_q", "to_k", "to_v", "add_q_proj", "add_k_proj", "add_v_proj")):
+                    diffusers_key += f".attn.{k.split('attn_')[-1]}"
+
+            if diffusers_key == block_key:
+                # No attention projection matched. The block-level key is still converted as before, but it
+                # is reported through `logging` instead of a debug `print` to stdout.
+                logging.getLogger(__name__).warning("No module mapping found for %s, using %s", k, diffusers_key)
+            _convert(k, diffusers_key, state_dict, new_state_dict)
+
+        if len(state_dict) > 0:
+            raise ValueError(
+                f"Expected an empty state dict at this point but its has these keys which couldn't be parsed: {list(state_dict.keys())}."
+            )
+
+        new_state_dict = {f"transformer.{k}": v for k, v in new_state_dict.items()}
+        return new_state_dict
+
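+    # Some checkpoints contain both a `peft` and a non-peft state dict in the same file, for example
+    # https://huggingface.co/sayakpaul/different-lora-from-civitai/tree/main?show_file_info=sharp_detailed_foot.safetensors
+    # In that case only the `peft` entries (keys prefixed with `transformer.`) are kept.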
+    has_peft_state_dict = any(k.startswith("transformer.") for k in state_dict)
+    if has_peft_state_dict:
+        state_dict = {k: v for k, v in state_dict.items() if k.startswith("transformer.")}
+        return state_dict
+    # Another special case: `lora_transformer_*` keys carrying `lora_down` / `lora_up` / `alpha` entries.
+    has_mixture = any(
+        k.startswith("lora_transformer_") and ("lora_down" in k or "lora_up" in k or "alpha" in k) for k in state_dict
+    )
+    if has_mixture:
+        return _convert_mixture_state_dict_to_diffusers(state_dict)
     return _convert_sd_scripts_to_ai_toolkit(state_dict)
 
 
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
@@ -36,11 +36,11 @@
 def prepare_causal_attention_mask(
     num_frames: int, height_width: int, dtype: torch.dtype, device: torch.device, batch_size: int = None
 ) -> torch.Tensor:
-    seq_len = num_frames * height_width
-    mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
-    for i in range(seq_len):
-        i_frame = i // height_width
-        mask[i, : (i_frame + 1) * height_width] = 0
+    indices = torch.arange(1, num_frames + 1, dtype=torch.int32, device=device)  # frame ids 1..num_frames
+    indices_blocks = indices.repeat_interleave(height_width)  # each frame id repeated `height_width` times
+    x, y = torch.meshgrid(indices_blocks, indices_blocks, indexing="xy")  # x: column frame, y: row frame
+    mask = torch.where(x <= y, 0, -float("inf")).to(dtype=dtype)  # 0 where column frame <= row frame, else -inf
+
     if batch_size is not None:
         mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
     return mask
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
@@ -31,6 +31,7 @@
 from huggingface_hub import DDUFEntry, create_repo, split_torch_state_dict_into_shards
 from huggingface_hub.utils import validate_hf_hub_args
 from torch import Tensor, nn
+from typing_extensions import Self
 
 from .. import __version__
 from ..hooks import apply_group_offloading, apply_layerwise_casting
@@ -665,7 +666,7 @@ def dequantize(self):
 
     @classmethod
     @validate_hf_hub_args
-    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs) -> Self:
         r"""
         Instantiate a pretrained PyTorch model from a pretrained model configuration.
 
diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import torch
 
@@ -77,6 +77,9 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
             Video](https://imagen.research.google/video/paper.pdf) paper).
         rho (`float`, *optional*, defaults to 7.0):
             The rho parameter used for calculating the Karras sigma schedule, which is set to 7.0 in the EDM paper [1].
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
     """
 
     _compatibles = []
@@ -92,22 +95,32 @@ def __init__(
         num_train_timesteps: int = 1000,
         prediction_type: str = "epsilon",
         rho: float = 7.0,
+        final_sigmas_type: str = "zero",  # can be "zero" or "sigma_min"
     ):
         if sigma_schedule not in ["karras", "exponential"]:
             raise ValueError(f"Wrong value for provided for `{sigma_schedule=}`.`")
 
         # setable values
         self.num_inference_steps = None
 
-        ramp = torch.linspace(0, 1, num_train_timesteps)
+        sigmas = torch.arange(num_train_timesteps + 1) / num_train_timesteps
         if sigma_schedule == "karras":
-            sigmas = self._compute_karras_sigmas(ramp)
+            sigmas = self._compute_karras_sigmas(sigmas)
         elif sigma_schedule == "exponential":
-            sigmas = self._compute_exponential_sigmas(ramp)
+            sigmas = self._compute_exponential_sigmas(sigmas)
 
         self.timesteps = self.precondition_noise(sigmas)
 
-        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = sigmas[-1]
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        self.sigmas = torch.cat([sigmas, torch.full((1,), fill_value=sigma_last, device=sigmas.device)])
 
         self.is_scale_input_called = False
 
@@ -197,7 +210,12 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.T
         self.is_scale_input_called = True
         return sample
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        device: Union[str, torch.device] = None,
+        sigmas: Optional[Union[torch.Tensor, List[float]]] = None,
+    ):
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -206,19 +224,36 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
                 The number of diffusion steps used when generating samples with a pre-trained model.
             device (`str` or `torch.device`, *optional*):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            sigmas (`Union[torch.Tensor, List[float]]`, *optional*):
+                Custom sigmas to use for the denoising process. If not defined, the default behavior when
+                `num_inference_steps` is passed will be used.
         """
         self.num_inference_steps = num_inference_steps
 
-        ramp = torch.linspace(0, 1, self.num_inference_steps)
+        if sigmas is None:
+            sigmas = torch.linspace(0, 1, self.num_inference_steps)
+        elif isinstance(sigmas, float):
+            sigmas = torch.tensor(sigmas, dtype=torch.float32)
+        else:
+            sigmas = sigmas
         if self.config.sigma_schedule == "karras":
-            sigmas = self._compute_karras_sigmas(ramp)
+            sigmas = self._compute_karras_sigmas(sigmas)
         elif self.config.sigma_schedule == "exponential":
-            sigmas = self._compute_exponential_sigmas(ramp)
+            sigmas = self._compute_exponential_sigmas(sigmas)
 
         sigmas = sigmas.to(dtype=torch.float32, device=device)
         self.timesteps = self.precondition_noise(sigmas)
 
-        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = sigmas[-1]
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        self.sigmas = torch.cat([sigmas, torch.full((1,), fill_value=sigma_last, device=sigmas.device)])
         self._step_index = None
         self._begin_index = None
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
diff --git a/tests/models/autoencoders/test_models_autoencoder_hunyuan_video.py b/tests/models/autoencoders/test_models_autoencoder_hunyuan_video.py
@@ -18,6 +18,7 @@
 import torch
 
 from diffusers import AutoencoderKLHunyuanVideo
+from diffusers.models.autoencoders.autoencoder_kl_hunyuan_video import prepare_causal_attention_mask
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     floats_tensor,
@@ -182,3 +183,28 @@ def test_forward_with_norm_groups(self):
     @unittest.skip("Unsupported test.")
     def test_outputs_equivalence(self):
         pass
+
+    def test_prepare_causal_attention_mask(self):
+        # Regression check: the vectorized mask must match the original row-by-row construction.
+        def prepare_causal_attention_mask_orig(
+            num_frames: int, height_width: int, dtype: torch.dtype, device: torch.device, batch_size: int = None
+        ) -> torch.Tensor:
+            total = num_frames * height_width
+            expected = torch.full((total, total), float("-inf"), dtype=dtype, device=device)
+            for row in range(total):
+                # zero out every column up to the end of the frame that `row` belongs to
+                visible = (row // height_width + 1) * height_width
+                expected[row, :visible] = 0
+            if batch_size is None:
+                return expected
+            return expected.unsqueeze(0).expand(batch_size, -1, -1)
+
+        # test with some odd shapes
+        mask_kwargs = {"num_frames": 31, "height_width": 111, "dtype": torch.float32, "device": torch_device}
+        original_mask = prepare_causal_attention_mask_orig(**mask_kwargs)
+        new_mask = prepare_causal_attention_mask(**mask_kwargs)
+
+        self.assertTrue(
+            torch.allclose(original_mask, new_mask),
+            "Causal attention mask should be the same",
+        )
diff --git a/tests/models/autoencoders/test_models_autoencoder_oobleck.py b/tests/models/autoencoders/test_models_autoencoder_oobleck.py
@@ -114,6 +114,12 @@ def test_forward_with_norm_groups(self):
     def test_set_attn_processor_for_determinism(self):
         return
 
+    @unittest.skip(
+        "Test not supported because of 'weight_norm_fwd_first_dim_kernel' not implemented for 'Float8_e4m3fn'"
+    )
+    def test_layerwise_casting_training(self):
+        return super().test_layerwise_casting_training()  # not executed: the test is skipped unconditionally
+
     @unittest.skip(
         "The convolution layers of AutoencoderOobleck are wrapped with torch.nn.utils.weight_norm. This causes the hook's pre_forward to not "
         "cast the module weights to compute_dtype (as required by forward pass). As a result, forward pass errors out. To fix:\n"
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
@@ -1338,6 +1338,36 @@ def test_variant_sharded_ckpt_right_format(self):
                 # Example: diffusion_pytorch_model.fp16-00001-of-00002.safetensors
                 assert all(f.split(".")[1].split("-")[0] == variant for f in shard_files)
 
+    def test_layerwise_casting_training(self):
+        # Smoke test: one forward/backward pass in train mode with layerwise casting enabled.
+        def test_fn(storage_dtype, compute_dtype):
+            device_type = torch.device(torch_device).type
+            if device_type == "cpu" and compute_dtype == torch.bfloat16:
+                return  # this combination is not exercised on CPU
+
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict).to(torch_device, dtype=compute_dtype)
+            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
+            model.train()
+
+            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+            with torch.amp.autocast(device_type=device_type):
+                output = model(**inputs_dict)
+                if isinstance(output, dict):
+                    output = output.to_tuple()[0]
+
+                batch_size = inputs_dict[self.main_input_name].shape[0]
+                noise = torch.randn((batch_size,) + self.output_shape).to(torch_device)
+                noise = cast_maybe_tensor_dtype(noise, torch.float32, compute_dtype)
+                loss = torch.nn.functional.mse_loss(output, noise)
+
+            loss.backward()
+
+        test_fn(torch.float16, torch.float32)
+        test_fn(torch.float8_e4m3fn, torch.float32)
+        test_fn(torch.float8_e5m2, torch.float32)
+        test_fn(torch.float8_e4m3fn, torch.bfloat16)
+
     def test_layerwise_casting_inference(self):
         from diffusers.hooks.layerwise_casting import DEFAULT_SKIP_MODULES_PATTERN, SUPPORTED_PYTORCH_LAYERS
 
diff --git a/tests/models/unets/test_models_unet_1d.py b/tests/models/unets/test_models_unet_1d.py
@@ -60,6 +60,10 @@ def test_ema_training(self):
     def test_training(self):
         pass
 
+    @unittest.skip("Test not supported.")
+    def test_layerwise_casting_training(self):
+        pass  # never executed: the decorator above skips this test unconditionally
+
     def test_determinism(self):
         super().test_determinism()
 
@@ -239,6 +243,10 @@ def test_ema_training(self):
     def test_training(self):
         pass
 
+    @unittest.skip("Test not supported.")
+    def test_layerwise_casting_training(self):
+        pass  # never executed: the decorator above skips this test unconditionally
+
     def prepare_init_args_and_inputs_for_common(self):
         init_dict = {
             "in_channels": 14,