
Commit f1be3eb

Merge branch 'chroma-fork' into chroma-final

2 parents: 6735507 + de9a07f

15 files changed (+901 −236 lines)

docs/source/en/_toctree.yml
Lines changed: 4 additions & 0 deletions

@@ -283,6 +283,8 @@
       title: AllegroTransformer3DModel
     - local: api/models/aura_flow_transformer2d
       title: AuraFlowTransformer2DModel
+    - local: api/models/chroma_transformer
+      title: ChromaTransformer2DModel
     - local: api/models/cogvideox_transformer3d
       title: CogVideoXTransformer3DModel
     - local: api/models/cogview3plus_transformer2d
@@ -405,6 +407,8 @@
       title: AutoPipeline
     - local: api/pipelines/blip_diffusion
       title: BLIP-Diffusion
+    - local: api/pipelines/chroma
+      title: Chroma
     - local: api/pipelines/cogvideox
       title: CogVideoX
     - local: api/pipelines/cogview3

examples/community/ip_adapter_face_id.py
Lines changed: 1 addition & 4 deletions

@@ -282,10 +282,7 @@ def load_ip_adapter_face_id(self, pretrained_model_name_or_path_or_dict, weight_
         revision = kwargs.pop("revision", None)
         subfolder = kwargs.pop("subfolder", None)
 
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
         model_file = _get_model_file(
             pretrained_model_name_or_path_or_dict,
             weights_name=weight_name,

src/diffusers/loaders/ip_adapter.py
Lines changed: 3 additions & 12 deletions

@@ -159,10 +159,7 @@ def load_ip_adapter(
                 " `low_cpu_mem_usage=False`."
             )
 
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
         state_dicts = []
         for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
             pretrained_model_name_or_path_or_dict, weight_name, subfolder
@@ -465,10 +462,7 @@ def load_ip_adapter(
                 " `low_cpu_mem_usage=False`."
             )
 
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
         state_dicts = []
         for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
             pretrained_model_name_or_path_or_dict, weight_name, subfolder
@@ -750,10 +744,7 @@ def load_ip_adapter(
                 " `low_cpu_mem_usage=False`."
             )
 
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
 
         if not isinstance(pretrained_model_name_or_path_or_dict, dict):
             model_file = _get_model_file(

src/diffusers/loaders/lora_base.py
Lines changed: 46 additions & 6 deletions

@@ -14,6 +14,7 @@
 
 import copy
 import inspect
+import json
 import os
 from pathlib import Path
 from typing import Callable, Dict, List, Optional, Union
@@ -45,6 +46,7 @@
     set_adapter_layers,
     set_weights_and_activate_adapters,
 )
+from ..utils.state_dict_utils import _load_sft_state_dict_metadata
 
 
 if is_transformers_available():
@@ -62,6 +64,7 @@
 
 LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
 LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
+LORA_ADAPTER_METADATA_KEY = "lora_adapter_metadata"
 
 
 def fuse_text_encoder_lora(text_encoder, lora_scale=1.0, safe_fusing=False, adapter_names=None):
@@ -206,6 +209,7 @@ def _fetch_state_dict(
     subfolder,
     user_agent,
     allow_pickle,
+    metadata=None,
 ):
     model_file = None
     if not isinstance(pretrained_model_name_or_path_or_dict, dict):
@@ -236,11 +240,14 @@ def _fetch_state_dict(
                     user_agent=user_agent,
                 )
                 state_dict = safetensors.torch.load_file(model_file, device="cpu")
+                metadata = _load_sft_state_dict_metadata(model_file)
+
             except (IOError, safetensors.SafetensorError) as e:
                 if not allow_pickle:
                     raise e
                 # try loading non-safetensors weights
                 model_file = None
+                metadata = None
                 pass
 
         if model_file is None:
@@ -261,10 +268,11 @@ def _fetch_state_dict(
                 user_agent=user_agent,
             )
             state_dict = load_state_dict(model_file)
+            metadata = None
     else:
         state_dict = pretrained_model_name_or_path_or_dict
 
-    return state_dict
+    return state_dict, metadata
 
 
 def _best_guess_weight_name(
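`_load_sft_state_dict_metadata` is the new diffusers helper that pulls LoRA metadata out of a safetensors header; a minimal sketch of the idea, assuming the metadata sits under the `lora_adapter_metadata` key as a JSON string (an illustrative approximation, not the library's exact implementation):

import json
from typing import Optional

import safetensors

def load_lora_metadata(model_file: str) -> Optional[dict]:
    # safetensors exposes the free-form str->str header via .metadata()
    with safetensors.safe_open(model_file, framework="pt", device="cpu") as f:
        raw = (f.metadata() or {}).get("lora_adapter_metadata")
    # The value is JSON-encoded at save time, so decode it on the way out
    return json.loads(raw) if raw is not None else None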
@@ -306,6 +314,11 @@ def _best_guess_weight_name(
     return weight_name
 
 
+def _pack_dict_with_prefix(state_dict, prefix):
+    sd_with_prefix = {f"{prefix}.{key}": value for key, value in state_dict.items()}
+    return sd_with_prefix
+
+
 def _load_lora_into_text_encoder(
     state_dict,
     network_alphas,
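`_pack_dict_with_prefix` simply namespaces every key; a quick illustration with toy tensors and hypothetical key names:

import torch

sd = {"lora_A.weight": torch.zeros(4, 8), "lora_B.weight": torch.zeros(8, 4)}
packed = {f"text_encoder.{k}": v for k, v in sd.items()}  # what _pack_dict_with_prefix does
assert set(packed) == {"text_encoder.lora_A.weight", "text_encoder.lora_B.weight"}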
@@ -317,10 +330,14 @@ def _load_lora_into_text_encoder(
     _pipeline=None,
     low_cpu_mem_usage=False,
     hotswap: bool = False,
+    metadata=None,
 ):
     if not USE_PEFT_BACKEND:
         raise ValueError("PEFT backend is required for this method.")
 
+    if network_alphas and metadata:
+        raise ValueError("`network_alphas` and `metadata` cannot be specified both at the same time.")
+
     peft_kwargs = {}
     if low_cpu_mem_usage:
         if not is_peft_version(">=", "0.13.1"):
@@ -349,6 +366,8 @@ def _load_lora_into_text_encoder(
     # Load the layers corresponding to text encoder and make necessary adjustments.
     if prefix is not None:
         state_dict = {k.removeprefix(f"{prefix}."): v for k, v in state_dict.items() if k.startswith(f"{prefix}.")}
+        if metadata is not None:
+            metadata = {k.removeprefix(f"{prefix}."): v for k, v in metadata.items() if k.startswith(f"{prefix}.")}
 
     if len(state_dict) > 0:
         logger.info(f"Loading {prefix}.")
@@ -376,7 +395,10 @@ def _load_lora_into_text_encoder(
             alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
            network_alphas = {k.removeprefix(f"{prefix}."): v for k, v in network_alphas.items() if k in alpha_keys}
 
-        lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=False)
+        if metadata is not None:
+            lora_config_kwargs = metadata
+        else:
+            lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=False)
 
         if "use_dora" in lora_config_kwargs:
             if lora_config_kwargs["use_dora"]:
@@ -398,7 +420,10 @@ def _load_lora_into_text_encoder(
             if is_peft_version("<=", "0.13.2"):
                 lora_config_kwargs.pop("lora_bias")
 
-        lora_config = LoraConfig(**lora_config_kwargs)
+        try:
+            lora_config = LoraConfig(**lora_config_kwargs)
+        except TypeError as e:
+            raise TypeError("`LoraConfig` class could not be instantiated.") from e
 
         # adapter_name
         if adapter_name is None:
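When metadata was embedded at save time, it is used verbatim as the `LoraConfig` kwargs, so a stale or hand-edited header surfaces as the `TypeError` raised above; a minimal sketch, assuming the metadata carries standard PEFT fields:

from peft import LoraConfig

# Hypothetical metadata as it might come back out of a safetensors header:
lora_config_kwargs = {"r": 16, "lora_alpha": 16, "target_modules": ["to_q", "to_k", "to_v"]}

try:
    lora_config = LoraConfig(**lora_config_kwargs)  # unknown keys raise TypeError
except TypeError as e:
    raise TypeError("`LoraConfig` class could not be instantiated.") from e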
@@ -889,8 +914,7 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device,
     @staticmethod
     def pack_weights(layers, prefix):
         layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
-        layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
-        return layers_state_dict
+        return _pack_dict_with_prefix(layers_weights, prefix)
 
     @staticmethod
     def write_lora_layers(
@@ -900,16 +924,32 @@ def write_lora_layers(
         weight_name: str,
         save_function: Callable,
         safe_serialization: bool,
+        lora_adapter_metadata: Optional[dict] = None,
     ):
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
 
+        if lora_adapter_metadata and not safe_serialization:
+            raise ValueError("`lora_adapter_metadata` cannot be specified when not using `safe_serialization`.")
+        if lora_adapter_metadata and not isinstance(lora_adapter_metadata, dict):
+            raise TypeError("`lora_adapter_metadata` must be of type `dict`.")
+
         if save_function is None:
             if safe_serialization:
 
                 def save_function(weights, filename):
-                    return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"})
+                    # Inject framework format.
+                    metadata = {"format": "pt"}
+                    if lora_adapter_metadata:
+                        for key, value in lora_adapter_metadata.items():
+                            if isinstance(value, set):
+                                lora_adapter_metadata[key] = list(value)
+                        metadata[LORA_ADAPTER_METADATA_KEY] = json.dumps(
+                            lora_adapter_metadata, indent=2, sort_keys=True
+                        )
+
+                    return safetensors.torch.save_file(weights, filename, metadata=metadata)
 
             else:
                 save_function = torch.save
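Taken together, `write_lora_layers` and `_fetch_state_dict` give adapter metadata a round trip through the safetensors header; a minimal self-contained sketch of that round trip (toy weights; the file name and config values are made up for illustration):

import json

import safetensors
import safetensors.torch
import torch

weights = {"unet.lora_A.weight": torch.zeros(4, 8)}
adapter_metadata = {"r": 4, "lora_alpha": 4, "target_modules": ["to_q"]}

# Save: JSON-encode the adapter config into the str->str header, next to
# the usual {"format": "pt"} marker.
safetensors.torch.save_file(
    weights,
    "toy_lora.safetensors",
    metadata={"format": "pt", "lora_adapter_metadata": json.dumps(adapter_metadata)},
)

# Load: tensors and header come back separately.
state_dict = safetensors.torch.load_file("toy_lora.safetensors")
with safetensors.safe_open("toy_lora.safetensors", framework="pt") as f:
    recovered = json.loads(f.metadata()["lora_adapter_metadata"])
assert recovered == adapter_metadata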

src/diffusers/loaders/lora_conversion_utils.py
Lines changed: 45 additions & 14 deletions

@@ -1605,9 +1605,18 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
     if diff_keys:
         for diff_k in diff_keys:
             param = original_state_dict[diff_k]
+            # The magnitudes of the .diff-ending weights are very low (most are below 1e-4, some are up to 1e-3,
+            # and 2 of them are about 1.6e-2 [the case with the AccVideo LoRA]). The low magnitudes mostly correspond
+            # to norm layers. Ignoring them is the best option at the moment until a better solution is found. It
+            # is okay to ignore because they do not affect the model output in a significant manner.
+            threshold = 1.6e-2
+            absdiff = param.abs().max() - param.abs().min()
             all_zero = torch.all(param == 0).item()
-            if all_zero:
-                logger.debug(f"Removed {diff_k} key from the state dict as it's all zeros.")
+            all_absdiff_lower_than_threshold = absdiff < threshold
+            if all_zero or all_absdiff_lower_than_threshold:
+                logger.debug(
+                    f"Removed {diff_k} key from the state dict as it's all zeros, or values lower than hardcoded threshold."
+                )
                 original_state_dict.pop(diff_k)
 
     # For the `diff_b` keys, we treat them as lora_bias.
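The new pruning predicate keeps a `.diff` tensor only when its absolute values spread wider than the hardcoded 1.6e-2 threshold; a toy check of exactly that predicate (synthetic tensors, not real Wan weights):

import torch

threshold = 1.6e-2

norm_diff = torch.full((4,), 1e-4)     # tiny norm-layer delta -> dropped
real_diff = torch.tensor([0.0, 0.05])  # spread above threshold -> kept

for name, param in [("norm_diff", norm_diff), ("real_diff", real_diff)]:
    absdiff = param.abs().max() - param.abs().min()
    dropped = torch.all(param == 0).item() or (absdiff < threshold).item()
    print(name, "dropped" if dropped else "kept")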
@@ -1655,12 +1664,16 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
 
         # FFN
         for o, c in zip(["ffn.0", "ffn.2"], ["net.0.proj", "net.2"]):
-            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_A.weight"] = original_state_dict.pop(
-                f"blocks.{i}.{o}.{lora_down_key}.weight"
-            )
-            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_B.weight"] = original_state_dict.pop(
-                f"blocks.{i}.{o}.{lora_up_key}.weight"
-            )
+            original_key = f"blocks.{i}.{o}.{lora_down_key}.weight"
+            converted_key = f"blocks.{i}.ffn.{c}.lora_A.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+            original_key = f"blocks.{i}.{o}.{lora_up_key}.weight"
+            converted_key = f"blocks.{i}.ffn.{c}.lora_B.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
             if f"blocks.{i}.{o}.diff_b" in original_state_dict:
                 converted_state_dict[f"blocks.{i}.ffn.{c}.lora_B.bias"] = original_state_dict.pop(
                     f"blocks.{i}.{o}.diff_b"
@@ -1669,12 +1682,16 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
     # Remaining.
     if original_state_dict:
         if any("time_projection" in k for k in original_state_dict):
-            converted_state_dict["condition_embedder.time_proj.lora_A.weight"] = original_state_dict.pop(
-                f"time_projection.1.{lora_down_key}.weight"
-            )
-            converted_state_dict["condition_embedder.time_proj.lora_B.weight"] = original_state_dict.pop(
-                f"time_projection.1.{lora_up_key}.weight"
-            )
+            original_key = f"time_projection.1.{lora_down_key}.weight"
+            converted_key = "condition_embedder.time_proj.lora_A.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+            original_key = f"time_projection.1.{lora_up_key}.weight"
+            converted_key = "condition_embedder.time_proj.lora_B.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
         if "time_projection.1.diff_b" in original_state_dict:
             converted_state_dict["condition_embedder.time_proj.lora_B.bias"] = original_state_dict.pop(
                 "time_projection.1.diff_b"
@@ -1709,6 +1726,20 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
                     original_state_dict.pop(f"{text_time}.{b_n}.diff_b")
                 )
 
+        for img_ours, img_theirs in [
+            ("ff.net.0.proj", "img_emb.proj.1"),
+            ("ff.net.2", "img_emb.proj.3"),
+        ]:
+            original_key = f"{img_theirs}.{lora_down_key}.weight"
+            converted_key = f"condition_embedder.image_embedder.{img_ours}.lora_A.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
+            original_key = f"{img_theirs}.{lora_up_key}.weight"
+            converted_key = f"condition_embedder.image_embedder.{img_ours}.lora_B.weight"
+            if original_key in original_state_dict:
+                converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+
     if len(original_state_dict) > 0:
         diff = all(".diff" in k for k in original_state_dict)
         if diff:
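Every conversion in this file now follows the same guarded pop-and-rename pattern, so partial checkpoints (e.g. a Wan LoRA trained without the image embedder) convert without a KeyError from a bare pop(); a small distillation of the pattern, with a hypothetical helper name:

def _maybe_rename(src: dict, dst: dict, original_key: str, converted_key: str) -> None:
    # Move the tensor only if the checkpoint actually contains it.
    if original_key in src:
        dst[converted_key] = src.pop(original_key)

converted, original = {}, {"time_projection.1.lora_down.weight": "w"}
_maybe_rename(original, converted, "time_projection.1.lora_down.weight",
              "condition_embedder.time_proj.lora_A.weight")
_maybe_rename(original, converted, "time_projection.1.lora_up.weight",
              "condition_embedder.time_proj.lora_B.weight")  # absent -> skipped
assert "condition_embedder.time_proj.lora_A.weight" in converted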
