
Commit 39d4e0c

Merge branch 'main' into post-release-0.33.0
2 parents: d81efdc + ea5a6a8

File tree: 12 files changed (+326, -285 lines)


scripts/convert_vae_pt_to_diffusers.py

Lines changed: 18 additions & 2 deletions
@@ -53,7 +53,12 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
     }
 
     for i in range(num_down_blocks):
-        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+        resnets = [
+            key
+            for key in down_blocks[i]
+            if f"down.{i}" in key and f"down.{i}.downsample" not in key and "attn" not in key
+        ]
+        attentions = [key for key in down_blocks[i] if f"down.{i}.attn" in key]
 
         if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
             new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
@@ -67,6 +72,10 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
         meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
         assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
 
+        paths = renew_vae_attention_paths(attentions)
+        meta_path = {"old": f"down.{i}.attn", "new": f"down_blocks.{i}.attentions"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
     mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
     num_mid_res_blocks = 2
     for i in range(1, num_mid_res_blocks + 1):
@@ -85,8 +94,11 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
     for i in range(num_up_blocks):
         block_id = num_up_blocks - 1 - i
         resnets = [
-            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+            key
+            for key in up_blocks[block_id]
+            if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key and "attn" not in key
         ]
+        attentions = [key for key in up_blocks[block_id] if f"up.{block_id}.attn" in key]
 
         if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
             new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
@@ -100,6 +112,10 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
         meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
         assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
 
+        paths = renew_vae_attention_paths(attentions)
+        meta_path = {"old": f"up.{block_id}.attn", "new": f"up_blocks.{i}.attentions"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
     mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
     num_mid_res_blocks = 2
     for i in range(1, num_mid_res_blocks + 1):
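
Why the added "attn" filter matters: LDM-style VAE checkpoints that use attention inside their encoder/decoder levels store those weights under down.{i}.attn.* and up.{i}.attn.*, and the old comprehension would have swept them into the resnet group. A minimal, self-contained sketch of the split; the key names are hypothetical examples in the LDM layout, not taken from a real checkpoint:

# Hypothetical LDM-style VAE keys for encoder level 2.
keys = [
    "encoder.down.2.block.0.conv1.weight",    # resnet conv
    "encoder.down.2.block.0.norm1.weight",    # resnet norm
    "encoder.down.2.attn.0.k.weight",         # attention projection
    "encoder.down.2.downsample.conv.weight",  # downsampler
]

i = 2
# Old filter: the attention key would land in `resnets` by mistake.
resnets_old = [k for k in keys if f"down.{i}" in k and f"down.{i}.downsample" not in k]
# New filters, mirroring the diff: resnets and attentions are kept apart.
resnets = [k for k in keys if f"down.{i}" in k and f"down.{i}.downsample" not in k and "attn" not in k]
attentions = [k for k in keys if f"down.{i}.attn" in k]

print(resnets_old)  # three keys, including ...attn.0.k.weight
print(resnets)      # the two resnet keys only
print(attentions)   # ['encoder.down.2.attn.0.k.weight']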

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 61 additions & 0 deletions
@@ -1608,3 +1608,64 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
         converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
 
     return converted_state_dict
+
+
+def _convert_musubi_wan_lora_to_diffusers(state_dict):
+    # https://github.com/kohya-ss/musubi-tuner
+    converted_state_dict = {}
+    original_state_dict = {k[len("lora_unet_") :]: v for k, v in state_dict.items()}
+
+    num_blocks = len({k.split("blocks_")[1].split("_")[0] for k in original_state_dict})
+    is_i2v_lora = any("k_img" in k for k in original_state_dict) and any("v_img" in k for k in original_state_dict)
+
+    def get_alpha_scales(down_weight, key):
+        rank = down_weight.shape[0]
+        alpha = original_state_dict.pop(key + ".alpha").item()
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+        return scale_down, scale_up
+
+    for i in range(num_blocks):
+        # Self-attention
+        for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
+            down_weight = original_state_dict.pop(f"blocks_{i}_self_attn_{o}.lora_down.weight")
+            up_weight = original_state_dict.pop(f"blocks_{i}_self_attn_{o}.lora_up.weight")
+            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_self_attn_{o}")
+            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_A.weight"] = down_weight * scale_down
+            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_B.weight"] = up_weight * scale_up
+
+        # Cross-attention
+        for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
+            down_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_down.weight")
+            up_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_up.weight")
+            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_cross_attn_{o}")
+            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = down_weight * scale_down
+            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = up_weight * scale_up
+
+        if is_i2v_lora:
+            for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
+                down_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_down.weight")
+                up_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_up.weight")
+                scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_cross_attn_{o}")
+                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = down_weight * scale_down
+                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = up_weight * scale_up
+
+        # FFN
+        for o, c in zip(["ffn_0", "ffn_2"], ["net.0.proj", "net.2"]):
+            down_weight = original_state_dict.pop(f"blocks_{i}_{o}.lora_down.weight")
+            up_weight = original_state_dict.pop(f"blocks_{i}_{o}.lora_up.weight")
+            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_{o}")
+            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_A.weight"] = down_weight * scale_down
+            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_B.weight"] = up_weight * scale_up
+
+    if len(original_state_dict) > 0:
+        raise ValueError(f"`state_dict` should be empty at this point but has {original_state_dict.keys()=}")
+
+    for key in list(converted_state_dict.keys()):
+        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
+
+    return converted_state_dict
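
A quick check of get_alpha_scales: musubi-tuner, like other kohya-style trainers, stores a per-module alpha, and the effective LoRA delta is scaled by alpha / rank at inference. The helper folds that scale into the saved matrices, splitting it between lora_A and lora_B by powers of two so the two factors stay close in magnitude while their product remains alpha / rank. A standalone sketch with made-up numbers, not part of the diff:

import torch

def alpha_scales(down_weight, alpha):
    # Same arithmetic as get_alpha_scales above, with alpha passed in directly.
    rank = down_weight.shape[0]
    scale = alpha / rank
    scale_down, scale_up = scale, 1.0
    while scale_down * 2 < scale_up:
        scale_down *= 2
        scale_up /= 2
    return scale_down, scale_up

down = torch.zeros(16, 320)        # rank-16 LoRA down projection
print(alpha_scales(down, 8.0))     # (0.5, 1.0)  -> product 0.5   == 8 / 16
print(alpha_scales(down, 2.0))     # (0.25, 0.5) -> product 0.125 == 2 / 16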

src/diffusers/loaders/lora_pipeline.py

Lines changed: 3 additions & 0 deletions
@@ -42,6 +42,7 @@
     _convert_bfl_flux_control_lora_to_diffusers,
     _convert_hunyuan_video_lora_to_diffusers,
     _convert_kohya_flux_lora_to_diffusers,
+    _convert_musubi_wan_lora_to_diffusers,
     _convert_non_diffusers_lora_to_diffusers,
     _convert_non_diffusers_lumina2_lora_to_diffusers,
     _convert_non_diffusers_wan_lora_to_diffusers,
@@ -4794,6 +4795,8 @@ def lora_state_dict(
             )
         if any(k.startswith("diffusion_model.") for k in state_dict):
             state_dict = _convert_non_diffusers_wan_lora_to_diffusers(state_dict)
+        elif any(k.startswith("lora_unet_") for k in state_dict):
+            state_dict = _convert_musubi_wan_lora_to_diffusers(state_dict)
 
         is_dora_scale_present = any("dora_scale" in k for k in state_dict)
         if is_dora_scale_present:
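
With the new elif branch, Wan LoRAs exported by musubi-tuner (state-dict keys prefixed with "lora_unet_") should load through the same entry point as other non-diffusers Wan LoRAs. A hedged usage sketch: the checkpoint path is a placeholder and the base-model repo id is an assumption, not something this diff specifies.

import torch
from diffusers import WanPipeline

# Repo id assumed for illustration; any Wan pipeline checkpoint would do.
pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16)

# A musubi-tuner file stores keys like "lora_unet_blocks_0_self_attn_q.lora_down.weight";
# that prefix routes the state dict through _convert_musubi_wan_lora_to_diffusers.
pipe.load_lora_weights("path/to/musubi_wan_lora.safetensors")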

src/diffusers/loaders/single_file_utils.py

Lines changed: 33 additions & 2 deletions
@@ -177,6 +177,7 @@
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
     "ltx-video": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.0"},
     "ltx-video-0.9.1": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.1"},
+    "ltx-video-0.9.5": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.5"},
     "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
     "autoencoder-dc-f64c128": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"},
     "autoencoder-dc-f32c32": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers"},
@@ -638,7 +639,9 @@ def infer_diffusers_model_type(checkpoint):
         model_type = "flux-schnell"
 
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["ltx-video"]):
-        if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
+        if checkpoint["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
+            model_type = "ltx-video-0.9.5"
+        elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
             model_type = "ltx-video-0.9.1"
         else:
             model_type = "ltx-video"
@@ -2403,13 +2406,41 @@ def remove_keys_(key: str, state_dict):
         "last_scale_shift_table": "scale_shift_table",
     }
 
+    VAE_095_RENAME_DICT = {
+        # decoder
+        "up_blocks.0": "mid_block",
+        "up_blocks.1": "up_blocks.0.upsamplers.0",
+        "up_blocks.2": "up_blocks.0",
+        "up_blocks.3": "up_blocks.1.upsamplers.0",
+        "up_blocks.4": "up_blocks.1",
+        "up_blocks.5": "up_blocks.2.upsamplers.0",
+        "up_blocks.6": "up_blocks.2",
+        "up_blocks.7": "up_blocks.3.upsamplers.0",
+        "up_blocks.8": "up_blocks.3",
+        # encoder
+        "down_blocks.0": "down_blocks.0",
+        "down_blocks.1": "down_blocks.0.downsamplers.0",
+        "down_blocks.2": "down_blocks.1",
+        "down_blocks.3": "down_blocks.1.downsamplers.0",
+        "down_blocks.4": "down_blocks.2",
+        "down_blocks.5": "down_blocks.2.downsamplers.0",
+        "down_blocks.6": "down_blocks.3",
+        "down_blocks.7": "down_blocks.3.downsamplers.0",
+        "down_blocks.8": "mid_block",
+        # common
+        "last_time_embedder": "time_embedder",
+        "last_scale_shift_table": "scale_shift_table",
+    }
+
     VAE_SPECIAL_KEYS_REMAP = {
         "per_channel_statistics.channel": remove_keys_,
         "per_channel_statistics.mean-of-means": remove_keys_,
        "per_channel_statistics.mean-of-stds": remove_keys_,
     }
 
-    if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
+    if converted_state_dict["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
+        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
+    elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
         VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
 
     for key in list(converted_state_dict.keys()):
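
The 0.9.5 detection keys off a tensor shape rather than key presence: per the diff, the LTX-Video 0.9.5 VAE encoder's final conv takes 2048 input channels, which earlier releases do not. A minimal sketch of the same check in isolation (the helper name is made up):

def looks_like_ltx_video_095(checkpoint) -> bool:
    # Key name taken from the diff; shape[1] is the conv's input-channel count.
    weight = checkpoint.get("vae.encoder.conv_out.conv.weight")
    return weight is not None and weight.shape[1] == 2048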

src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py

Lines changed: 8 additions & 2 deletions
@@ -350,8 +350,14 @@ def create_vae_diffusers_config(original_config, image_size: int):
     _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]
 
     block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
-    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
-    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+    down_block_types = [
+        "DownEncoderBlock2D" if image_size // 2**i not in vae_params["attn_resolutions"] else "AttnDownEncoderBlock2D"
+        for i, _ in enumerate(block_out_channels)
+    ]
+    up_block_types = [
+        "UpDecoderBlock2D" if image_size // 2**i not in vae_params["attn_resolutions"] else "AttnUpDecoderBlock2D"
+        for i, _ in enumerate(block_out_channels)
+    ][::-1]
 
     config = {
         "sample_size": image_size,

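A worked example of the new block-type selection: encoder level i operates at resolution image_size // 2**i, and a level whose resolution appears in attn_resolutions now maps to the attention variant of the block. The values below are hypothetical, not from a real config:

image_size = 256
attn_resolutions = [32]                    # stand-in for vae_params["attn_resolutions"]
block_out_channels = [128, 256, 512, 512]  # stand-in for vae_params["ch"] * ch_mult

down_block_types = [
    "DownEncoderBlock2D" if image_size // 2**i not in attn_resolutions else "AttnDownEncoderBlock2D"
    for i, _ in enumerate(block_out_channels)
]
# Levels run at 256, 128, 64, 32; only the 32-pixel level gets attention:
# ['DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D', 'AttnDownEncoderBlock2D']
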
src/diffusers/utils/import_utils.py

Lines changed: 6 additions & 4 deletions
@@ -101,18 +101,20 @@ def _is_package_available(pkg_name: str):
 if _onnx_available:
     candidates = (
         "onnxruntime",
+        "onnxruntime-cann",
+        "onnxruntime-directml",
+        "ort_nightly_directml",
         "onnxruntime-gpu",
         "ort_nightly_gpu",
-        "onnxruntime-directml",
+        "onnxruntime-migraphx",
         "onnxruntime-openvino",
-        "ort_nightly_directml",
+        "onnxruntime-qnn",
         "onnxruntime-rocm",
-        "onnxruntime-migraphx",
         "onnxruntime-training",
         "onnxruntime-vitisai",
     )
     _onnxruntime_version = None
-    # For the metadata, we have to look for both onnxruntime and onnxruntime-gpu
+    # For the metadata, we have to look for both onnxruntime and onnxruntime-x
     for pkg in candidates:
         try:
             _onnxruntime_version = importlib_metadata.version(pkg)
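
All of these candidates are alternative distribution names for the same onnxruntime import, so the installed version has to be probed against whichever distribution is actually present. A minimal sketch of the lookup the loop performs:

import importlib.metadata as importlib_metadata

def installed_onnxruntime_version(candidates):
    for pkg in candidates:
        try:
            return importlib_metadata.version(pkg)  # first installed flavor wins
        except importlib_metadata.PackageNotFoundError:
            continue
    return None

print(installed_onnxruntime_version(("onnxruntime", "onnxruntime-gpu", "onnxruntime-qnn")))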

tests/lora/test_lora_layers_sd.py

Lines changed: 48 additions & 4 deletions
@@ -33,6 +33,7 @@
 )
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    Expectations,
     backend_empty_cache,
     load_image,
     nightly,
@@ -455,11 +456,54 @@ def test_vanilla_funetuning(self):
 
         images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=2).images
 
-        images = images[0, -3:, -3:, -1].flatten()
-
-        expected = np.array([0.7406, 0.699, 0.5963, 0.7493, 0.7045, 0.6096, 0.6886, 0.6388, 0.583])
+        image_slice = images[0, -3:, -3:, -1].flatten()
+
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): np.array(
+                    [0.6544, 0.6127, 0.5397, 0.6845, 0.6047, 0.5469, 0.6349, 0.5906, 0.5382]
+                ),
+                ("cuda", 7): np.array(
+                    [0.7406, 0.699, 0.5963, 0.7493, 0.7045, 0.6096, 0.6886, 0.6388, 0.583]
+                ),
+                ("cuda", 8): np.array(
+                    [0.6542, 0.61253, 0.5396, 0.6843, 0.6044, 0.5468, 0.6349, 0.5905, 0.5381]
+                ),
+            }
+        )
+        expected_slice = expected_slices.get_expectation()
 
-        max_diff = numpy_cosine_similarity_distance(expected, images)
+        max_diff = numpy_cosine_similarity_distance(expected_slice, image_slice)
         assert max_diff < 1e-4
 
         pipe.unload_lora_weights()
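
From this usage, Expectations maps a (device type, version) pair to a reference slice and get_expectation() returns the entry matching the backend the test runs on; the actual selection logic lives in diffusers.utils.testing_utils and is not shown in this diff. A simplified stand-in that only illustrates the keying scheme, not the real implementation:

import numpy as np

class FakeExpectations(dict):
    def get_expectation(self, device="cuda", version=8):
        # The real helper detects device and version itself; here they are arguments.
        return self[(device, version)]

slices = FakeExpectations({
    ("cuda", 7): np.array([0.7406, 0.699, 0.5963]),
    ("cuda", 8): np.array([0.6542, 0.61253, 0.5396]),
})
print(slices.get_expectation("cuda", 8))  # picks the ("cuda", 8) entry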
