From ec5449f3a1378df207df481bfa1ad7ff8057a58a Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 25 Sep 2025 18:28:54 +0200 Subject: [PATCH 01/69] Support both huggingface_hub `v0.x` and `v1.x` (#12389) * Support huggingface_hub 0.x and 1.x * httpx --- setup.py | 4 +++- src/diffusers/configuration_utils.py | 4 ++-- src/diffusers/dependency_versions_table.py | 3 ++- src/diffusers/models/modeling_flax_utils.py | 4 ++-- src/diffusers/pipelines/pipeline_loading_utils.py | 6 +++--- src/diffusers/pipelines/pipeline_utils.py | 6 +++--- src/diffusers/utils/hub_utils.py | 6 +++--- tests/models/test_modeling_common.py | 7 +++---- tests/pipelines/test_pipelines.py | 6 +++--- 9 files changed, 24 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index ba3ad8e2b307..372a5685957e 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,8 @@ "filelock", "flax>=0.4.1", "hf-doc-builder>=0.3.0", - "huggingface-hub>=0.34.0", + "httpx<1.0.0", + "huggingface-hub>=0.34.0,<2.0", "requests-mock==1.10.0", "importlib_metadata", "invisible-watermark>=0.2.0", @@ -259,6 +260,7 @@ def run(self): install_requires = [ deps["importlib_metadata"], deps["filelock"], + deps["httpx"], deps["huggingface-hub"], deps["numpy"], deps["regex"], diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 540aab03071d..1c4ee33acbfd 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -30,11 +30,11 @@ from huggingface_hub import DDUFEntry, create_repo, hf_hub_download from huggingface_hub.utils import ( EntryNotFoundError, + HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, validate_hf_hub_args, ) -from requests import HTTPError from typing_extensions import Self from . import __version__ @@ -419,7 +419,7 @@ def load_config( raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}." ) - except HTTPError as err: + except HfHubHTTPError as err: raise EnvironmentError( "There was a specific connection error when trying to load" f" {pretrained_model_name_or_path}:\n{err}" diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 79dc4c50a050..bfc4e9818ba3 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -9,7 +9,8 @@ "filelock": "filelock", "flax": "flax>=0.4.1", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=0.34.0", + "httpx": "httpx<1.0.0", + "huggingface-hub": "huggingface-hub>=0.34.0,<2.0", "requests-mock": "requests-mock==1.10.0", "importlib_metadata": "importlib_metadata", "invisible-watermark": "invisible-watermark>=0.2.0", diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 573828dc4b03..8050afff2767 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -26,11 +26,11 @@ from huggingface_hub import create_repo, hf_hub_download from huggingface_hub.utils import ( EntryNotFoundError, + HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, validate_hf_hub_args, ) -from requests import HTTPError from .. import __version__, is_torch_available from ..utils import ( @@ -385,7 +385,7 @@ def from_pretrained( raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}." 
) - except HTTPError as err: + except HfHubHTTPError as err: raise EnvironmentError( f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n" f"{err}" diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 388128df0ebd..b7a3e08105ff 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -19,12 +19,12 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union +import httpx import requests import torch from huggingface_hub import DDUFEntry, ModelCard, model_info, snapshot_download -from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args +from huggingface_hub.utils import HfHubHTTPError, OfflineModeIsEnabled, validate_hf_hub_args from packaging import version -from requests.exceptions import HTTPError from .. import __version__ from ..utils import ( @@ -1110,7 +1110,7 @@ def _download_dduf_file( if not local_files_only: try: info = model_info(pretrained_model_name, token=token, revision=revision) - except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e: + except (HfHubHTTPError, OfflineModeIsEnabled, requests.ConnectionError, httpx.NetworkError) as e: logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.") local_files_only = True model_info_call_error = e # save error to reraise it if model is not cached locally diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 01b3c56777c8..3f6e53099b38 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -23,6 +23,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin +import httpx import numpy as np import PIL.Image import requests @@ -36,9 +37,8 @@ read_dduf_file, snapshot_download, ) -from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args +from huggingface_hub.utils import HfHubHTTPError, OfflineModeIsEnabled, validate_hf_hub_args from packaging import version -from requests.exceptions import HTTPError from tqdm.auto import tqdm from typing_extensions import Self @@ -1616,7 +1616,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: if not local_files_only: try: info = model_info(pretrained_model_name, token=token, revision=revision) - except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e: + except (HfHubHTTPError, OfflineModeIsEnabled, requests.ConnectionError, httpx.NetworkError) as e: logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.") local_files_only = True model_info_call_error = e # save error to reraise it if model is not cached locally diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index fcdf49156a8f..b6e99452aa88 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -38,13 +38,13 @@ from huggingface_hub.file_download import REGEX_COMMIT_HASH from huggingface_hub.utils import ( EntryNotFoundError, + HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, is_jinja_available, validate_hf_hub_args, ) from packaging import version -from requests import HTTPError from .. 
import __version__ from .constants import ( @@ -316,7 +316,7 @@ def _get_model_file( raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}." ) from e - except HTTPError as e: + except HfHubHTTPError as e: raise EnvironmentError( f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{e}" ) from e @@ -432,7 +432,7 @@ def _get_checkpoint_shard_files( # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so # we don't have to catch them here. We have also dealt with EntryNotFoundError. - except HTTPError as e: + except HfHubHTTPError as e: raise EnvironmentError( f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load {pretrained_model_name_or_path}. You should try" " again after checking your internet connection." diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 5e7be62342c3..3a008edfe1c2 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -37,9 +37,8 @@ import torch.nn as nn from accelerate.utils.modeling import _get_proper_dtype, compute_module_sizes, dtype_byte_size from huggingface_hub import ModelCard, delete_repo, snapshot_download, try_to_load_from_cache -from huggingface_hub.utils import is_jinja_available +from huggingface_hub.utils import HfHubHTTPError, is_jinja_available from parameterized import parameterized -from requests.exceptions import HTTPError from diffusers.models import FluxTransformer2DModel, SD3Transformer2DModel, UNet2DConditionModel from diffusers.models.attention_processor import ( @@ -272,7 +271,7 @@ def test_cached_files_are_used_when_no_internet(self): response_mock = mock.Mock() response_mock.status_code = 500 response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError + response_mock.raise_for_status.side_effect = HfHubHTTPError("Server down", response=mock.Mock()) response_mock.json.return_value = {} # Download this model to make sure it's in the cache. @@ -296,7 +295,7 @@ def test_local_files_only_with_sharded_checkpoint(self): error_response = mock.Mock( status_code=500, headers={}, - raise_for_status=mock.Mock(side_effect=HTTPError), + raise_for_status=mock.Mock(side_effect=HfHubHTTPError("Server down", response=mock.Mock())), json=mock.Mock(return_value={}), ) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 09df140f1af8..3a6981361268 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -33,9 +33,9 @@ import torch import torch.nn as nn from huggingface_hub import snapshot_download +from huggingface_hub.utils import HfHubHTTPError from parameterized import parameterized from PIL import Image -from requests.exceptions import HTTPError from transformers import CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import ( @@ -430,7 +430,7 @@ def test_cached_files_are_used_when_no_internet(self): response_mock = mock.Mock() response_mock.status_code = 500 response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError + response_mock.raise_for_status.side_effect = HfHubHTTPError("Server down", response=mock.Mock()) response_mock.json.return_value = {} # Download this model to make sure it's in the cache. 
@@ -457,7 +457,7 @@ def test_local_files_only_are_used_when_no_internet(self): response_mock = mock.Mock() response_mock.status_code = 500 response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError + response_mock.raise_for_status.side_effect = HfHubHTTPError("Server down", response=mock.Mock()) response_mock.json.return_value = {} # first check that with local files only the pipeline can only be used if cached From 016316a57a8a0df9cacca66af2fa290862b8d5c4 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 10:20:19 +0200 Subject: [PATCH 02/69] mirage pipeline first commit --- src/diffusers/__init__.py | 1 + src/diffusers/models/__init__.py | 1 + src/diffusers/models/transformers/__init__.py | 1 + .../models/transformers/transformer_mirage.py | 489 ++++++++++++++ src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/mirage/__init__.py | 4 + .../pipelines/mirage/pipeline_mirage.py | 629 ++++++++++++++++++ .../pipelines/mirage/pipeline_output.py | 35 + .../test_models_transformer_mirage.py | 252 +++++++ 9 files changed, 1413 insertions(+) create mode 100644 src/diffusers/models/transformers/transformer_mirage.py create mode 100644 src/diffusers/pipelines/mirage/__init__.py create mode 100644 src/diffusers/pipelines/mirage/pipeline_mirage.py create mode 100644 src/diffusers/pipelines/mirage/pipeline_output.py create mode 100644 tests/models/transformers/test_models_transformer_mirage.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 8867250deda8..6fc6ac5f3ebd 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -224,6 +224,7 @@ "LTXVideoTransformer3DModel", "Lumina2Transformer2DModel", "LuminaNextDiT2DModel", + "MirageTransformer2DModel", "MochiTransformer3DModel", "ModelMixin", "MotionAdapter", diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 457f70448af3..279e69216b1b 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -93,6 +93,7 @@ _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"] _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"] _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"] + _import_structure["transformers.transformer_mirage"] = ["MirageTransformer2DModel"] _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"] _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"] _import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"] diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index b60f0636e6dc..ebe0d0c9b8e1 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -29,6 +29,7 @@ from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel from .transformer_ltx import LTXVideoTransformer3DModel from .transformer_lumina2 import Lumina2Transformer2DModel + from .transformer_mirage import MirageTransformer2DModel from .transformer_mochi import MochiTransformer3DModel from .transformer_omnigen import OmniGenTransformer2DModel from .transformer_qwenimage import QwenImageTransformer2DModel diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py new file mode 100644 index 
000000000000..39c569cbb26b --- /dev/null +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -0,0 +1,489 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, Optional, Union, Tuple +import torch +import math +from torch import Tensor, nn +from torch.nn.functional import fold, unfold +from einops import rearrange +from einops.layers.torch import Rearrange + +from ...configuration_utils import ConfigMixin, register_to_config +from ..modeling_utils import ModelMixin +from ..modeling_outputs import Transformer2DModelOutput +from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers + + +logger = logging.get_logger(__name__) + + +# Mirage Layer Components +def get_image_ids(bs: int, h: int, w: int, patch_size: int, device: torch.device) -> Tensor: + img_ids = torch.zeros(h // patch_size, w // patch_size, 2, device=device) + img_ids[..., 0] = torch.arange(h // patch_size, device=device)[:, None] + img_ids[..., 1] = torch.arange(w // patch_size, device=device)[None, :] + return img_ids.reshape((h // patch_size) * (w // patch_size), 2).unsqueeze(0).repeat(bs, 1, 1) + + +def apply_rope(xq: Tensor, freqs_cis: Tensor) -> Tensor: + xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) + xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] + return xq_out.reshape(*xq.shape).type_as(xq) + + +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + self.rope_rearrange = Rearrange("b n d (i j) -> b n d i j", i=2, j=2) + + def rope(self, pos: Tensor, dim: int, theta: int) -> Tensor: + assert dim % 2 == 0 + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + out = pos.unsqueeze(-1) * omega.unsqueeze(0) + out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1) + out = self.rope_rearrange(out) + return out.float() + + def forward(self, ids: Tensor) -> Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [self.rope(ids[:, :, i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-3, + ) + return emb.unsqueeze(1) + + +def timestep_embedding(t: Tensor, dim: int, max_period: int = 10000, time_factor: float = 1000.0) -> Tensor: + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = 
nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) + return (x * rrms * self.scale).to(dtype=x_dtype) + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class Modulation(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin = nn.Linear(dim, 6 * dim, bias=True) + nn.init.constant_(self.lin.weight, 0) + nn.init.constant_(self.lin.bias, 0) + + def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut]: + out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(6, dim=-1) + return ModulationOut(*out[:3]), ModulationOut(*out[3:]) + + +class MirageBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: float | None = None, + ): + super().__init__() + + self._fsdp_wrap = True + self._activation_checkpointing = True + + self.hidden_dim = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.hidden_size = hidden_size + + # img qkv + self.img_pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_qkv_proj = nn.Linear(hidden_size, hidden_size * 3, bias=False) + self.attn_out = nn.Linear(hidden_size, hidden_size, bias=False) + self.qk_norm = QKNorm(self.head_dim) + + # txt kv + self.txt_kv_proj = nn.Linear(hidden_size, hidden_size * 2, bias=False) + self.k_norm = RMSNorm(self.head_dim) + + + # mlp + self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.gate_proj = nn.Linear(hidden_size, self.mlp_hidden_dim, bias=False) + self.up_proj = nn.Linear(hidden_size, self.mlp_hidden_dim, bias=False) + self.down_proj = nn.Linear(self.mlp_hidden_dim, hidden_size, bias=False) + self.mlp_act = nn.GELU(approximate="tanh") + + self.modulation = Modulation(hidden_size) + self.spatial_cond_kv_proj: None | nn.Linear = None + + def attn_forward( + self, + img: Tensor, + txt: Tensor, + pe: Tensor, + modulation: ModulationOut, + spatial_conditioning: None | Tensor = None, + attention_mask: None | Tensor = None, + ) -> Tensor: + # image tokens proj and norm + img_mod = (1 + modulation.scale) * self.img_pre_norm(img) + modulation.shift + + img_qkv = self.img_qkv_proj(img_mod) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.qk_norm(img_q, img_k, img_v) + + # txt tokens proj and norm + txt_kv = self.txt_kv_proj(txt) + txt_k, txt_v = rearrange(txt_kv, "B L (K H D) -> K B H L D", K=2, H=self.num_heads) + txt_k = self.k_norm(txt_k) + + # compute attention + img_q, img_k = apply_rope(img_q, pe), apply_rope(img_k, pe) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + # optional spatial conditioning tokens + cond_len = 0 
+ if self.spatial_cond_kv_proj is not None: + assert spatial_conditioning is not None + cond_kv = self.spatial_cond_kv_proj(spatial_conditioning) + cond_k, cond_v = rearrange(cond_kv, "B L (K H D) -> K B H L D", K=2, H=self.num_heads) + cond_k = apply_rope(cond_k, pe) + cond_len = cond_k.shape[2] + k = torch.cat((cond_k, k), dim=2) + v = torch.cat((cond_v, v), dim=2) + + # build additive attention bias + attn_bias: Tensor | None = None + attn_mask: Tensor | None = None + + # build multiplicative 0/1 mask for provided attention_mask over [cond?, text, image] keys + if attention_mask is not None: + bs, _, l_img, _ = img_q.shape + l_txt = txt_k.shape[2] + l_all = k.shape[2] + + assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" + assert ( + attention_mask.shape[-1] == l_txt + ), f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + + device = img_q.device + + ones_img = torch.ones((bs, l_img), dtype=torch.bool, device=device) + cond_mask = torch.ones((bs, cond_len), dtype=torch.bool, device=device) + + mask_parts = [ + cond_mask, + attention_mask.to(torch.bool), + ones_img, + ] + joint_mask = torch.cat(mask_parts, dim=-1) # (B, L_all) + + # repeat across heads and query positions + attn_mask = joint_mask[:, None, None, :].expand(-1, self.num_heads, l_img, -1) # (B,H,L_img,L_all) + + attn = torch.nn.functional.scaled_dot_product_attention( + img_q.contiguous(), k.contiguous(), v.contiguous(), attn_mask=attn_mask + ) + attn = rearrange(attn, "B H L D -> B L (H D)") + attn = self.attn_out(attn) + + return attn + + def ffn_forward(self, x: Tensor, modulation: ModulationOut) -> Tensor: + x = (1 + modulation.scale) * self.post_attention_layernorm(x) + modulation.shift + return self.down_proj(self.mlp_act(self.gate_proj(x)) * self.up_proj(x)) + + def forward( + self, + img: Tensor, + txt: Tensor, + vec: Tensor, + pe: Tensor, + spatial_conditioning: Tensor | None = None, + attention_mask: Tensor | None = None, + **_: dict[str, Any], + ) -> Tensor: + mod_attn, mod_mlp = self.modulation(vec) + + img = img + mod_attn.gate * self.attn_forward( + img, + txt, + pe, + mod_attn, + spatial_conditioning=spatial_conditioning, + attention_mask=attention_mask, + ) + img = img + mod_mlp.gate * self.ffn_forward(img, mod_mlp) + return img + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + nn.init.constant_(self.adaLN_modulation[1].weight, 0) + nn.init.constant_(self.adaLN_modulation[1].bias, 0) + nn.init.constant_(self.linear.weight, 0) + nn.init.constant_(self.linear.bias, 0) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +@dataclass +class MirageParams: + in_channels: int + patch_size: int + context_in_dim: int + hidden_size: int + mlp_ratio: float + num_heads: int + depth: int + axes_dim: list[int] + theta: int + time_factor: float = 1000.0 + time_max_period: int = 10_000 + conditioning_block_ids: list[int] | None = None + + +def img2seq(img: Tensor, patch_size: int) -> Tensor: + """Flatten an image into a 
sequence of patches""" + return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2) + + +def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: + """Revert img2seq""" + if isinstance(shape, tuple): + shape = shape[-2:] + elif isinstance(shape, torch.Tensor): + shape = (int(shape[0]), int(shape[1])) + else: + raise NotImplementedError(f"shape type {type(shape)} not supported") + return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size) + + +class MirageTransformer2DModel(ModelMixin, ConfigMixin): + """Mirage Transformer model with IP-Adapter support.""" + + config_name = "config.json" + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 16, + patch_size: int = 2, + context_in_dim: int = 2304, + hidden_size: int = 1792, + mlp_ratio: float = 3.5, + num_heads: int = 28, + depth: int = 16, + axes_dim: list = None, + theta: int = 10000, + time_factor: float = 1000.0, + time_max_period: int = 10000, + conditioning_block_ids: list = None, + **kwargs + ): + super().__init__() + + if axes_dim is None: + axes_dim = [32, 32] + + # Create MirageParams from the provided arguments + params = MirageParams( + in_channels=in_channels, + patch_size=patch_size, + context_in_dim=context_in_dim, + hidden_size=hidden_size, + mlp_ratio=mlp_ratio, + num_heads=num_heads, + depth=depth, + axes_dim=axes_dim, + theta=theta, + time_factor=time_factor, + time_max_period=time_max_period, + conditioning_block_ids=conditioning_block_ids, + ) + + self.params = params + self.in_channels = params.in_channels + self.patch_size = params.patch_size + self.out_channels = self.in_channels * self.patch_size**2 + + self.time_factor = params.time_factor + self.time_max_period = params.time_max_period + + if params.hidden_size % params.num_heads != 0: + raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + + pe_dim = params.hidden_size // params.num_heads + + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = nn.Linear(self.in_channels * self.patch_size**2, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + + conditioning_block_ids: list[int] = params.conditioning_block_ids or list(range(params.depth)) + + self.blocks = nn.ModuleList( + [ + MirageBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + ) + for i in range(params.depth) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: + """Timestep independent stuff""" + txt = self.txt_in(txt) + img = img2seq(image_latent, self.patch_size) + bs, _, h, w = image_latent.shape + img_ids = get_image_ids(bs, h, w, patch_size=self.patch_size, device=image_latent.device) + pe = self.pe_embedder(img_ids) + return img, txt, pe + + def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: + return self.time_in( + timestep_embedding( + t=timestep, dim=256, max_period=self.time_max_period, time_factor=self.time_factor + ).to(dtype) + ) + + def forward_transformers( + self, + image_latent: 
Tensor, + cross_attn_conditioning: Tensor, + timestep: Optional[Tensor] = None, + time_embedding: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + **block_kwargs: Any, + ) -> Tensor: + img = self.img_in(image_latent) + + if time_embedding is not None: + vec = time_embedding + else: + if timestep is None: + raise ValueError("Please provide either a timestep or a timestep_embedding") + vec = self.compute_timestep_embedding(timestep, dtype=img.dtype) + + for block in self.blocks: + img = block( + img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs + ) + + img = self.final_layer(img, vec) + return img + + def forward( + self, + image_latent: Tensor, + timestep: Tensor, + cross_attn_conditioning: Tensor, + micro_conditioning: Tensor, + cross_attn_mask: None | Tensor = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: + if attention_kwargs is not None: + attention_kwargs = attention_kwargs.copy() + lora_scale = attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None: + logger.warning( + "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective." + ) + img_seq, txt, pe = self.process_inputs(image_latent, cross_attn_conditioning) + img_seq = self.forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask) + output = seq2img(img_seq, self.patch_size, image_latent.shape) + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 190c7871d270..7b7ebb633c3b 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -144,6 +144,7 @@ "FluxKontextPipeline", "FluxKontextInpaintPipeline", ] + _import_structure["mirage"] = ["MiragePipeline"] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ "AudioLDM2Pipeline", diff --git a/src/diffusers/pipelines/mirage/__init__.py b/src/diffusers/pipelines/mirage/__init__.py new file mode 100644 index 000000000000..4fd8ad191b3f --- /dev/null +++ b/src/diffusers/pipelines/mirage/__init__.py @@ -0,0 +1,4 @@ +from .pipeline_mirage import MiragePipeline +from .pipeline_output import MiragePipelineOutput + +__all__ = ["MiragePipeline", "MiragePipelineOutput"] \ No newline at end of file diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py new file mode 100644 index 000000000000..126eab07977c --- /dev/null +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -0,0 +1,629 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import html
+import re
+import urllib.parse as ul
+
+import ftfy
+import torch
+from transformers import (
+    AutoTokenizer,
+    GemmaTokenizerFast,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, AutoencoderDC
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import MiragePipelineOutput
+
+try:
+    from ...models.transformers.transformer_mirage import MirageTransformer2DModel
+except ImportError:
+    MirageTransformer2DModel = None
+
+logger = logging.get_logger(__name__)
+
+
+class TextPreprocessor:
+    """Text preprocessing utility for MiragePipeline."""
+
+    def __init__(self):
+        """Initialize text preprocessor."""
+        self.bad_punct_regex = re.compile(
+            r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + r"\\" + r"\/" + r"\*" + r"]{1,}"
+        )
+
+    def clean_text(self, text: str) -> str:
+        """Clean text using comprehensive text processing logic."""
+        # See Deepfloyd https://github.com/deep-floyd/IF/blob/develop/deepfloyd_if/modules/t5.py
+        text = str(text)
+        text = ul.unquote_plus(text)
+        text = text.strip().lower()
+        text = re.sub("<person>", "person", text)
+
+        # Remove all urls:
+        text = re.sub(
+            r"\b((?:https?|www):(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@))",
+            "",
+            text,
+        )  # regex for urls
+
+        # @<nickname>
+        text = re.sub(r"@[\w\d]+\b", "", text)
+
+        # 31C0—31EF CJK Strokes through 4E00—9FFF CJK Unified Ideographs
+        text = re.sub(r"[\u31c0-\u31ef]+", "", text)
+        text = re.sub(r"[\u31f0-\u31ff]+", "", text)
+        text = re.sub(r"[\u3200-\u32ff]+", "", text)
+        text = re.sub(r"[\u3300-\u33ff]+", "", text)
+        text = re.sub(r"[\u3400-\u4dbf]+", "", text)
+        text = re.sub(r"[\u4dc0-\u4dff]+", "", text)
+        text = re.sub(r"[\u4e00-\u9fff]+", "", text)
+
+        # все виды тире / all types of dash --> "-"
+        text = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",
+            "-",
+            text,
+        )
+
+        # кавычки к одному стандарту / quotes to one standard
+        text = re.sub(r"[`´«»“”¨]", '"', text)
+        text = re.sub(r"[‘’]", "'", text)
+
+        # &quot; and &amp
+        text = re.sub(r"&quot;?", "", text)
+        text = re.sub(r"&amp", "", text)
+
+        # ip addresses:
+        text = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", text)
+
+        # article ids:
+        text = re.sub(r"\d:\d\d\s+$", "", text)
+
+        # \n
+        text = re.sub(r"\\n", " ", text)
+
+        # "#123", "#12345..", "123456.."
+ text = re.sub(r"#\d{1,3}\b", "", text) + text = re.sub(r"#\d{5,}\b", "", text) + text = re.sub(r"\b\d{6,}\b", "", text) + + # filenames: + text = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", text) + + # Clean punctuation + text = re.sub(r"[\"\']{2,}", r'"', text) # """AUSVERKAUFT""" + text = re.sub(r"[\.]{2,}", r" ", text) + + text = re.sub(self.bad_punct_regex, r" ", text) # ***AUSVERKAUFT***, #AUSVERKAUFT + text = re.sub(r"\s+\.\s+", r" ", text) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, text)) > 3: + text = re.sub(regex2, " ", text) + + # Basic cleaning + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + text = text.strip() + + # Clean alphanumeric patterns + text = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", text) # jc6640 + text = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", text) # jc6640vc + text = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", text) # 6640vc231 + + # Common spam patterns + text = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", text) + text = re.sub(r"(free\s)?download(\sfree)?", "", text) + text = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", text) + text = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", text) + text = re.sub(r"\bpage\s+\d+\b", "", text) + + text = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", text) # j2d1a2a... + text = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", text) + + # Final cleanup + text = re.sub(r"\b\s+\:\s+", r": ", text) + text = re.sub(r"(\D[,\./])\b", r"\1 ", text) + text = re.sub(r"\s+", " ", text) + + text.strip() + + text = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", text) + text = re.sub(r"^[\'\_,\-\:;]", r"", text) + text = re.sub(r"[\'\_,\-\:\-\+]$", r"", text) + text = re.sub(r"^\.\S+$", "", text) + + return text.strip() + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import MiragePipeline + >>> from diffusers.models import AutoencoderKL, AutoencoderDC + >>> from transformers import T5GemmaModel, GemmaTokenizerFast + + >>> # Load pipeline directly with from_pretrained + >>> pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") + + >>> # Or initialize pipeline components manually + >>> transformer = MirageTransformer2DModel.from_pretrained("path/to/transformer") + >>> scheduler = FlowMatchEulerDiscreteScheduler() + >>> # Load T5Gemma encoder + >>> t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") + >>> text_encoder = t5gemma_model.encoder + >>> tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") + >>> vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") + + >>> pipe = MiragePipeline( + ... transformer=transformer, + ... scheduler=scheduler, + ... text_encoder=text_encoder, + ... tokenizer=tokenizer, + ... vae=vae + ... ) + >>> pipe.to("cuda") + >>> prompt = "A digital painting of a rusty, vintage tram on a sandy beach" + >>> image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] + >>> image.save("mirage_output.png") + ``` +""" + + +class MiragePipeline( + DiffusionPipeline, + LoraLoaderMixin, + FromSingleFileMixin, + TextualInversionLoaderMixin, +): + r""" + Pipeline for text-to-image generation using Mirage Transformer. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + Args: + transformer ([`MirageTransformer2DModel`]): + The Mirage transformer model to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + text_encoder ([`T5EncoderModel`]): + Standard text encoder model for encoding prompts. + tokenizer ([`T5TokenizerFast` or `GemmaTokenizerFast`]): + Tokenizer for the text encoder. + vae ([`AutoencoderKL`] or [`AutoencoderDC`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + Supports both AutoencoderKL (8x compression) and AutoencoderDC (32x compression). + """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents"] + _optional_components = [] + + # Component configurations for automatic loading + config_name = "model_index.json" + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + """ + Override from_pretrained to ensure T5GemmaEncoder is available for loading. + + This ensures that T5GemmaEncoder from transformers is accessible in the module namespace + during component loading, which is required for MiragePipeline checkpoints that use + T5GemmaEncoder as the text encoder. + """ + # Ensure T5GemmaEncoder is available for loading + import transformers + if not hasattr(transformers, 'T5GemmaEncoder'): + try: + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder + transformers.T5GemmaEncoder = T5GemmaEncoder + except ImportError: + # T5GemmaEncoder not available in this transformers version + pass + + # Proceed with standard loading + return super().from_pretrained(pretrained_model_name_or_path, **kwargs) + + + def __init__( + self, + transformer: MirageTransformer2DModel, + scheduler: FlowMatchEulerDiscreteScheduler, + text_encoder: Union[T5EncoderModel, Any], + tokenizer: Union[T5TokenizerFast, GemmaTokenizerFast, AutoTokenizer], + vae: Union[AutoencoderKL, AutoencoderDC], + ): + super().__init__() + + if MirageTransformer2DModel is None: + raise ImportError( + "MirageTransformer2DModel is not available. Please ensure the transformer_mirage module is properly installed." 
+ ) + + # Store standard components + self.text_encoder = text_encoder + self.tokenizer = tokenizer + + # Initialize text preprocessor + self.text_preprocessor = TextPreprocessor() + + self.register_modules( + transformer=transformer, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + vae=vae, + ) + + # Enhance VAE with universal properties for both AutoencoderKL and AutoencoderDC + self._enhance_vae_properties() + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae.spatial_compression_ratio) + + def _enhance_vae_properties(self): + """Add universal properties to VAE for consistent interface across AutoencoderKL and AutoencoderDC.""" + if not hasattr(self, "vae") or self.vae is None: + return + + # Set spatial_compression_ratio property + if hasattr(self.vae, "spatial_compression_ratio"): + # AutoencoderDC already has this property + pass + elif hasattr(self.vae, "config") and hasattr(self.vae.config, "block_out_channels"): + # AutoencoderKL: calculate from block_out_channels + self.vae.spatial_compression_ratio = 2 ** (len(self.vae.config.block_out_channels) - 1) + else: + # Fallback + self.vae.spatial_compression_ratio = 8 + + # Set scaling_factor property with safe defaults + if hasattr(self.vae, "config"): + self.vae.scaling_factor = getattr(self.vae.config, "scaling_factor", 0.18215) + else: + self.vae.scaling_factor = 0.18215 + + # Set shift_factor property with safe defaults (0.0 for AutoencoderDC) + if hasattr(self.vae, "config"): + shift_factor = getattr(self.vae.config, "shift_factor", None) + if shift_factor is None: # AutoencoderDC case + self.vae.shift_factor = 0.0 + else: + self.vae.shift_factor = shift_factor + else: + self.vae.shift_factor = 0.0 + + # Set latent_channels property (like VaeTower does) + if hasattr(self.vae, "config") and hasattr(self.vae.config, "latent_channels"): + # AutoencoderDC has latent_channels in config + self.vae.latent_channels = int(self.vae.config.latent_channels) + elif hasattr(self.vae, "config") and hasattr(self.vae.config, "in_channels"): + # AutoencoderKL has in_channels in config + self.vae.latent_channels = int(self.vae.config.in_channels) + else: + # Fallback based on VAE type - DC-AE typically has 32, AutoencoderKL has 4/16 + if hasattr(self.vae, "spatial_compression_ratio") and self.vae.spatial_compression_ratio == 32: + self.vae.latent_channels = 32 # DC-AE default + else: + self.vae.latent_channels = 4 # AutoencoderKL default + + @property + def vae_scale_factor(self): + """Compatibility property that returns spatial compression ratio.""" + return getattr(self.vae, "spatial_compression_ratio", 8) + + def prepare_latents( + self, + batch_size: int, + num_channels_latents: int, + height: int, + width: int, + dtype: torch.dtype, + device: torch.device, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.Tensor] = None, + ): + """Prepare initial latents for the diffusion process.""" + if latents is None: + latent_height, latent_width = height // self.vae.spatial_compression_ratio, width // self.vae.spatial_compression_ratio + shape = (batch_size, num_channels_latents, latent_height, latent_width) + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # FlowMatchEulerDiscreteScheduler doesn't use init_noise_sigma scaling + return latents + + def encode_prompt(self, prompt: Union[str, List[str]], device: torch.device): + """Encode text prompt using standard text encoder and tokenizer.""" + if isinstance(prompt, 
str): + prompt = [prompt] + + return self._encode_prompt_standard(prompt, device) + + def _encode_prompt_standard(self, prompt: List[str], device: torch.device): + """Encode prompt using standard text encoder and tokenizer with batch processing.""" + # Clean text using modular preprocessor + cleaned_prompts = [self.text_preprocessor.clean_text(text) for text in prompt] + cleaned_uncond_prompts = [self.text_preprocessor.clean_text("") for _ in prompt] + + # Batch conditional and unconditional prompts together for efficiency + all_prompts = cleaned_prompts + cleaned_uncond_prompts + + # Tokenize all prompts in one batch + tokens = self.tokenizer( + all_prompts, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + + input_ids = tokens["input_ids"].to(device) + attention_mask = tokens["attention_mask"].bool().to(device) + + # Encode all prompts in one batch + with torch.no_grad(): + # Disable autocast like in TextTower + with torch.autocast("cuda", enabled=False): + emb = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + ) + + # Use last hidden state (matching TextTower's use_last_hidden_state=True default) + all_embeddings = emb["last_hidden_state"] + + # Split back into conditional and unconditional + batch_size = len(prompt) + text_embeddings = all_embeddings[:batch_size] + uncond_text_embeddings = all_embeddings[batch_size:] + + cross_attn_mask = attention_mask[:batch_size] + uncond_cross_attn_mask = attention_mask[batch_size:] + + return text_embeddings, cross_attn_mask, uncond_text_embeddings, uncond_cross_attn_mask + + def check_inputs( + self, + prompt: Union[str, List[str]], + height: int, + width: int, + guidance_scale: float, + callback_on_step_end_tensor_inputs: Optional[List[str]] = None, + ): + """Check that all inputs are in correct format.""" + if height % self.vae.spatial_compression_ratio != 0 or width % self.vae.spatial_compression_ratio != 0: + raise ValueError(f"`height` and `width` have to be divisible by {self.vae.spatial_compression_ratio} but are {height} and {width}.") + + if guidance_scale < 1.0: + raise ValueError(f"guidance_scale has to be >= 1.0 but is {guidance_scale}") + + if callback_on_step_end_tensor_inputs is not None and not isinstance(callback_on_step_end_tensor_inputs, list): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be a list but is {callback_on_step_end_tensor_inputs}" + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 28, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. 
+ height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 28): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.mirage.MiragePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self, step, timestep, callback_kwargs)`. + `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include tensors that are listed + in the `._callback_tensor_inputs` attribute. + + Examples: + + Returns: + [`~pipelines.mirage.MiragePipelineOutput`] or `tuple`: [`~pipelines.mirage.MiragePipelineOutput`] if + `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the + generated images. + """ + + # 0. Default height and width to transformer config + height = height or 256 + width = width or 256 + + # 1. 
Check inputs + self.check_inputs( + prompt, + height, + width, + guidance_scale, + callback_on_step_end_tensor_inputs, + ) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError("prompt must be provided as a string or list of strings") + + device = self._execution_device + + # 2. Encode input prompt + text_embeddings, cross_attn_mask, uncond_text_embeddings, uncond_cross_attn_mask = self.encode_prompt( + prompt, device + ) + + # 3. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 4. Prepare latent variables + num_channels_latents = self.vae.latent_channels # From your transformer config + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + text_embeddings.dtype, + device, + generator, + latents, + ) + + # 5. Prepare extra step kwargs + extra_step_kwargs = {} + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_eta: + extra_step_kwargs["eta"] = 0.0 + + # 6. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Duplicate latents for CFG + latents_in = torch.cat([latents, latents], dim=0) + + # Cross-attention batch (uncond, cond) + ca_embed = torch.cat([uncond_text_embeddings, text_embeddings], dim=0) + ca_mask = None + if cross_attn_mask is not None and uncond_cross_attn_mask is not None: + ca_mask = torch.cat([uncond_cross_attn_mask, cross_attn_mask], dim=0) + + # Normalize timestep for the transformer + t_cont = (t.float() / self.scheduler.config.num_train_timesteps).view(1).repeat(2).to(device) + + # Process inputs for transformer + img_seq, txt, pe = self.transformer.process_inputs(latents_in, ca_embed) + + # Forward through transformer layers + img_seq = self.transformer.forward_transformers( + img_seq, txt, time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), + pe=pe, attention_mask=ca_mask + ) + + # Convert back to image format + from ...models.transformers.transformer_mirage import seq2img + noise_both = seq2img(img_seq, self.transformer.patch_size, latents_in.shape) + + # Apply CFG + noise_uncond, noise_text = noise_both.chunk(2, dim=0) + noise_pred = noise_uncond + guidance_scale * (noise_text - noise_uncond) + + # Compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_on_step_end(self, i, t, callback_kwargs) + + # Call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 8. 
Post-processing + if output_type == "latent": + image = latents + else: + # Unscale latents for VAE (supports both AutoencoderKL and AutoencoderDC) + latents = (latents / self.vae.scaling_factor) + self.vae.shift_factor + # Decode using VAE (AutoencoderKL or AutoencoderDC) + image = self.vae.decode(latents, return_dict=False)[0] + # Use standard image processor for post-processing + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return MiragePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/mirage/pipeline_output.py new file mode 100644 index 000000000000..e5cdb2a40924 --- /dev/null +++ b/src/diffusers/pipelines/mirage/pipeline_output.py @@ -0,0 +1,35 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput + + +@dataclass +class MiragePipelineOutput(BaseOutput): + """ + Output class for Mirage pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] \ No newline at end of file diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py new file mode 100644 index 000000000000..11accdaecbee --- /dev/null +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -0,0 +1,252 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch + +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel, MirageParams + +from ...testing_utils import enable_full_determinism, torch_device +from ..test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class MirageTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = MirageTransformer2DModel + main_input_name = "image_latent" + + @property + def dummy_input(self): + return self.prepare_dummy_input() + + @property + def input_shape(self): + return (16, 4, 4) + + @property + def output_shape(self): + return (16, 4, 4) + + def prepare_dummy_input(self, height=32, width=32): + batch_size = 1 + num_latent_channels = 16 + sequence_length = 16 + embedding_dim = 1792 + + image_latent = torch.randn((batch_size, num_latent_channels, height, width)).to(torch_device) + cross_attn_conditioning = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device) + micro_conditioning = torch.randn((batch_size, embedding_dim)).to(torch_device) + timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size) + + return { + "image_latent": image_latent, + "timestep": timestep, + "cross_attn_conditioning": cross_attn_conditioning, + "micro_conditioning": micro_conditioning, + } + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 16, + "patch_size": 2, + "context_in_dim": 1792, + "hidden_size": 1792, + "mlp_ratio": 3.5, + "num_heads": 28, + "depth": 4, # Smaller depth for testing + "axes_dim": [32, 32], + "theta": 10_000, + } + inputs_dict = self.prepare_dummy_input() + return init_dict, inputs_dict + + def test_forward_signature(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + # Test forward + outputs = model(**inputs_dict) + + self.assertIsNotNone(outputs) + expected_shape = inputs_dict["image_latent"].shape + self.assertEqual(outputs.shape, expected_shape) + + def test_mirage_params_initialization(self): + # Test model initialization + model = MirageTransformer2DModel( + in_channels=16, + patch_size=2, + context_in_dim=1792, + hidden_size=1792, + mlp_ratio=3.5, + num_heads=28, + depth=4, + axes_dim=[32, 32], + theta=10_000, + ) + self.assertEqual(model.config.in_channels, 16) + self.assertEqual(model.config.hidden_size, 1792) + self.assertEqual(model.config.num_heads, 28) + + def test_model_with_dict_config(self): + # Test model initialization with from_config + config_dict = { + "in_channels": 16, + "patch_size": 2, + "context_in_dim": 1792, + "hidden_size": 1792, + "mlp_ratio": 3.5, + "num_heads": 28, + "depth": 4, + "axes_dim": [32, 32], + "theta": 10_000, + } + + model = MirageTransformer2DModel.from_config(config_dict) + self.assertEqual(model.config.in_channels, 16) + self.assertEqual(model.config.hidden_size, 1792) + + def test_process_inputs(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + img_seq, txt, pe = model.process_inputs( + inputs_dict["image_latent"], + inputs_dict["cross_attn_conditioning"] + ) + + # Check shapes + batch_size = inputs_dict["image_latent"].shape[0] + height, width = inputs_dict["image_latent"].shape[2:] + patch_size = init_dict["patch_size"] + expected_seq_len = (height // patch_size) * (width // patch_size) + + 
self.assertEqual(img_seq.shape, (batch_size, expected_seq_len, init_dict["in_channels"] * patch_size**2)) + self.assertEqual(txt.shape, (batch_size, inputs_dict["cross_attn_conditioning"].shape[1], init_dict["hidden_size"])) + # Check that pe has the correct batch size, sequence length and some embedding dimension + self.assertEqual(pe.shape[0], batch_size) # batch size + self.assertEqual(pe.shape[1], 1) # unsqueeze(1) in EmbedND + self.assertEqual(pe.shape[2], expected_seq_len) # sequence length + self.assertEqual(pe.shape[-2:], (2, 2)) # rope rearrange output + + def test_forward_transformers(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + # Process inputs first + img_seq, txt, pe = model.process_inputs( + inputs_dict["image_latent"], + inputs_dict["cross_attn_conditioning"] + ) + + # Test forward_transformers + output_seq = model.forward_transformers( + img_seq, + txt, + timestep=inputs_dict["timestep"], + pe=pe + ) + + # Check output shape + expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"]**2 + self.assertEqual(output_seq.shape, (img_seq.shape[0], img_seq.shape[1], expected_out_channels)) + + def test_attention_mask(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + # Create attention mask + batch_size = inputs_dict["cross_attn_conditioning"].shape[0] + seq_len = inputs_dict["cross_attn_conditioning"].shape[1] + attention_mask = torch.ones((batch_size, seq_len), dtype=torch.bool).to(torch_device) + attention_mask[:, seq_len//2:] = False # Mask second half + + with torch.no_grad(): + outputs = model( + **inputs_dict, + cross_attn_mask=attention_mask + ) + + self.assertIsNotNone(outputs) + expected_shape = inputs_dict["image_latent"].shape + self.assertEqual(outputs.shape, expected_shape) + + def test_invalid_config(self): + # Test invalid configuration - hidden_size not divisible by num_heads + with self.assertRaises(ValueError): + MirageTransformer2DModel( + in_channels=16, + patch_size=2, + context_in_dim=1792, + hidden_size=1793, # Not divisible by 28 + mlp_ratio=3.5, + num_heads=28, + depth=4, + axes_dim=[32, 32], + theta=10_000, + ) + + # Test invalid axes_dim that doesn't sum to pe_dim + with self.assertRaises(ValueError): + MirageTransformer2DModel( + in_channels=16, + patch_size=2, + context_in_dim=1792, + hidden_size=1792, + mlp_ratio=3.5, + num_heads=28, + depth=4, + axes_dim=[30, 30], # Sum = 60, but pe_dim = 1792/28 = 64 + theta=10_000, + ) + + def test_gradient_checkpointing_enable(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + + # Enable gradient checkpointing + model.enable_gradient_checkpointing() + + # Check that _activation_checkpointing is set + for block in model.blocks: + self.assertTrue(hasattr(block, '_activation_checkpointing')) + + def test_from_config(self): + init_dict, _ = self.prepare_init_args_and_inputs_for_common() + + # Create model from config + model = self.model_class.from_config(init_dict) + self.assertIsInstance(model, self.model_class) + self.assertEqual(model.config.in_channels, init_dict["in_channels"]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 4ac274be3d7647655437c6b810d1daa5c650f093 Mon Sep 17 00:00:00 2001 From: David Bertoin 
Date: Fri, 26 Sep 2025 11:51:14 +0200 Subject: [PATCH 03/69] use attention processors --- src/diffusers/models/attention_processor.py | 58 +++++++++++++ .../models/transformers/transformer_mirage.py | 86 ++++++++++++++++--- 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 990245de1742..08e80e4329ba 100755 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -5609,6 +5609,63 @@ def __new__(cls, *args, **kwargs): return processor +class MirageAttnProcessor2_0: + r""" + Processor for implementing Mirage-style attention with multi-source tokens and RoPE. + Properly integrates with diffusers Attention module while handling Mirage-specific logic. + """ + + def __init__(self): + if not hasattr(torch.nn.functional, "scaled_dot_product_attention"): + raise ImportError("MirageAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: "Attention", + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + """ + Apply Mirage attention using standard diffusers interface. + + Expected tensor formats from MirageBlock.attn_forward(): + - hidden_states: Image queries with RoPE applied [B, H, L_img, D] + - encoder_hidden_states: Packed key+value tensors [B, H, L_all, 2*D] + (concatenated keys and values from text + image + spatial conditioning) + - attention_mask: Custom attention mask [B, H, L_img, L_all] or None + """ + + if encoder_hidden_states is None: + raise ValueError( + "MirageAttnProcessor2_0 requires 'encoder_hidden_states' containing packed key+value tensors. " + "This should be provided by MirageBlock.attn_forward()." 
+ ) + + # Unpack the combined key+value tensor + # encoder_hidden_states is [B, H, L_all, 2*D] containing [keys, values] + key, value = encoder_hidden_states.chunk(2, dim=-1) # Each [B, H, L_all, D] + + # Apply scaled dot-product attention with Mirage's processed tensors + # hidden_states is image queries [B, H, L_img, D] + attn_output = torch.nn.functional.scaled_dot_product_attention( + hidden_states.contiguous(), key.contiguous(), value.contiguous(), attn_mask=attention_mask + ) + + # Reshape from [B, H, L_img, D] to [B, L_img, H*D] + batch_size, num_heads, seq_len, head_dim = attn_output.shape + attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, num_heads * head_dim) + + # Apply output projection using the diffusers Attention module + attn_output = attn.to_out[0](attn_output) + if len(attn.to_out) > 1: + attn_output = attn.to_out[1](attn_output) # dropout if present + + return attn_output + + ADDED_KV_ATTENTION_PROCESSORS = ( AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, @@ -5657,6 +5714,7 @@ def __new__(cls, *args, **kwargs): PAGHunyuanAttnProcessor2_0, PAGCFGHunyuanAttnProcessor2_0, LuminaAttnProcessor2_0, + MirageAttnProcessor2_0, FusedAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0, diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 39c569cbb26b..0225b9532aff 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -24,6 +24,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ..modeling_utils import ModelMixin from ..modeling_outputs import Transformer2DModelOutput +from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers @@ -159,13 +160,21 @@ def __init__( # img qkv self.img_pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.img_qkv_proj = nn.Linear(hidden_size, hidden_size * 3, bias=False) - self.attn_out = nn.Linear(hidden_size, hidden_size, bias=False) self.qk_norm = QKNorm(self.head_dim) # txt kv self.txt_kv_proj = nn.Linear(hidden_size, hidden_size * 2, bias=False) self.k_norm = RMSNorm(self.head_dim) + self.attention = Attention( + query_dim=hidden_size, + heads=num_heads, + dim_head=self.head_dim, + bias=False, + out_bias=False, + processor=MirageAttnProcessor2_0(), + ) + # mlp self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) @@ -214,15 +223,11 @@ def attn_forward( k = torch.cat((cond_k, k), dim=2) v = torch.cat((cond_v, v), dim=2) - # build additive attention bias - attn_bias: Tensor | None = None - attn_mask: Tensor | None = None - # build multiplicative 0/1 mask for provided attention_mask over [cond?, text, image] keys + attn_mask: Tensor | None = None if attention_mask is not None: bs, _, l_img, _ = img_q.shape l_txt = txt_k.shape[2] - l_all = k.shape[2] assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" assert ( @@ -244,11 +249,13 @@ def attn_forward( # repeat across heads and query positions attn_mask = joint_mask[:, None, None, :].expand(-1, self.num_heads, l_img, -1) # (B,H,L_img,L_all) - attn = torch.nn.functional.scaled_dot_product_attention( - img_q.contiguous(), k.contiguous(), v.contiguous(), attn_mask=attn_mask + kv_packed = torch.cat([k, v], dim=-1) + + attn = self.attention( + 
hidden_states=img_q, + encoder_hidden_states=kv_packed, + attention_mask=attn_mask, ) - attn = rearrange(attn, "B H L D -> B L (H D)") - attn = self.attn_out(attn) return attn @@ -413,6 +420,65 @@ def __init__( self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: """Timestep independent stuff""" txt = self.txt_in(txt) From 904debcd11de7c6103e091b3223cd459b03d05a1 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 12:50:19 +0200 Subject: [PATCH 04/69] use diffusers rmsnorm --- .../models/transformers/transformer_mirage.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 0225b9532aff..f4199da1edcc 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -26,12 +26,12 @@ from ..modeling_outputs import Transformer2DModelOutput from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers +from ..normalization import RMSNorm logger = logging.get_logger(__name__) -# Mirage Layer Components def get_image_ids(bs: int, h: int, w: int, patch_size: int, device: torch.device) -> Tensor: img_ids = torch.zeros(h // patch_size, w // patch_size, 2, device=device) img_ids[..., 0] = torch.arange(h // patch_size, device=device)[:, None] @@ -93,23 +93,13 @@ def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int): - super().__init__() - self.scale = nn.Parameter(torch.ones(dim)) - - def forward(self, x: Tensor) -> Tensor: - x_dtype = x.dtype - x = x.float() - rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) - return (x * rrms * self.scale).to(dtype=x_dtype) class QKNorm(torch.nn.Module): def __init__(self, dim: int): super().__init__() - self.query_norm = RMSNorm(dim) - self.key_norm = RMSNorm(dim) + self.query_norm = RMSNorm(dim, eps=1e-6) + self.key_norm = RMSNorm(dim, eps=1e-6) def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]: q = self.query_norm(q) @@ -164,7 +154,7 @@ def __init__( # txt kv self.txt_kv_proj = nn.Linear(hidden_size, hidden_size * 2, bias=False) - self.k_norm = RMSNorm(self.head_dim) + self.k_norm = RMSNorm(self.head_dim, eps=1e-6) self.attention = Attention( query_dim=hidden_size, From 122115adb1305834b298e677ae30fcef65c4fd35 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 14:26:50 +0200 Subject: [PATCH 05/69] use diffusers timestep embedding method --- .../models/transformers/transformer_mirage.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index f4199da1edcc..916559eb47ac 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -27,6 +27,7 @@ from ..attention_processor import Attention, AttentionProcessor, 
MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ..normalization import RMSNorm +from ..embeddings import get_timestep_embedding logger = logging.get_logger(__name__) @@ -71,15 +72,6 @@ def forward(self, ids: Tensor) -> Tensor: return emb.unsqueeze(1) -def timestep_embedding(t: Tensor, dim: int, max_period: int = 10000, time_factor: float = 1000.0) -> Tensor: - t = time_factor * t - half = dim // 2 - freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device) - args = t[:, None].float() * freqs[None] - embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) - if dim % 2: - embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) - return embedding class MLPEmbedder(nn.Module): @@ -480,8 +472,12 @@ def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[T def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: return self.time_in( - timestep_embedding( - t=timestep, dim=256, max_period=self.time_max_period, time_factor=self.time_factor + get_timestep_embedding( + timesteps=timestep, + embedding_dim=256, + max_period=self.time_max_period, + scale=self.time_factor, + flip_sin_to_cos=True # Match original cos, sin order ).to(dtype) ) From 4588bbeb4229fd307119257e273a424b370573b1 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 26 Sep 2025 18:41:17 +0530 Subject: [PATCH 06/69] [CI] disable installing transformers from main in ci for now. (#12397) * disable installing transformers from main in ci for now. * up * u[p --- .github/workflows/pr_modular_tests.yml | 5 +++-- .github/workflows/pr_tests.yml | 12 +++++++----- .github/workflows/pr_tests_gpu.yml | 13 ++++++++----- tests/pipelines/kandinsky/test_kandinsky.py | 4 +++- .../pipelines/kandinsky/test_kandinsky_combined.py | 12 +++++++++--- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 4 +++- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 4 +++- 7 files changed, 36 insertions(+), 18 deletions(-) diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml index e01345e32524..c6e87e642dc5 100644 --- a/.github/workflows/pr_modular_tests.yml +++ b/.github/workflows/pr_modular_tests.yml @@ -110,8 +110,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. 
+ # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 34a344528e3e..ebfe9f442f30 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -116,8 +116,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | @@ -253,9 +254,10 @@ jobs: python -m uv pip install -e [quality,test] # TODO (sayakpaul, DN6): revisit `--no-deps` python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps - python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - python -m uv pip install -U tokenizers - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # python -m uv pip install -U tokenizers + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | diff --git a/.github/workflows/pr_tests_gpu.yml b/.github/workflows/pr_tests_gpu.yml index 45294c89fe35..1a8d5f6b815e 100644 --- a/.github/workflows/pr_tests_gpu.yml +++ b/.github/workflows/pr_tests_gpu.yml @@ -132,8 +132,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. 
+ # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - name: Environment run: | @@ -203,8 +204,9 @@ jobs: python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install peft@git+https://github.com/huggingface/peft.git - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - name: Environment run: | @@ -266,7 +268,8 @@ jobs: - name: Install dependencies run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps python -m uv pip install -e [quality,test,training] - name: Environment diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 9fa39b1bf581..6207e71df8cd 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -218,7 +218,9 @@ def get_dummy_inputs(self, device, seed=0): return dummy.get_dummy_inputs(device=device, seed=seed) @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index ca80461d87b1..eba897659700 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -76,7 +76,9 @@ def get_dummy_inputs(self, device, seed=0): return inputs @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" @@ -187,7 +189,9 @@ def get_dummy_inputs(self, device, seed=0): return inputs @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" @@ -301,7 +305,9 @@ def get_dummy_inputs(self, device, seed=0): return inputs @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest 
transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 6bcd9587f239..6d1b43a24fd9 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -240,7 +240,9 @@ def get_dummy_inputs(self, device, seed=0): return dummies.get_dummy_inputs(device=device, seed=seed) @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky_img2img(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 6383ca71ef23..e2f4aa2a4f14 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -234,7 +234,9 @@ def get_dummy_inputs(self, device, seed=0): return dummies.get_dummy_inputs(device=device, seed=seed) @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky_inpaint(self): device = "cpu" From e3fe0e8e1f79216cfe83719debc1ed33dfb3e788 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 15:17:11 +0200 Subject: [PATCH 07/69] remove MirageParams --- .../models/transformers/transformer_mirage.py | 64 +++++-------------- .../pipelines/mirage/pipeline_output.py | 2 +- .../test_models_transformer_mirage.py | 8 +-- 3 files changed, 22 insertions(+), 52 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 916559eb47ac..396e000524ec 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -288,20 +288,6 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor: return x -@dataclass -class MirageParams: - in_channels: int - patch_size: int - context_in_dim: int - hidden_size: int - mlp_ratio: float - num_heads: int - depth: int - axes_dim: list[int] - theta: int - time_factor: float = 1000.0 - time_max_period: int = 10_000 - conditioning_block_ids: list[int] | None = None def img2seq(img: Tensor, patch_size: int) -> Tensor: @@ -348,55 +334,39 @@ def __init__( if axes_dim is None: axes_dim = [32, 32] - # Create MirageParams from the provided arguments - params = MirageParams( - in_channels=in_channels, - patch_size=patch_size, - context_in_dim=context_in_dim, - hidden_size=hidden_size, - mlp_ratio=mlp_ratio, - num_heads=num_heads, - depth=depth, - axes_dim=axes_dim, - theta=theta, - time_factor=time_factor, - time_max_period=time_max_period, - conditioning_block_ids=conditioning_block_ids, - ) - - self.params = params - self.in_channels = params.in_channels - self.patch_size = params.patch_size + # Store parameters directly + self.in_channels = in_channels + self.patch_size = patch_size self.out_channels = self.in_channels * self.patch_size**2 - self.time_factor = params.time_factor - self.time_max_period = 
params.time_max_period + self.time_factor = time_factor + self.time_max_period = time_max_period - if params.hidden_size % params.num_heads != 0: - raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + if hidden_size % num_heads != 0: + raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}") - pe_dim = params.hidden_size // params.num_heads + pe_dim = hidden_size // num_heads - if sum(params.axes_dim) != pe_dim: - raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + if sum(axes_dim) != pe_dim: + raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") - self.hidden_size = params.hidden_size - self.num_heads = params.num_heads - self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.hidden_size = hidden_size + self.num_heads = num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim) self.img_in = nn.Linear(self.in_channels * self.patch_size**2, self.hidden_size, bias=True) self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + self.txt_in = nn.Linear(context_in_dim, self.hidden_size) - conditioning_block_ids: list[int] = params.conditioning_block_ids or list(range(params.depth)) + conditioning_block_ids: list[int] = conditioning_block_ids or list(range(depth)) self.blocks = nn.ModuleList( [ MirageBlock( self.hidden_size, self.num_heads, - mlp_ratio=params.mlp_ratio, + mlp_ratio=mlp_ratio, ) - for i in range(params.depth) + for i in range(depth) ] ) diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/mirage/pipeline_output.py index e5cdb2a40924..dfb55821d142 100644 --- a/src/diffusers/pipelines/mirage/pipeline_output.py +++ b/src/diffusers/pipelines/mirage/pipeline_output.py @@ -13,7 +13,7 @@ # limitations under the License. 
from dataclasses import dataclass -from typing import List, Optional, Union +from typing import List, Union import numpy as np import PIL.Image diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py index 11accdaecbee..5e7b0bd165a6 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -17,7 +17,7 @@ import torch -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel, MirageParams +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel from ...testing_utils import enable_full_determinism, torch_device from ..test_modeling_common import ModelTesterMixin @@ -88,9 +88,9 @@ def test_forward_signature(self): self.assertIsNotNone(outputs) expected_shape = inputs_dict["image_latent"].shape - self.assertEqual(outputs.shape, expected_shape) + self.assertEqual(outputs.sample.shape, expected_shape) - def test_mirage_params_initialization(self): + def test_model_initialization(self): # Test model initialization model = MirageTransformer2DModel( in_channels=16, @@ -196,7 +196,7 @@ def test_attention_mask(self): self.assertIsNotNone(outputs) expected_shape = inputs_dict["image_latent"].shape - self.assertEqual(outputs.shape, expected_shape) + self.assertEqual(outputs.sample.shape, expected_shape) def test_invalid_config(self): # Test invalid configuration - hidden_size not divisible by num_heads From 85ae87b9311a1432f43c2928389c8eafc86c0991 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 16:35:56 +0200 Subject: [PATCH 08/69] checkpoint conversion script --- scripts/convert_mirage_to_diffusers.py | 312 +++++++++++++++++++++++++ 1 file changed, 312 insertions(+) create mode 100644 scripts/convert_mirage_to_diffusers.py diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py new file mode 100644 index 000000000000..85716e69ff92 --- /dev/null +++ b/scripts/convert_mirage_to_diffusers.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +""" +Script to convert Mirage checkpoint from original codebase to diffusers format. +""" + +import argparse +import json +import os +import shutil +import sys +import torch +from safetensors.torch import save_file +from transformers import GemmaTokenizerFast + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from diffusers.pipelines.mirage import MiragePipeline + +def load_reference_config(vae_type: str) -> dict: + """Load transformer config from existing pipeline checkpoint.""" + + if vae_type == "flux": + config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated/transformer/config.json" + elif vae_type == "dc-ae": + config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated/transformer/config.json" + else: + raise ValueError(f"Unsupported VAE type: {vae_type}. 
Use 'flux' or 'dc-ae'") + + if not os.path.exists(config_path): + raise FileNotFoundError(f"Reference config not found: {config_path}") + + with open(config_path, 'r') as f: + config = json.load(f) + + print(f"✓ Loaded {vae_type} config: in_channels={config['in_channels']}") + return config + +def create_parameter_mapping() -> dict: + """Create mapping from old parameter names to new diffusers names.""" + + # Key mappings for structural changes + mapping = {} + + # RMSNorm: scale -> weight + for i in range(16): # 16 layers + mapping[f"blocks.{i}.qk_norm.query_norm.scale"] = f"blocks.{i}.qk_norm.query_norm.weight" + mapping[f"blocks.{i}.qk_norm.key_norm.scale"] = f"blocks.{i}.qk_norm.key_norm.weight" + mapping[f"blocks.{i}.k_norm.scale"] = f"blocks.{i}.k_norm.weight" + + # Attention: attn_out -> attention.to_out.0 + mapping[f"blocks.{i}.attn_out.weight"] = f"blocks.{i}.attention.to_out.0.weight" + + return mapping + +def convert_checkpoint_parameters(old_state_dict: dict) -> dict: + """Convert old checkpoint parameters to new diffusers format.""" + + print("Converting checkpoint parameters...") + + mapping = create_parameter_mapping() + converted_state_dict = {} + + # First, print available keys to understand structure + print("Available keys in checkpoint:") + for key in sorted(old_state_dict.keys())[:10]: # Show first 10 keys + print(f" {key}") + if len(old_state_dict) > 10: + print(f" ... and {len(old_state_dict) - 10} more") + + for key, value in old_state_dict.items(): + new_key = key + + # Apply specific mappings if needed + if key in mapping: + new_key = mapping[key] + print(f" Mapped: {key} -> {new_key}") + + # Handle img_qkv_proj -> split to to_q, to_k, to_v + if "img_qkv_proj.weight" in key: + print(f" Found QKV projection: {key}") + # Split QKV weight into separate Q, K, V projections + qkv_weight = value + hidden_size = qkv_weight.shape[1] + q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0) + + # Extract layer number from key (e.g., blocks.0.img_qkv_proj.weight -> 0) + parts = key.split('.') + layer_idx = None + for i, part in enumerate(parts): + if part == 'blocks' and i + 1 < len(parts) and parts[i+1].isdigit(): + layer_idx = parts[i+1] + break + + if layer_idx is not None: + converted_state_dict[f"blocks.{layer_idx}.attention.to_q.weight"] = q_weight + converted_state_dict[f"blocks.{layer_idx}.attention.to_k.weight"] = k_weight + converted_state_dict[f"blocks.{layer_idx}.attention.to_v.weight"] = v_weight + print(f" Split QKV for layer {layer_idx}") + + # Also keep the original img_qkv_proj for backward compatibility + converted_state_dict[new_key] = value + else: + converted_state_dict[new_key] = value + + print(f"✓ Converted {len(converted_state_dict)} parameters") + return converted_state_dict + + +def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> MirageTransformer2DModel: + """Create and load MirageTransformer2DModel from old checkpoint.""" + + print(f"Loading checkpoint from: {checkpoint_path}") + + # Load old checkpoint + if not os.path.exists(checkpoint_path): + raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}") + + old_checkpoint = torch.load(checkpoint_path, map_location='cpu') + + # Handle different checkpoint formats + if isinstance(old_checkpoint, dict): + if 'model' in old_checkpoint: + state_dict = old_checkpoint['model'] + elif 'state_dict' in old_checkpoint: + state_dict = old_checkpoint['state_dict'] + else: + state_dict = old_checkpoint + else: + state_dict = old_checkpoint + + print(f"✓ Loaded checkpoint 
with {len(state_dict)} parameters") + + # Convert parameter names if needed + converted_state_dict = convert_checkpoint_parameters(state_dict) + + # Create transformer with config + print("Creating MirageTransformer2DModel...") + transformer = MirageTransformer2DModel(**config) + + # Load state dict + print("Loading converted parameters...") + missing_keys, unexpected_keys = transformer.load_state_dict(converted_state_dict, strict=False) + + if missing_keys: + print(f"⚠ Missing keys: {missing_keys}") + if unexpected_keys: + print(f"⚠ Unexpected keys: {unexpected_keys}") + + if not missing_keys and not unexpected_keys: + print("✓ All parameters loaded successfully!") + + return transformer + +def copy_pipeline_components(vae_type: str, output_path: str): + """Copy VAE, scheduler, text encoder, and tokenizer from reference pipeline.""" + + if vae_type == "flux": + ref_pipeline = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated" + else: # dc-ae + ref_pipeline = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated" + + components = ["vae", "scheduler", "text_encoder", "tokenizer"] + + for component in components: + src_path = os.path.join(ref_pipeline, component) + dst_path = os.path.join(output_path, component) + + if os.path.exists(src_path): + if os.path.isdir(src_path): + shutil.copytree(src_path, dst_path, dirs_exist_ok=True) + else: + shutil.copy2(src_path, dst_path) + print(f"✓ Copied {component}") + else: + print(f"⚠ Component not found: {src_path}") + +def create_model_index(vae_type: str, output_path: str): + """Create model_index.json for the pipeline.""" + + if vae_type == "flux": + vae_class = "AutoencoderKL" + else: # dc-ae + vae_class = "AutoencoderDC" + + model_index = { + "_class_name": "MiragePipeline", + "_diffusers_version": "0.31.0.dev0", + "_name_or_path": os.path.basename(output_path), + "scheduler": [ + "diffusers", + "FlowMatchEulerDiscreteScheduler" + ], + "text_encoder": [ + "transformers", + "T5GemmaEncoder" + ], + "tokenizer": [ + "transformers", + "GemmaTokenizerFast" + ], + "transformer": [ + "diffusers", + "MirageTransformer2DModel" + ], + "vae": [ + "diffusers", + vae_class + ] + } + + model_index_path = os.path.join(output_path, "model_index.json") + with open(model_index_path, 'w') as f: + json.dump(model_index, f, indent=2) + + print(f"✓ Created model_index.json") + +def main(args): + # Validate inputs + if not os.path.exists(args.checkpoint_path): + raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint_path}") + + # Load reference config based on VAE type + config = load_reference_config(args.vae_type) + + # Create output directory + os.makedirs(args.output_path, exist_ok=True) + print(f"✓ Output directory: {args.output_path}") + + # Create transformer from checkpoint + transformer = create_transformer_from_checkpoint(args.checkpoint_path, config) + + # Save transformer + transformer_path = os.path.join(args.output_path, "transformer") + os.makedirs(transformer_path, exist_ok=True) + + # Save config + with open(os.path.join(transformer_path, "config.json"), 'w') as f: + json.dump(config, f, indent=2) + + # Save model weights as safetensors + state_dict = transformer.state_dict() + save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) + print(f"✓ Saved transformer to {transformer_path}") + + # Copy other pipeline components + copy_pipeline_components(args.vae_type, 
args.output_path) + + # Create model index + create_model_index(args.vae_type, args.output_path) + + # Verify the pipeline can be loaded + try: + pipeline = MiragePipeline.from_pretrained(args.output_path) + print(f"Pipeline loaded successfully!") + print(f"Transformer: {type(pipeline.transformer).__name__}") + print(f"VAE: {type(pipeline.vae).__name__}") + print(f"Text Encoder: {type(pipeline.text_encoder).__name__}") + print(f"Scheduler: {type(pipeline.scheduler).__name__}") + + # Display model info + num_params = sum(p.numel() for p in pipeline.transformer.parameters()) + print(f"✓ Transformer parameters: {num_params:,}") + + except Exception as e: + print(f"Pipeline verification failed: {e}") + return False + + print("Conversion completed successfully!") + print(f"Converted pipeline saved to: {args.output_path}") + print(f"VAE type: {args.vae_type}") + + + return True + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert Mirage checkpoint to diffusers format") + + parser.add_argument( + "--checkpoint_path", + type=str, + required=True, + help="Path to the original Mirage checkpoint (.pth file)" + ) + + parser.add_argument( + "--output_path", + type=str, + required=True, + help="Output directory for the converted diffusers pipeline" + ) + + parser.add_argument( + "--vae_type", + type=str, + choices=["flux", "dc-ae"], + required=True, + help="VAE type to use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 channels)" + ) + + args = parser.parse_args() + + try: + success = main(args) + if not success: + sys.exit(1) + except Exception as e: + print(f"❌ Conversion failed: {e}") + import traceback + traceback.print_exc() + sys.exit(1) \ No newline at end of file From 9a697d06b70eaa4e0c9f1f1b5bca6209c65b005b Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 17:00:55 +0200 Subject: [PATCH 09/69] ruff formating --- scripts/convert_mirage_to_diffusers.py | 83 ++++++++----------- .../models/transformers/transformer_mirage.py | 41 ++++----- src/diffusers/pipelines/mirage/__init__.py | 3 +- .../pipelines/mirage/pipeline_mirage.py | 50 +++++++---- .../pipelines/mirage/pipeline_output.py | 2 +- .../test_models_transformer_mirage.py | 30 +++---- 6 files changed, 100 insertions(+), 109 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index 85716e69ff92..5e2a2ff768f4 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -8,16 +8,17 @@ import os import shutil import sys + import torch from safetensors.torch import save_file -from transformers import GemmaTokenizerFast -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel -from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.pipelines.mirage import MiragePipeline + def load_reference_config(vae_type: str) -> dict: """Load transformer config from existing pipeline checkpoint.""" @@ -31,12 +32,13 @@ def load_reference_config(vae_type: str) -> dict: if not os.path.exists(config_path): raise FileNotFoundError(f"Reference config not found: {config_path}") - with open(config_path, 'r') as f: + with open(config_path, "r") as f: config = json.load(f) print(f"✓ Loaded {vae_type} config: in_channels={config['in_channels']}") return config + def create_parameter_mapping() -> dict: 
"""Create mapping from old parameter names to new diffusers names.""" @@ -54,6 +56,7 @@ def create_parameter_mapping() -> dict: return mapping + def convert_checkpoint_parameters(old_state_dict: dict) -> dict: """Convert old checkpoint parameters to new diffusers format.""" @@ -82,15 +85,14 @@ def convert_checkpoint_parameters(old_state_dict: dict) -> dict: print(f" Found QKV projection: {key}") # Split QKV weight into separate Q, K, V projections qkv_weight = value - hidden_size = qkv_weight.shape[1] q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0) # Extract layer number from key (e.g., blocks.0.img_qkv_proj.weight -> 0) - parts = key.split('.') + parts = key.split(".") layer_idx = None for i, part in enumerate(parts): - if part == 'blocks' and i + 1 < len(parts) and parts[i+1].isdigit(): - layer_idx = parts[i+1] + if part == "blocks" and i + 1 < len(parts) and parts[i + 1].isdigit(): + layer_idx = parts[i + 1] break if layer_idx is not None: @@ -117,14 +119,14 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi if not os.path.exists(checkpoint_path): raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}") - old_checkpoint = torch.load(checkpoint_path, map_location='cpu') + old_checkpoint = torch.load(checkpoint_path, map_location="cpu") # Handle different checkpoint formats if isinstance(old_checkpoint, dict): - if 'model' in old_checkpoint: - state_dict = old_checkpoint['model'] - elif 'state_dict' in old_checkpoint: - state_dict = old_checkpoint['state_dict'] + if "model" in old_checkpoint: + state_dict = old_checkpoint["model"] + elif "state_dict" in old_checkpoint: + state_dict = old_checkpoint["state_dict"] else: state_dict = old_checkpoint else: @@ -153,6 +155,7 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi return transformer + def copy_pipeline_components(vae_type: str, output_path: str): """Copy VAE, scheduler, text encoder, and tokenizer from reference pipeline.""" @@ -176,6 +179,7 @@ def copy_pipeline_components(vae_type: str, output_path: str): else: print(f"⚠ Component not found: {src_path}") + def create_model_index(vae_type: str, output_path: str): """Create model_index.json for the pipeline.""" @@ -188,33 +192,19 @@ def create_model_index(vae_type: str, output_path: str): "_class_name": "MiragePipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), - "scheduler": [ - "diffusers", - "FlowMatchEulerDiscreteScheduler" - ], - "text_encoder": [ - "transformers", - "T5GemmaEncoder" - ], - "tokenizer": [ - "transformers", - "GemmaTokenizerFast" - ], - "transformer": [ - "diffusers", - "MirageTransformer2DModel" - ], - "vae": [ - "diffusers", - vae_class - ] + "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], + "text_encoder": ["transformers", "T5GemmaEncoder"], + "tokenizer": ["transformers", "GemmaTokenizerFast"], + "transformer": ["diffusers", "MirageTransformer2DModel"], + "vae": ["diffusers", vae_class], } model_index_path = os.path.join(output_path, "model_index.json") - with open(model_index_path, 'w') as f: + with open(model_index_path, "w") as f: json.dump(model_index, f, indent=2) - print(f"✓ Created model_index.json") + print("✓ Created model_index.json") + def main(args): # Validate inputs @@ -236,7 +226,7 @@ def main(args): os.makedirs(transformer_path, exist_ok=True) # Save config - with open(os.path.join(transformer_path, "config.json"), 'w') as f: + with open(os.path.join(transformer_path, "config.json"), "w") as f: 
json.dump(config, f, indent=2) # Save model weights as safetensors @@ -253,7 +243,7 @@ def main(args): # Verify the pipeline can be loaded try: pipeline = MiragePipeline.from_pretrained(args.output_path) - print(f"Pipeline loaded successfully!") + print("Pipeline loaded successfully!") print(f"Transformer: {type(pipeline.transformer).__name__}") print(f"VAE: {type(pipeline.vae).__name__}") print(f"Text Encoder: {type(pipeline.text_encoder).__name__}") @@ -271,24 +261,18 @@ def main(args): print(f"Converted pipeline saved to: {args.output_path}") print(f"VAE type: {args.vae_type}") - return True + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Convert Mirage checkpoint to diffusers format") parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="Path to the original Mirage checkpoint (.pth file)" + "--checkpoint_path", type=str, required=True, help="Path to the original Mirage checkpoint (.pth file)" ) parser.add_argument( - "--output_path", - type=str, - required=True, - help="Output directory for the converted diffusers pipeline" + "--output_path", type=str, required=True, help="Output directory for the converted diffusers pipeline" ) parser.add_argument( @@ -296,7 +280,7 @@ def main(args): type=str, choices=["flux", "dc-ae"], required=True, - help="VAE type to use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 channels)" + help="VAE type to use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 channels)", ) args = parser.parse_args() @@ -306,7 +290,8 @@ def main(args): if not success: sys.exit(1) except Exception as e: - print(f"❌ Conversion failed: {e}") + print(f"Conversion failed: {e}") import traceback + traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 396e000524ec..923d44d4f1ec 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -13,21 +13,21 @@ # limitations under the License. 
from dataclasses import dataclass -from typing import Any, Dict, Optional, Union, Tuple +from typing import Any, Dict, Optional, Tuple, Union + import torch -import math -from torch import Tensor, nn -from torch.nn.functional import fold, unfold from einops import rearrange from einops.layers.torch import Rearrange +from torch import Tensor, nn +from torch.nn.functional import fold, unfold from ...configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin -from ..modeling_outputs import Transformer2DModelOutput -from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers -from ..normalization import RMSNorm +from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ..embeddings import get_timestep_embedding +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin +from ..normalization import RMSNorm logger = logging.get_logger(__name__) @@ -72,8 +72,6 @@ def forward(self, ids: Tensor) -> Tensor: return emb.unsqueeze(1) - - class MLPEmbedder(nn.Module): def __init__(self, in_dim: int, hidden_dim: int): super().__init__() @@ -85,8 +83,6 @@ def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) - - class QKNorm(torch.nn.Module): def __init__(self, dim: int): super().__init__() @@ -157,7 +153,6 @@ def __init__( processor=MirageAttnProcessor2_0(), ) - # mlp self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.gate_proj = nn.Linear(hidden_size, self.mlp_hidden_dim, bias=False) @@ -212,9 +207,9 @@ def attn_forward( l_txt = txt_k.shape[2] assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" - assert ( - attention_mask.shape[-1] == l_txt - ), f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + assert attention_mask.shape[-1] == l_txt, ( + f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + ) device = img_q.device @@ -234,8 +229,8 @@ def attn_forward( kv_packed = torch.cat([k, v], dim=-1) attn = self.attention( - hidden_states=img_q, - encoder_hidden_states=kv_packed, + hidden_states=img_q, + encoder_hidden_states=kv_packed, attention_mask=attn_mask, ) @@ -288,8 +283,6 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor: return x - - def img2seq(img: Tensor, patch_size: int) -> Tensor: """Flatten an image into a sequence of patches""" return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2) @@ -327,7 +320,7 @@ def __init__( time_factor: float = 1000.0, time_max_period: int = 10000, conditioning_block_ids: list = None, - **kwargs + **kwargs, ): super().__init__() @@ -447,7 +440,7 @@ def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Te embedding_dim=256, max_period=self.time_max_period, scale=self.time_factor, - flip_sin_to_cos=True # Match original cos, sin order + flip_sin_to_cos=True, # Match original cos, sin order ).to(dtype) ) @@ -470,9 +463,7 @@ def forward_transformers( vec = self.compute_timestep_embedding(timestep, dtype=img.dtype) for block in self.blocks: - img = block( - img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs - ) + img = block(img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs) img = self.final_layer(img, vec) return img diff 
--git a/src/diffusers/pipelines/mirage/__init__.py b/src/diffusers/pipelines/mirage/__init__.py index 4fd8ad191b3f..cba951057370 100644 --- a/src/diffusers/pipelines/mirage/__init__.py +++ b/src/diffusers/pipelines/mirage/__init__.py @@ -1,4 +1,5 @@ from .pipeline_mirage import MiragePipeline from .pipeline_output import MiragePipelineOutput -__all__ = ["MiragePipeline", "MiragePipelineOutput"] \ No newline at end of file + +__all__ = ["MiragePipeline", "MiragePipelineOutput"] diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index 126eab07977c..c4a4783c5f38 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import html import inspect import os -from typing import Any, Callable, Dict, List, Optional, Union - -import html import re import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union import ftfy import torch @@ -31,7 +30,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, AutoencoderDC +from ...models import AutoencoderDC, AutoencoderKL from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -41,6 +40,7 @@ from ..pipeline_utils import DiffusionPipeline from .pipeline_output import MiragePipelineOutput + try: from ...models.transformers.transformer_mirage import MirageTransformer2DModel except ImportError: @@ -55,7 +55,19 @@ class TextPreprocessor: def __init__(self): """Initialize text preprocessor.""" self.bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + r"\\" + r"\/" + r"\*" + r"]{1,}" + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + r"\\" + + r"\/" + + r"\*" + + r"]{1,}" ) def clean_text(self, text: str) -> str: @@ -93,7 +105,7 @@ def clean_text(self, text: str) -> str: ) # кавычки к одному стандарту - text = re.sub(r"[`´«»""¨]", '"', text) + text = re.sub(r"[`´«»" "¨]", '"', text) text = re.sub(r"['']", "'", text) # " and & @@ -243,9 +255,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P """ # Ensure T5GemmaEncoder is available for loading import transformers - if not hasattr(transformers, 'T5GemmaEncoder'): + + if not hasattr(transformers, "T5GemmaEncoder"): try: from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder + transformers.T5GemmaEncoder = T5GemmaEncoder except ImportError: # T5GemmaEncoder not available in this transformers version @@ -254,7 +268,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Proceed with standard loading return super().from_pretrained(pretrained_model_name_or_path, **kwargs) - def __init__( self, transformer: MirageTransformer2DModel, @@ -333,7 +346,7 @@ def _enhance_vae_properties(self): if hasattr(self.vae, "spatial_compression_ratio") and self.vae.spatial_compression_ratio == 32: self.vae.latent_channels = 32 # DC-AE default else: - self.vae.latent_channels = 4 # AutoencoderKL default + self.vae.latent_channels = 4 # AutoencoderKL default @property def vae_scale_factor(self): @@ -353,7 +366,10 @@ def prepare_latents( ): """Prepare initial latents for the diffusion process.""" if latents is 
None: - latent_height, latent_width = height // self.vae.spatial_compression_ratio, width // self.vae.spatial_compression_ratio + latent_height, latent_width = ( + height // self.vae.spatial_compression_ratio, + width // self.vae.spatial_compression_ratio, + ) shape = (batch_size, num_channels_latents, latent_height, latent_width) latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) else: @@ -424,7 +440,9 @@ def check_inputs( ): """Check that all inputs are in correct format.""" if height % self.vae.spatial_compression_ratio != 0 or width % self.vae.spatial_compression_ratio != 0: - raise ValueError(f"`height` and `width` have to be divisible by {self.vae.spatial_compression_ratio} but are {height} and {width}.") + raise ValueError( + f"`height` and `width` have to be divisible by {self.vae.spatial_compression_ratio} but are {height} and {width}." + ) if guidance_scale < 1.0: raise ValueError(f"guidance_scale has to be >= 1.0 but is {guidance_scale}") @@ -584,12 +602,16 @@ def __call__( # Forward through transformer layers img_seq = self.transformer.forward_transformers( - img_seq, txt, time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), - pe=pe, attention_mask=ca_mask + img_seq, + txt, + time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), + pe=pe, + attention_mask=ca_mask, ) # Convert back to image format from ...models.transformers.transformer_mirage import seq2img + noise_both = seq2img(img_seq, self.transformer.patch_size, latents_in.shape) # Apply CFG @@ -626,4 +648,4 @@ def __call__( if not return_dict: return (image,) - return MiragePipelineOutput(images=image) \ No newline at end of file + return MiragePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/mirage/pipeline_output.py index dfb55821d142..e41c8e3bea00 100644 --- a/src/diffusers/pipelines/mirage/pipeline_output.py +++ b/src/diffusers/pipelines/mirage/pipeline_output.py @@ -32,4 +32,4 @@ class MiragePipelineOutput(BaseOutput): num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
""" - images: Union[List[PIL.Image.Image], np.ndarray] \ No newline at end of file + images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py index 5e7b0bd165a6..0085627aa7e4 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -133,8 +133,7 @@ def test_process_inputs(self): with torch.no_grad(): img_seq, txt, pe = model.process_inputs( - inputs_dict["image_latent"], - inputs_dict["cross_attn_conditioning"] + inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) # Check shapes @@ -144,7 +143,9 @@ def test_process_inputs(self): expected_seq_len = (height // patch_size) * (width // patch_size) self.assertEqual(img_seq.shape, (batch_size, expected_seq_len, init_dict["in_channels"] * patch_size**2)) - self.assertEqual(txt.shape, (batch_size, inputs_dict["cross_attn_conditioning"].shape[1], init_dict["hidden_size"])) + self.assertEqual( + txt.shape, (batch_size, inputs_dict["cross_attn_conditioning"].shape[1], init_dict["hidden_size"]) + ) # Check that pe has the correct batch size, sequence length and some embedding dimension self.assertEqual(pe.shape[0], batch_size) # batch size self.assertEqual(pe.shape[1], 1) # unsqueeze(1) in EmbedND @@ -160,20 +161,14 @@ def test_forward_transformers(self): with torch.no_grad(): # Process inputs first img_seq, txt, pe = model.process_inputs( - inputs_dict["image_latent"], - inputs_dict["cross_attn_conditioning"] + inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) # Test forward_transformers - output_seq = model.forward_transformers( - img_seq, - txt, - timestep=inputs_dict["timestep"], - pe=pe - ) + output_seq = model.forward_transformers(img_seq, txt, timestep=inputs_dict["timestep"], pe=pe) # Check output shape - expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"]**2 + expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"] ** 2 self.assertEqual(output_seq.shape, (img_seq.shape[0], img_seq.shape[1], expected_out_channels)) def test_attention_mask(self): @@ -186,13 +181,10 @@ def test_attention_mask(self): batch_size = inputs_dict["cross_attn_conditioning"].shape[0] seq_len = inputs_dict["cross_attn_conditioning"].shape[1] attention_mask = torch.ones((batch_size, seq_len), dtype=torch.bool).to(torch_device) - attention_mask[:, seq_len//2:] = False # Mask second half + attention_mask[:, seq_len // 2 :] = False # Mask second half with torch.no_grad(): - outputs = model( - **inputs_dict, - cross_attn_mask=attention_mask - ) + outputs = model(**inputs_dict, cross_attn_mask=attention_mask) self.assertIsNotNone(outputs) expected_shape = inputs_dict["image_latent"].shape @@ -237,7 +229,7 @@ def test_gradient_checkpointing_enable(self): # Check that _activation_checkpointing is set for block in model.blocks: - self.assertTrue(hasattr(block, '_activation_checkpointing')) + self.assertTrue(hasattr(block, "_activation_checkpointing")) def test_from_config(self): init_dict, _ = self.prepare_init_args_and_inputs_for_common() @@ -249,4 +241,4 @@ def test_from_config(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 9c0944581a386736bc808e68d7dfb52d8cf1790e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 26 Sep 2025 21:50:16 +0530 Subject: [PATCH 10/69] [docs] slight edits to the attention backends docs. 
(#12394) * slight edits to the attention backends docs. * Update docs/source/en/optimization/attention_backends.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/optimization/attention_backends.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/en/optimization/attention_backends.md b/docs/source/en/optimization/attention_backends.md index 04c8b4ba921c..e603878a6383 100644 --- a/docs/source/en/optimization/attention_backends.md +++ b/docs/source/en/optimization/attention_backends.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # Attention backends -> [!TIP] +> [!NOTE] > The attention dispatcher is an experimental feature. Please open an issue if you have any feedback or encounter any problems. Diffusers provides several optimized attention algorithms that are more memory and computationally efficient through it's *attention dispatcher*. The dispatcher acts as a router for managing and switching between different attention implementations and provides a unified interface for interacting with them. @@ -33,7 +33,7 @@ The [`~ModelMixin.set_attention_backend`] method iterates through all the module The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [kernel](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup. -> [!TIP] +> [!NOTE] > FlashAttention-3 is not supported for non-Hopper architectures, in which case, use FlashAttention with `set_attention_backend("flash")`. ```py @@ -78,10 +78,16 @@ with attention_backend("_flash_3_hub"): image = pipeline(prompt).images[0] ``` +> [!TIP] +> Most attention backends support `torch.compile` without graph breaks and can be used to further speed up inference. + ## Available backends Refer to the table below for a complete list of available attention backends and their variants. +
+<details>
+<summary>Expand</summary>
+
 | Backend Name | Family | Description |
 |--------------|--------|-------------|
 | `native` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Default backend using PyTorch's scaled_dot_product_attention |
@@ -104,3 +110,5 @@ Refer to the table below for a complete list of available attention backends and
 | `_sage_qk_int8_pv_fp16_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (CUDA) |
 | `_sage_qk_int8_pv_fp16_triton` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (Triton) |
 | `xformers` | [xFormers](https://github.com/facebookresearch/xformers) | Memory-efficient attention |
+
+</details>
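
Editor's note (not part of the patch): as a minimal usage sketch of the dispatcher workflow this page describes, a backend chosen with `set_attention_backend` can be combined with `torch.compile` as the new tip suggests. The checkpoint, the availability of the `flash` backend (FlashAttention installed), and the compile settings below are assumptions for illustration only.

```py
# Illustrative sketch only: route attention through a dispatcher backend, then compile.
# Assumes a Flux checkpoint can be downloaded and FlashAttention is installed locally.
import torch
from diffusers import FluxPipeline

pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Switch every attention module in the transformer to the FlashAttention backend.
pipeline.transformer.set_attention_backend("flash")

# Per the note above, most backends work with torch.compile without graph breaks.
pipeline.transformer = torch.compile(pipeline.transformer, fullgraph=True)

image = pipeline("a photo of a cat", num_inference_steps=28).images[0]
```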
\ No newline at end of file From 041501aea92919c9c7f36e189fc9cf7d865ebb96 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 26 Sep 2025 22:38:43 +0530 Subject: [PATCH 11/69] [docs] remove docstrings from repeated methods in `lora_pipeline.py` (#12393) * start unbloating docstrings (save_lora_weights). * load_lora_weights() * lora_state_dict * fuse_lora * unfuse_lora * load_lora_into_transformer --- src/diffusers/loaders/lora_pipeline.py | 2234 ++---------------------- 1 file changed, 107 insertions(+), 2127 deletions(-) diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index 8060b519f147..65bdae692070 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -621,33 +621,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and - `self.text_encoder`. - - All kwargs are forwarded to `self.lora_state_dict`. - - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is - loaded. - - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is - loaded into `self.unet`. - - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state - dict is loaded into `self.text_encoder`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -967,35 +941,7 @@ def save_lora_weights( text_encoder_2_lora_adapter_metadata=None, ): r""" - Save the LoRA parameters corresponding to the UNet and text encoder. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `unet`. - text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. 
- save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - unet_lora_adapter_metadata: - LoRA adapter metadata associated with the unet to be serialized with the state dict. - text_encoder_lora_adapter_metadata: - LoRA adapter metadata associated with the text encoder to be serialized with the state dict. - text_encoder_2_lora_adapter_metadata: - LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -1036,35 +982,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -1076,21 +994,7 @@ def fuse_lora( def unfuse_lora(self, components: List[str] = ["unet", "text_encoder", "text_encoder_2"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. - unfuse_text_encoder (`bool`, defaults to `True`): - Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the - LoRA parameters then it won't have any effect. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -1116,51 +1020,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. 
- - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -1214,30 +1074,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and - `self.text_encoder`. - - All kwargs are forwarded to `self.lora_state_dict`. - - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is - loaded. - - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -1306,26 +1143,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. 
- - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`SD3Transformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -1420,35 +1238,7 @@ def save_lora_weights( text_encoder_2_lora_adapter_metadata=None, ): r""" - Save the LoRA parameters corresponding to the UNet and text encoder. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. - text_encoder_lora_adapter_metadata: - LoRA adapter metadata associated with the text encoder to be serialized with the state dict. - text_encoder_2_lora_adapter_metadata: - LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -1490,35 +1280,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. 
- - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -1531,21 +1293,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.StableDiffusionXLLoraLoaderMixin.unfuse_lora with unet->transformer def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder", "text_encoder_2"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. - unfuse_text_encoder (`bool`, defaults to `True`): - Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the - LoRA parameters then it won't have any effect. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -1567,51 +1315,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. 
- token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -1666,25 +1370,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -1730,26 +1416,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`AuraFlowTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. 
""" if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -1781,25 +1448,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -1831,35 +1480,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -1872,18 +1493,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -1910,50 +1520,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. 
- - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -2207,30 +1774,7 @@ def load_lora_into_transformer( hotswap: bool = False, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - network_alphas (`Dict[str, float]`): - The value of the network alpha used for stable learning and preventing underflow. This value has the - same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this - link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning). - transformer (`FluxTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. 
-            hotswap (`bool`, *optional*):
-                See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`].
-            metadata (`dict`):
-                Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived
-                from the state dict.
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details.
         """
         if low_cpu_mem_usage and not is_peft_version(">=", "0.13.1"):
             raise ValueError(
@@ -2435,35 +1979,7 @@ def fuse_lora(
         **kwargs,
     ):
         r"""
-        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
-
-
-
-        This is an experimental API.
-
-
-
-        Args:
-            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into.
-            lora_scale (`float`, defaults to 1.0):
-                Controls how much to influence the outputs with the LoRA parameters.
-            safe_fusing (`bool`, defaults to `False`):
-                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
-            adapter_names (`List[str]`, *optional*):
-                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
-
-        Example:
-
-        ```py
-        from diffusers import DiffusionPipeline
-        import torch
-
-        pipeline = DiffusionPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-        ).to("cuda")
-        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
-        pipeline.fuse_lora(lora_scale=0.7)
-        ```
+        See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details.
         """
         transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer
@@ -2806,30 +2322,7 @@ def load_lora_into_transformer(
         hotswap: bool = False,
     ):
         """
-        This will load the LoRA layers specified in `state_dict` into `transformer`.
-
-        Parameters:
-            state_dict (`dict`):
-                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
-                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
-                encoder lora layers.
-            network_alphas (`Dict[str, float]`):
-                The value of the network alpha used for stable learning and preventing underflow. This value has the
-                same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
-                link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
-            transformer (`UVit2DModel`):
-                The Transformer model to load the LoRA layers into.
-            adapter_name (`str`, *optional*):
-                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
-                `default_{i}` where i is the total number of adapters being loaded.
-            low_cpu_mem_usage (`bool`, *optional*):
-                Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
-                weights.
-            hotswap (`bool`, *optional*):
-                See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`].
-            metadata (`dict`):
-                Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived
-                from the state dict.
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details.
         """
         if low_cpu_mem_usage and not is_peft_version(">=", "0.13.1"):
             raise ValueError(
@@ -2979,51 +2472,7 @@ def lora_state_dict(
         **kwargs,
     ):
         r"""
-        Return state dict for lora weights and the network alphas.
-
-
-
-        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
- - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -3077,25 +2526,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. 
""" if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -3141,26 +2572,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`CogVideoXTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3180,7 +2592,6 @@ def load_lora_into_transformer( ) @classmethod - # Adapted from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.save_lora_weights without support for text encoder def save_lora_weights( cls, save_directory: Union[str, os.PathLike], @@ -3192,25 +2603,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -3241,35 +2634,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. 
- safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -3281,18 +2646,7 @@ def fuse_lora( def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -3314,51 +2668,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. 
- return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -3413,25 +2723,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -3477,26 +2769,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`MochiTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3528,25 +2801,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. 
- is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -3578,35 +2833,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -3619,20 +2846,9 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. - """ - super().unfuse_lora(components=components, **kwargs) + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. + """ + super().unfuse_lora(components=components, **kwargs) class LTXVideoLoraLoaderMixin(LoraBaseMixin): @@ -3651,50 +2867,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. 
- - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -3753,25 +2926,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -3817,26 +2972,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. 
- - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`LTXVideoTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3868,25 +3004,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -3918,35 +3036,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. 
- - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -3959,18 +3049,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -3992,51 +3071,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. 
""" # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -4091,25 +3126,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -4155,26 +3172,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`SanaTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4206,25 +3204,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. 
In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -4256,59 +3236,20 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` - """ - super().fuse_lora( - components=components, - lora_scale=lora_scale, - safe_fusing=safe_fusing, - adapter_names=adapter_names, - **kwargs, - ) + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. + """ + super().fuse_lora( + components=components, + lora_scale=lora_scale, + safe_fusing=safe_fusing, + adapter_names=adapter_names, + **kwargs, + ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -4329,50 +3270,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading original format HunyuanVideo LoRA checkpoints. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. 
- - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -4431,25 +3329,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -4495,26 +3375,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. 
The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`HunyuanVideoTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4546,25 +3407,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -4596,35 +3439,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. 
- - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -4637,18 +3452,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -4669,50 +3473,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. 
""" # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -4772,25 +3533,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -4836,26 +3579,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`Lumina2Transformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4887,25 +3611,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. 
In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -4937,35 +3643,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -4978,18 +3656,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -5010,50 +3677,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). 
- - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -5159,25 +3783,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -5247,26 +3853,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. 
- transformer (`WanTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -5298,25 +3885,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -5348,35 +3917,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. 
""" super().fuse_lora( components=components, @@ -5389,18 +3930,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -5422,50 +3952,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -5573,25 +4060,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. 
See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -5661,26 +4130,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`SkyReelsV2Transformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -5712,25 +4162,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. 
- safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -5762,35 +4194,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -5803,18 +4207,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -5836,51 +4229,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. 
- - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -5935,25 +4284,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -5999,26 +4330,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`CogView4Transformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. 
- low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -6050,25 +4362,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -6100,35 +4394,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -6141,18 +4407,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). 
- - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -6162,61 +4417,18 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): Load LoRA layers into [`HiDreamImageTransformer2DModel`]. Specific to [`HiDreamImagePipeline`]. """ - _lora_loadable_modules = ["transformer"] - transformer_name = TRANSFORMER_NAME - - @classmethod - @validate_hf_hub_args - def lora_state_dict( - cls, - pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], - **kwargs, - ): - r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + _lora_loadable_modules = ["transformer"] + transformer_name = TRANSFORMER_NAME + + @classmethod + @validate_hf_hub_args + def lora_state_dict( + cls, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + r""" + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. 
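The long-form docstrings collapsed throughout this file all redirect to the same `StableDiffusionLoraLoaderMixin` entry points, so the shared workflow is worth keeping in view while reading the remaining hunks. The sketch below reuses the example that the removed `fuse_lora` docstrings carried and adds the matching teardown calls; the model id, weight file, and adapter name are illustrative placeholders, not values introduced by this patch.

```py
# A minimal sketch of the consolidated LoRA loader workflow.
# The checkpoint, weight file, and adapter name are illustrative placeholders.
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Fetch and inject the LoRA layers; extra kwargs are forwarded to `lora_state_dict`.
pipeline.load_lora_weights(
    "nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel"
)

# Optionally merge the adapter into the base weights for faster inference ...
pipeline.fuse_lora(lora_scale=0.7)

# ... then undo the merge and drop the adapter once it is no longer needed.
pipeline.unfuse_lora()
pipeline.unload_lora_weights()
```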
@@ -6275,25 +4487,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -6339,26 +4533,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`HiDreamImageTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -6390,25 +4565,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. 
Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -6440,35 +4597,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -6481,18 +4610,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -6513,51 +4631,7 @@ def lora_state_dict( **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. 
- force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -6618,25 +4692,7 @@ def load_lora_weights( **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -6682,26 +4738,7 @@ def load_lora_into_transformer( metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`QwenImageTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. 
If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -6733,25 +4770,7 @@ def save_lora_weights( transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -6783,35 +4802,7 @@ def fuse_lora( **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. 
""" super().fuse_lora( components=components, @@ -6824,18 +4815,7 @@ def fuse_lora( # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) From 19085ac8f4947091b1e6b3ca980153eadc12c653 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 29 Sep 2025 13:08:05 +0530 Subject: [PATCH 12/69] Don't skip Qwen model tests for group offloading with disk (#12382) u[ --- tests/models/test_modeling_common.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 3a008edfe1c2..90ded6a7ecb2 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -1793,11 +1793,6 @@ def test_group_offloading_with_disk(self, offload_type, record_stream, atol=1e-5 if not self.model_class._supports_group_offloading: pytest.skip("Model does not support group offloading.") - if self.model_class.__name__ == "QwenImageTransformer2DModel": - pytest.skip( - "QwenImageTransformer2DModel doesn't support group offloading with disk. Needs to be investigated." - ) - def _has_generator_arg(model): sig = inspect.signature(model.forward) params = sig.parameters From 0a151115bbe493de74a4565e57352a0890e94777 Mon Sep 17 00:00:00 2001 From: Akshay Babbar Date: Mon, 29 Sep 2025 14:20:05 +0530 Subject: [PATCH 13/69] Fix #12116: preserve boolean dtype for attention masks in ChromaPipeline (#12263) * fix: preserve boolean dtype for attention masks in ChromaPipeline - Convert attention masks to bool and prevent dtype corruption - Fix both positive and negative mask handling in _get_t5_prompt_embeds - Remove float conversion in _prepare_attention_mask method Fixes #12116 * test: add ChromaPipeline attention mask dtype tests * test: add slow ChromaPipeline attention mask tests * chore: removed comments * refactor: removing redundant type conversion * Remove dedicated dtype tests as per feedback --------- Co-authored-by: Dhruv Nair --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 19ea7729c9d9..5482035b3afb 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -238,7 +238,7 @@ def _get_t5_prompt_embeds( # Chroma requires the attention mask to include one padding token seq_lengths = attention_mask.sum(dim=1) mask_indices = torch.arange(attention_mask.size(1)).unsqueeze(0).expand(batch_size, -1) - attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).long() + attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).bool() prompt_embeds = self.text_encoder( text_input_ids.to(device), output_hidden_states=False, attention_mask=attention_mask.to(device) @@ -246,7 +246,7 @@ def _get_t5_prompt_embeds( dtype = self.text_encoder.dtype prompt_embeds = 
prompt_embeds.to(dtype=dtype, device=device) - attention_mask = attention_mask.to(dtype=dtype, device=device) + attention_mask = attention_mask.to(device=device) _, seq_len, _ = prompt_embeds.shape @@ -605,10 +605,9 @@ def _prepare_attention_mask( # Extend the prompt attention mask to account for image tokens in the final sequence attention_mask = torch.cat( - [attention_mask, torch.ones(batch_size, sequence_length, device=attention_mask.device)], + [attention_mask, torch.ones(batch_size, sequence_length, device=attention_mask.device, dtype=torch.bool)], dim=1, ) - attention_mask = attention_mask.to(dtype) return attention_mask From 64a5187d96f9376c7cf5123db810f2d2da79d7d0 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 29 Sep 2025 18:04:18 +0530 Subject: [PATCH 14/69] [quantization] feat: support aobaseconfig classes in `TorchAOConfig` (#12275) * feat: support aobaseconfig classes. * [docs] AOBaseConfig (#12302) init Co-authored-by: Sayak Paul * up * replace with is_torchao_version * up * up --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/torchao.md | 105 ++++++----- .../quantizers/quantization_config.py | 165 +++++++++++++++--- .../quantizers/torchao/torchao_quantizer.py | 92 +++++++--- tests/quantization/torchao/test_torchao.py | 22 +++ 4 files changed, 295 insertions(+), 89 deletions(-) diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 5c7578dcbb4e..18cc109e0785 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -11,69 +11,96 @@ specific language governing permissions and limitations under the License. --> # torchao -[TorchAO](https://github.com/pytorch/ao) is an architecture optimization library for PyTorch. It provides high-performance dtypes, optimization techniques, and kernels for inference and training, featuring composability with native PyTorch features like [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html), FullyShardedDataParallel (FSDP), and more. +[torchao](https://github.com/pytorch/ao) provides high-performance dtypes and optimizations based on quantization and sparsity for inference and training PyTorch models. It is supported for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers. -Before you begin, make sure you have Pytorch 2.5+ and TorchAO installed. +Make sure Pytorch 2.5+ and torchao are installed with the command below. ```bash -pip install -U torch torchao +uv pip install -U torch torchao ``` +Each quantization dtype is available as a separate instance of a [AOBaseConfig](https://docs.pytorch.org/ao/main/api_ref_quantization.html#inference-apis-for-quantize) class. This provides more flexible configuration options by exposing more available arguments. -Quantize a model by passing [`TorchAoConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers. +Pass the `AOBaseConfig` of a quantization dtype, like [Int4WeightOnlyConfig](https://docs.pytorch.org/ao/main/generated/torchao.quantization.Int4WeightOnlyConfig) to [`TorchAoConfig`] in [`~ModelMixin.from_pretrained`]. -The example below only quantizes the weights to int8. 
- -```python +```py import torch -from diffusers import FluxPipeline, AutoModel, TorchAoConfig - -model_id = "black-forest-labs/FLUX.1-dev" -dtype = torch.bfloat16 +from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConfig +from torchao.quantization import Int8WeightOnlyConfig -quantization_config = TorchAoConfig("int8wo") -transformer = AutoModel.from_pretrained( - model_id, - subfolder="transformer", - quantization_config=quantization_config, - torch_dtype=dtype, +pipeline_quant_config = PipelineQuantizationConfig( + quant_mapping={"transformer": TorchAoConfig(Int8WeightOnlyConfig(group_size=128))} ) -pipe = FluxPipeline.from_pretrained( - model_id, - transformer=transformer, - torch_dtype=dtype, +pipeline = DiffusionPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + quantization_config=pipeline_quant_config, + torch_dtype=torch.bfloat16, + device_map="cuda" ) -pipe.to("cuda") +``` -# Without quantization: ~31.447 GB -# With quantization: ~20.40 GB -print(f"Pipeline memory usage: {torch.cuda.max_memory_reserved() / 1024**3:.3f} GB") +For simple use cases, you could also provide a string identifier in [`TorchAoConfig`] as shown below. -prompt = "A cat holding a sign that says hello world" -image = pipe( - prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512 -).images[0] -image.save("output.png") +```py +import torch +from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConfig + +pipeline_quant_config = PipelineQuantizationConfig( + quant_mapping={"transformer": TorchAoConfig("int8wo")} +) +pipeline = DiffusionPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + quantization_config=pipeline_quant_config, + torch_dtype=torch.bfloat16, + device_map="cuda" +) ``` -TorchAO is fully compatible with [torch.compile](../optimization/fp16#torchcompile), setting it apart from other quantization methods. This makes it easy to speed up inference with just one line of code. +## torch.compile + +torchao supports [torch.compile](../optimization/fp16#torchcompile) which can speed up inference with one line of code. ```python -# In the above code, add the following after initializing the transformer -transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True) +import torch +from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConfig +from torchao.quantization import Int4WeightOnlyConfig + +pipeline_quant_config = PipelineQuantizationConfig( + quant_mapping={"transformer": TorchAoConfig(Int4WeightOnlyConfig(group_size=128))} +) +pipeline = DiffusionPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + quantization_config=pipeline_quant_config, + torch_dtype=torch.bfloat16, + device_map="cuda" +) + +pipeline.transformer.compile(mode="max-autotune", fullgraph=True) ``` -For speed and memory benchmarks on Flux and CogVideoX, please refer to the table [here](https://github.com/huggingface/diffusers/pull/10009#issue-2688781450). You can also find some torchao [benchmarks](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks) numbers for various hardware. +Refer to this [table](https://github.com/huggingface/diffusers/pull/10009#issue-2688781450) for inference speed and memory usage benchmarks with Flux and CogVideoX. More benchmarks on various hardware are also available in the torchao [repository](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks).
> [!TIP] > The FP8 post-training quantization schemes in torchao are effective for GPUs with compute capability of at least 8.9 (RTX-4090, Hopper, etc.). FP8 often provides the best speed, memory, and quality trade-off when generating images and videos. We recommend combining FP8 and torch.compile if your GPU is compatible. -torchao also supports an automatic quantization API through [autoquant](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#autoquantization). Autoquantization determines the best quantization strategy applicable to a model by comparing the performance of each technique on chosen input types and shapes. Currently, this can be used directly on the underlying modeling components. Diffusers will also expose an autoquant configuration option in the future. +## autoquant + +torchao provides [autoquant](https://docs.pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant) an automatic quantization API. Autoquantization chooses the best quantization strategy by comparing the performance of each strategy on chosen input types and shapes. This is only supported in Diffusers for individual models at the moment. + +```py +import torch +from diffusers import DiffusionPipeline +from torchao.quantization import autoquant + +# Load the pipeline +pipeline = DiffusionPipeline.from_pretrained( + "black-forest-labs/FLUX.1-schnell", + torch_dtype=torch.bfloat16, + device_map="cuda" +) -The `TorchAoConfig` class accepts three parameters: -- `quant_type`: A string value mentioning one of the quantization types below. -- `modules_to_not_convert`: A list of module full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`FluxTransformer2DModel`]'s first block, one would specify: `modules_to_not_convert=["single_transformer_blocks.0"]`. -- `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`. +transformer = autoquant(pipeline.transformer) +``` ## Supported quantization types diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index bf857956512c..5dd8f56717df 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -21,19 +21,20 @@ """ import copy +import dataclasses import importlib.metadata import inspect import json import os import warnings -from dataclasses import dataclass +from dataclasses import dataclass, is_dataclass from enum import Enum from functools import partial from typing import Any, Callable, Dict, List, Optional, Union from packaging import version -from ..utils import is_torch_available, is_torchao_available, logging +from ..utils import is_torch_available, is_torchao_available, is_torchao_version, logging if is_torch_available(): @@ -443,7 +444,7 @@ class TorchAoConfig(QuantizationConfigMixin): """This is a config class for torchao quantization/sparsity techniques. 
Args: - quant_type (`str`): + quant_type (Union[`str`, AOBaseConfig]): The type of quantization we want to use, currently supporting: - **Integer quantization:** - Full function names: `int4_weight_only`, `int8_dynamic_activation_int4_weight`, @@ -465,6 +466,7 @@ class TorchAoConfig(QuantizationConfigMixin): - **Unsigned Integer quantization:** - Full function names: `uintx_weight_only` - Shorthands: `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo` + - An AOBaseConfig instance: for more advanced configuration options. modules_to_not_convert (`List[str]`, *optional*, default to `None`): The list of modules to not quantize, useful for quantizing models that explicitly require to have some modules left in their original precision. @@ -478,6 +480,12 @@ class TorchAoConfig(QuantizationConfigMixin): ```python from diffusers import FluxTransformer2DModel, TorchAoConfig + # AOBaseConfig-based configuration + from torchao.quantization import Int8WeightOnlyConfig + + quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) + + # String-based config quantization_config = TorchAoConfig("int8wo") transformer = FluxTransformer2DModel.from_pretrained( "black-forest-labs/Flux.1-Dev", @@ -490,7 +498,7 @@ class TorchAoConfig(QuantizationConfigMixin): def __init__( self, - quant_type: str, + quant_type: Union[str, "AOBaseConfig"], # noqa: F821 modules_to_not_convert: Optional[List[str]] = None, **kwargs, ) -> None: @@ -504,34 +512,103 @@ def __init__( else: self.quant_type_kwargs = kwargs - TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method() - if self.quant_type not in TORCHAO_QUANT_TYPE_METHODS.keys(): - is_floating_quant_type = self.quant_type.startswith("float") or self.quant_type.startswith("fp") - if is_floating_quant_type and not self._is_xpu_or_cuda_capability_atleast_8_9(): + self.post_init() + + def post_init(self): + if not isinstance(self.quant_type, str): + if is_torchao_version("<=", "0.9.0"): raise ValueError( - f"Requested quantization type: {self.quant_type} is not supported on GPUs with CUDA capability <= 8.9. You " - f"can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`." + f"torchao <= 0.9.0 only supports string quant_type, got {type(self.quant_type).__name__}. " + f"Upgrade to torchao > 0.9.0 to use AOBaseConfig." ) - raise ValueError( - f"Requested quantization type: {self.quant_type} is not supported or is an incorrect `quant_type` name. If you think the " - f"provided quantization type should be supported, please open an issue at https://github.com/huggingface/diffusers/issues." - ) + from torchao.quantization.quant_api import AOBaseConfig - method = TORCHAO_QUANT_TYPE_METHODS[self.quant_type] - signature = inspect.signature(method) - all_kwargs = { - param.name - for param in signature.parameters.values() - if param.kind in [inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD] - } - unsupported_kwargs = list(self.quant_type_kwargs.keys() - all_kwargs) + if not isinstance(self.quant_type, AOBaseConfig): + raise TypeError(f"quant_type must be a AOBaseConfig instance, got {type(self.quant_type).__name__}") - if len(unsupported_kwargs) > 0: - raise ValueError( - f'The quantization method "{quant_type}" does not support the following keyword arguments: ' - f"{unsupported_kwargs}. The following keywords arguments are supported: {all_kwargs}." 
- ) + elif isinstance(self.quant_type, str): + TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method() + + if self.quant_type not in TORCHAO_QUANT_TYPE_METHODS.keys(): + is_floating_quant_type = self.quant_type.startswith("float") or self.quant_type.startswith("fp") + if is_floating_quant_type and not self._is_xpu_or_cuda_capability_atleast_8_9(): + raise ValueError( + f"Requested quantization type: {self.quant_type} is not supported on GPUs with CUDA capability <= 8.9. You " + f"can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`." + ) + + raise ValueError( + f"Requested quantization type: {self.quant_type} is not supported or is an incorrect `quant_type` name. If you think the " + f"provided quantization type should be supported, please open an issue at https://github.com/huggingface/diffusers/issues." + ) + + method = TORCHAO_QUANT_TYPE_METHODS[self.quant_type] + signature = inspect.signature(method) + all_kwargs = { + param.name + for param in signature.parameters.values() + if param.kind in [inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD] + } + unsupported_kwargs = list(self.quant_type_kwargs.keys() - all_kwargs) + + if len(unsupported_kwargs) > 0: + raise ValueError( + f'The quantization method "{self.quant_type}" does not support the following keyword arguments: ' + f"{unsupported_kwargs}. The following keywords arguments are supported: {all_kwargs}." + ) + + def to_dict(self): + """Convert configuration to a dictionary.""" + d = super().to_dict() + + if isinstance(self.quant_type, str): + # Handle layout serialization if present + if "quant_type_kwargs" in d and "layout" in d["quant_type_kwargs"]: + if is_dataclass(d["quant_type_kwargs"]["layout"]): + d["quant_type_kwargs"]["layout"] = [ + d["quant_type_kwargs"]["layout"].__class__.__name__, + dataclasses.asdict(d["quant_type_kwargs"]["layout"]), + ] + if isinstance(d["quant_type_kwargs"]["layout"], list): + assert len(d["quant_type_kwargs"]["layout"]) == 2, "layout saves layout name and layout kwargs" + assert isinstance(d["quant_type_kwargs"]["layout"][0], str), "layout name must be a string" + assert isinstance(d["quant_type_kwargs"]["layout"][1], dict), "layout kwargs must be a dict" + else: + raise ValueError("layout must be a list") + else: + # Handle AOBaseConfig serialization + from torchao.core.config import config_to_dict + + # For now we assume there is 1 config per Transformer, however in the future + # We may want to support a config per fqn. 
+ d["quant_type"] = {"default": config_to_dict(self.quant_type)} + + return d + + @classmethod + def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): + """Create configuration from a dictionary.""" + if not is_torchao_version(">", "0.9.0"): + raise NotImplementedError("TorchAoConfig requires torchao > 0.9.0 for construction from dict") + config_dict = config_dict.copy() + quant_type = config_dict.pop("quant_type") + + if isinstance(quant_type, str): + return cls(quant_type=quant_type, **config_dict) + # Check if we only have one key which is "default" + # In the future we may update this + assert len(quant_type) == 1 and "default" in quant_type, ( + "Expected only one key 'default' in quant_type dictionary" + ) + quant_type = quant_type["default"] + + # Deserialize quant_type if needed + from torchao.core.config import config_from_dict + + quant_type = config_from_dict(quant_type) + + return cls(quant_type=quant_type, **config_dict) @classmethod def _get_torchao_quant_type_to_method(cls): @@ -681,8 +758,38 @@ def _is_xpu_or_cuda_capability_atleast_8_9() -> bool: raise RuntimeError("TorchAO requires a CUDA compatible GPU or Intel XPU and installation of PyTorch.") def get_apply_tensor_subclass(self): - TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method() - return TORCHAO_QUANT_TYPE_METHODS[self.quant_type](**self.quant_type_kwargs) + """Create the appropriate quantization method based on configuration.""" + if not isinstance(self.quant_type, str): + return self.quant_type + else: + methods = self._get_torchao_quant_type_to_method() + quant_type_kwargs = self.quant_type_kwargs.copy() + if ( + not torch.cuda.is_available() + and is_torchao_available() + and self.quant_type == "int4_weight_only" + and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0") + and quant_type_kwargs.get("layout", None) is None + ): + if torch.xpu.is_available(): + if version.parse(importlib.metadata.version("torchao")) >= version.parse( + "0.11.0" + ) and version.parse(importlib.metadata.version("torch")) > version.parse("2.7.9"): + from torchao.dtypes import Int4XPULayout + from torchao.quantization.quant_primitives import ZeroPointDomain + + quant_type_kwargs["layout"] = Int4XPULayout() + quant_type_kwargs["zero_point_domain"] = ZeroPointDomain.INT + else: + raise ValueError( + "TorchAoConfig requires torchao >= 0.11.0 and torch >= 2.8.0 for XPU support. Please upgrade the version or use run on CPU with the cpu version pytorch." + ) + else: + from torchao.dtypes import Int4CPULayout + + quant_type_kwargs["layout"] = Int4CPULayout() + + return methods[self.quant_type](**quant_type_kwargs) def __repr__(self): r""" diff --git a/src/diffusers/quantizers/torchao/torchao_quantizer.py b/src/diffusers/quantizers/torchao/torchao_quantizer.py index 976bc8a1e0e5..2334c7af8630 100644 --- a/src/diffusers/quantizers/torchao/torchao_quantizer.py +++ b/src/diffusers/quantizers/torchao/torchao_quantizer.py @@ -18,9 +18,10 @@ """ import importlib +import re import types from fnmatch import fnmatch -from typing import TYPE_CHECKING, Any, Dict, List, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from packaging import version @@ -107,6 +108,21 @@ def _update_torch_safe_globals(): _update_torch_safe_globals() +def fuzzy_match_size(config_name: str) -> Optional[str]: + """ + Extract the size digit from strings like "4weight", "8weight". Returns the digit as an integer if found, otherwise + None. 
+ """ + config_name = config_name.lower() + + str_match = re.search(r"(\d)weight", config_name) + + if str_match: + return str_match.group(1) + + return None + + logger = logging.get_logger(__name__) @@ -176,8 +192,7 @@ def validate_environment(self, *args, **kwargs): def update_torch_dtype(self, torch_dtype): quant_type = self.quantization_config.quant_type - - if quant_type.startswith("int") or quant_type.startswith("uint"): + if isinstance(quant_type, str) and (quant_type.startswith("int") or quant_type.startswith("uint")): if torch_dtype is not None and torch_dtype != torch.bfloat16: logger.warning( f"You are trying to set torch_dtype to {torch_dtype} for int4/int8/uintx quantization, but " @@ -197,24 +212,44 @@ def update_torch_dtype(self, torch_dtype): def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": quant_type = self.quantization_config.quant_type - - if quant_type.startswith("int8") or quant_type.startswith("int4"): - # Note that int4 weights are created by packing into torch.int8, but since there is no torch.int4, we use torch.int8 - return torch.int8 - elif quant_type == "uintx_weight_only": - return self.quantization_config.quant_type_kwargs.get("dtype", torch.uint8) - elif quant_type.startswith("uint"): - return { - 1: torch.uint1, - 2: torch.uint2, - 3: torch.uint3, - 4: torch.uint4, - 5: torch.uint5, - 6: torch.uint6, - 7: torch.uint7, - }[int(quant_type[4])] - elif quant_type.startswith("float") or quant_type.startswith("fp"): - return torch.bfloat16 + from accelerate.utils import CustomDtype + + if isinstance(quant_type, str): + if quant_type.startswith("int8"): + # Note that int4 weights are created by packing into torch.int8, but since there is no torch.int4, we use torch.int8 + return torch.int8 + elif quant_type.startswith("int4"): + return CustomDtype.INT4 + elif quant_type == "uintx_weight_only": + return self.quantization_config.quant_type_kwargs.get("dtype", torch.uint8) + elif quant_type.startswith("uint"): + return { + 1: torch.uint1, + 2: torch.uint2, + 3: torch.uint3, + 4: torch.uint4, + 5: torch.uint5, + 6: torch.uint6, + 7: torch.uint7, + }[int(quant_type[4])] + elif quant_type.startswith("float") or quant_type.startswith("fp"): + return torch.bfloat16 + + elif is_torchao_version(">", "0.9.0"): + from torchao.core.config import AOBaseConfig + + quant_type = self.quantization_config.quant_type + if isinstance(quant_type, AOBaseConfig): + # Extract size digit using fuzzy match on the class name + config_name = quant_type.__class__.__name__ + size_digit = fuzzy_match_size(config_name) + + # Map the extracted digit to appropriate dtype + if size_digit == "4": + return CustomDtype.INT4 + else: + # Default to int8 + return torch.int8 if isinstance(target_dtype, SUPPORTED_TORCH_DTYPES_FOR_QUANTIZATION): return target_dtype @@ -297,6 +332,21 @@ def get_cuda_warm_up_factor(self): # Original mapping for non-AOBaseConfig types # For the uint types, this is a best guess. Once these types become more used # we can look into their nuances. 
+ if is_torchao_version(">", "0.9.0"): + from torchao.core.config import AOBaseConfig + + quant_type = self.quantization_config.quant_type + # For autoquant case, it will be treated in the string implementation below in map_to_target_dtype + if isinstance(quant_type, AOBaseConfig): + # Extract size digit using fuzzy match on the class name + config_name = quant_type.__class__.__name__ + size_digit = fuzzy_match_size(config_name) + + if size_digit == "4": + return 8 + else: + return 4 + map_to_target_dtype = {"int4_*": 8, "int8_*": 4, "uint*": 8, "float8*": 4} quant_type = self.quantization_config.quant_type for pattern, target_dtype in map_to_target_dtype.items(): diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 920c3a55f56c..38997de17b12 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -14,11 +14,13 @@ # limitations under the License. import gc +import importlib.metadata import tempfile import unittest from typing import List import numpy as np +from packaging import version from parameterized import parameterized from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel @@ -65,6 +67,9 @@ from torchao.quantization.quant_primitives import MappingType from torchao.utils import get_model_size_in_bytes + if version.parse(importlib.metadata.version("torchao")) >= version.Version("0.9.0"): + from torchao.quantization import Int8WeightOnlyConfig + @require_torch @require_torch_accelerator @@ -522,6 +527,15 @@ def test_sequential_cpu_offload(self): inputs = self.get_dummy_inputs(torch_device) _ = pipe(**inputs) + @require_torchao_version_greater_or_equal("0.9.0") + def test_aobase_config(self): + quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) + components = self.get_dummy_components(quantization_config) + pipe = FluxPipeline(**components).to(torch_device) + + inputs = self.get_dummy_inputs(torch_device) + _ = pipe(**inputs) + # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners @require_torch @@ -628,6 +642,14 @@ def test_int_a16w8_cpu(self): self._test_original_model_expected_slice(quant_method, quant_method_kwargs, expected_slice) self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device) + @require_torchao_version_greater_or_equal("0.9.0") + def test_aobase_config(self): + quant_method, quant_method_kwargs = Int8WeightOnlyConfig(), {} + expected_slice = np.array([0.3613, -0.127, -0.0223, -0.2539, -0.459, 0.4961, -0.1357, -0.6992, 0.4551]) + device = torch_device + self._test_original_model_expected_slice(quant_method, quant_method_kwargs, expected_slice) + self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device) + @require_torchao_version_greater_or_equal("0.7.0") class TorchAoCompileTest(QuantCompileTests, unittest.TestCase): From ccedeca96e9aebd3e0e663e668891bea9de30dbc Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 29 Sep 2025 11:24:26 -0700 Subject: [PATCH 15/69] [docs] Distributed inference (#12285) * init * feedback --------- Co-authored-by: Sayak Paul --- .../en/training/distributed_inference.md | 96 +++++++++---------- 1 file changed, 45 insertions(+), 51 deletions(-) diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index a536703f5bce..58ec77f75bf3 100644 --- a/docs/source/en/training/distributed_inference.md 
+++ b/docs/source/en/training/distributed_inference.md @@ -12,17 +12,23 @@ specific language governing permissions and limitations under the License. # Distributed inference -On distributed setups, you can run inference across multiple GPUs with 🤗 [Accelerate](https://huggingface.co/docs/accelerate/index) or [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html), which is useful for generating with multiple prompts in parallel. +Distributed inference splits the workload across multiple GPUs. It a useful technique for fitting larger models in memory and can process multiple prompts for higher throughput. -This guide will show you how to use 🤗 Accelerate and PyTorch Distributed for distributed inference. +This guide will show you how to use [Accelerate](https://huggingface.co/docs/accelerate/index) and [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) for distributed inference. -## 🤗 Accelerate +## Accelerate -🤗 [Accelerate](https://huggingface.co/docs/accelerate/index) is a library designed to make it easy to train or run inference across distributed setups. It simplifies the process of setting up the distributed environment, allowing you to focus on your PyTorch code. +Accelerate is a library designed to simplify inference and training on multiple accelerators by handling the setup, allowing users to focus on their PyTorch code. -To begin, create a Python file and initialize an [`accelerate.PartialState`] to create a distributed environment; your setup is automatically detected so you don't need to explicitly define the `rank` or `world_size`. Move the [`DiffusionPipeline`] to `distributed_state.device` to assign a GPU to each process. +Install Accelerate with the following command. -Now use the [`~accelerate.PartialState.split_between_processes`] utility as a context manager to automatically distribute the prompts between the number of processes. +```bash +uv pip install accelerate +``` + +Initialize a [`accelerate.PartialState`] class in a Python file to create a distributed environment. The [`accelerate.PartialState`] class manages process management, device control and distribution, and process coordination. + +Move the [`DiffusionPipeline`] to [`accelerate.PartialState.device`] to assign a GPU to each process. ```py import torch @@ -30,33 +36,31 @@ from accelerate import PartialState from diffusers import DiffusionPipeline pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True + "Qwen/Qwen-Image", torch_dtype=torch.float16 ) distributed_state = PartialState() pipeline.to(distributed_state.device) +``` + +Use the [`~accelerate.PartialState.split_between_processes`] utility as a context manager to automatically distribute the prompts between the number of processes. +```py with distributed_state.split_between_processes(["a dog", "a cat"]) as prompt: result = pipeline(prompt).images[0] result.save(f"result_{distributed_state.process_index}.png") ``` -Use the `--num_processes` argument to specify the number of GPUs to use, and call `accelerate launch` to run the script: +Call `accelerate launch` to run the script and use the `--num_processes` argument to set the number of GPUs to use. ```bash accelerate launch run_distributed.py --num_processes=2 ``` - - -Refer to this minimal example [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for running inference across multiple GPUs. 
To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide. - - - ## PyTorch Distributed -PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism. +PyTorch [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) enables [data parallelism](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=data_parallelism), which replicates the same model on each device, to process different batches of data in parallel. -To start, create a Python file and import `torch.distributed` and `torch.multiprocessing` to set up the distributed process group and to spawn the processes for inference on each GPU. You should also initialize a [`DiffusionPipeline`]: +Import `torch.distributed` and `torch.multiprocessing` into a Python file to set up the distributed process group and to spawn the processes for inference on each GPU. ```py import torch @@ -65,20 +69,20 @@ import torch.multiprocessing as mp from diffusers import DiffusionPipeline -sd = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True +pipeline = DiffusionPipeline.from_pretrained( + "Qwen/Qwen-Image", torch_dtype=torch.float16, ) ``` -You'll want to create a function to run inference; [`init_process_group`](https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group) handles creating a distributed environment with the type of backend to use, the `rank` of the current process, and the `world_size` or the number of processes participating. If you're running inference in parallel over 2 GPUs, then the `world_size` is 2. +Create a function for inference with [init_process_group](https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group). This method creates a distributed environment with the backend type, the `rank` of the current process, and the `world_size` or number of processes participating (for example, 2 GPUs would be `world_size=2`). -Move the [`DiffusionPipeline`] to `rank` and use `get_rank` to assign a GPU to each process, where each process handles a different prompt: +Move the pipeline to `rank` and use `get_rank` to assign a GPU to each process. Each process handles a different prompt. ```py def run_inference(rank, world_size): dist.init_process_group("nccl", rank=rank, world_size=world_size) - sd.to(rank) + pipeline.to(rank) if torch.distributed.get_rank() == 0: prompt = "a dog" @@ -89,7 +93,7 @@ def run_inference(rank, world_size): image.save(f"./{'_'.join(prompt)}.png") ``` -To run the distributed inference, call [`mp.spawn`](https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn) to run the `run_inference` function on the number of GPUs defined in `world_size`: +Use [mp.spawn](https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn) to create the number of processes defined in `world_size`. 
```py def main(): world_size = 2 mp.spawn(run_inference, args=(world_size,), nprocs=world_size, join=True) @@ -101,31 +105,26 @@ if __name__ == "__main__": main() ``` -Once you've completed the inference script, use the `--nproc_per_node` argument to specify the number of GPUs to use and call `torchrun` to run the script: +Call `torchrun` to run the inference script and use the `--nproc_per_node` argument to set the number of GPUs to use. ```bash torchrun run_distributed.py --nproc_per_node=2 ``` -> [!TIP] -> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more. - -## Model sharding +## device_map -Modern diffusion systems such as [Flux](../api/pipelines/flux) are very large and have multiple models. For example, [Flux.1-Dev](https://hf.co/black-forest-labs/FLUX.1-dev) is made up of two text encoders - [T5-XXL](https://hf.co/google/t5-v1_1-xxl) and [CLIP-L](https://hf.co/openai/clip-vit-large-patch14) - a [diffusion transformer](../api/models/flux_transformer), and a [VAE](../api/models/autoencoderkl). With a model this size, it can be challenging to run inference on consumer GPUs. +The `device_map` argument enables distributed inference by automatically placing model components on separate GPUs. This is especially useful when a model doesn't fit on a single GPU. You can use `device_map` to selectively load and unload the required model components at a given stage as shown in the example below (assumes two GPUs are available). -Model sharding is a technique that distributes models across GPUs when the models don't fit on a single GPU. The example below assumes two 16GB GPUs are available for inference. +Set `device_map="balanced"` to evenly distribute the text encoders on all available GPUs. You can use the `max_memory` argument to allocate a maximum amount of memory for each text encoder. Don't load any other pipeline components to avoid memory usage. -Start by computing the text embeddings with the text encoders. Keep the text encoders on two GPUs by setting `device_map="balanced"`. The `balanced` strategy evenly distributes the model on all available GPUs. Use the `max_memory` parameter to allocate the maximum amount of memory for each text encoder on each GPU. - -> [!TIP] -> **Only** load the text encoders for this step! The diffusion transformer and VAE are loaded in a later step to preserve memory. ```py from diffusers import FluxPipeline import torch -prompt = "a photo of a dog with cat-like look" +prompt = """ +cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California +highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain +""" pipeline = FluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", @@ -142,7 +141,7 @@ with torch.no_grad(): ) ``` -Once the text embeddings are computed, remove them from the GPU to make space for the diffusion transformer. +After the text embeddings are computed, remove them from the GPU to make space for the diffusion transformer. ```py import gc @@ -162,7 +161,7 @@ del pipeline flush() ``` -Load the diffusion transformer next which has 12.5B parameters. This time, set `device_map="auto"` to automatically distribute the model across two 16GB GPUs. The `auto` strategy is backed by [Accelerate](https://hf.co/docs/accelerate/index) and available as a part of the [Big Model Inference](https://hf.co/docs/accelerate/concept_guides/big_model_inference) feature.
It starts by distributing a model across the fastest device first (GPU) before moving to slower devices like the CPU and hard drive if needed. The trade-off of storing model parameters on slower devices is slower inference latency. +Set `device_map="auto"` to automatically distribute the model on the two GPUs. This strategy places a model on the fastest device first before placing a model on a slower device like a CPU or hard drive if needed. The trade-off of storing model parameters on slower devices is slower inference latency. ```py from diffusers import AutoModel @@ -177,9 +176,9 @@ transformer = AutoModel.from_pretrained( ``` > [!TIP] -> At any point, you can try `print(pipeline.hf_device_map)` to see how the various models are distributed across devices. This is useful for tracking the device placement of the models. You can also try `print(transformer.hf_device_map)` to see how the transformer model is sharded across devices. +> Run `pipeline.hf_device_map` to see how the various models are distributed across devices. This is useful for tracking model device placement. You can also call `hf_device_map` on the transformer model to see how it is distributed. -Add the transformer model to the pipeline for denoising, but set the other model-level components like the text encoders and VAE to `None` because you don't need them yet. +Add the transformer model to the pipeline and set the `output_type="latent"` to generate the latents. ```py pipeline = FluxPipeline.from_pretrained( @@ -206,21 +205,12 @@ latents = pipeline( ).images ``` -Remove the pipeline and transformer from memory as they're no longer needed. - -```py -del pipeline.transformer -del pipeline - -flush() -``` - -Finally, decode the latents with the VAE into an image. The VAE is typically small enough to be loaded on a single GPU. +Remove the pipeline and transformer from memory and load a VAE to decode the latents. The VAE is typically small enough to be loaded on a single device. ```py +import torch from diffusers import AutoencoderKL from diffusers.image_processor import VaeImageProcessor -import torch vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda") vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) @@ -236,4 +226,8 @@ with torch.no_grad(): image[0].save("split_transformer.png") ``` -By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs. +## Resources + +- Take a look at this [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for a minimal example of distributed inference with Accelerate. +- For more details, check out Accelerate's [Distributed inference](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide. +- The `device_map` argument assign models or an entire pipeline to devices. Refer to the [device placement](../using-diffusers/loading#device-placement) docs for more information. 
\ No newline at end of file From c07fcf780a199be58018a80349e8447077146ac5 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 29 Sep 2025 11:36:14 -0700 Subject: [PATCH 16/69] [docs] Model formats (#12256) * init * config * lora metadata * feedback * fix * cache allocator warmup for from_single_file * feedback * feedback --- docs/source/en/_toctree.yml | 2 +- docs/source/en/using-diffusers/loading.md | 3 + .../en/using-diffusers/other-formats.md | 539 +++++------------- 3 files changed, 159 insertions(+), 385 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 96c6fbb17ff1..ada5e3889581 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -25,7 +25,7 @@ - local: using-diffusers/schedulers title: Schedulers - local: using-diffusers/other-formats - title: Model files and layouts + title: Model formats - local: using-diffusers/push_to_hub title: Sharing pipelines and models diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index 25b53d2f4d49..3fb608b1c26c 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -52,6 +52,9 @@ pipeline = QwenImagePipeline.from_pretrained( ) ``` +> [!TIP] +> Refer to the [Single file format](./other-formats#single-file-format) docs to learn how to load single file models. + ### Local pipelines Pipelines can also be run locally. Use [`~huggingface_hub.snapshot_download`] to download a model repository. diff --git a/docs/source/en/using-diffusers/other-formats.md b/docs/source/en/using-diffusers/other-formats.md index 59835bbf2622..b6e333ed7715 100644 --- a/docs/source/en/using-diffusers/other-formats.md +++ b/docs/source/en/using-diffusers/other-formats.md @@ -10,504 +10,275 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Model files and layouts - [[open-in-colab]] -Diffusion models are saved in various file types and organized in different layouts. Diffusers stores model weights as safetensors files in *Diffusers-multifolder* layout and it also supports loading files (like safetensors and ckpt files) from a *single-file* layout which is commonly used in the diffusion ecosystem. - -Each layout has its own benefits and use cases, and this guide will show you how to load the different files and layouts, and how to convert them. - -## Files - -PyTorch model weights are typically saved with Python's [pickle](https://docs.python.org/3/library/pickle.html) utility as ckpt or bin files. However, pickle is not secure and pickled files may contain malicious code that can be executed. This vulnerability is a serious concern given the popularity of model sharing. To address this security issue, the [Safetensors](https://hf.co/docs/safetensors) library was developed as a secure alternative to pickle, which saves models as safetensors files. +# Model formats -### safetensors +Diffusion models are typically stored in the Diffusers format or single-file format. Model files can be stored in various file types such as safetensors, dduf, or ckpt. > [!TIP] -> Learn more about the design decisions and why safetensor files are preferred for saving and loading model weights in the [Safetensors audited as really safe and becoming the default](https://blog.eleuther.ai/safetensors-security-audit/) blog post. 
- -[Safetensors](https://hf.co/docs/safetensors) is a safe and fast file format for securely storing and loading tensors. Safetensors restricts the header size to limit certain types of attacks, supports lazy loading (useful for distributed setups), and has generally faster loading speeds. +> Format refers to whether the weights are stored in a directory structure and file refers to the file type. -Make sure you have the [Safetensors](https://hf.co/docs/safetensors) library installed. +This guide will show you how to load pipelines and models from these formats and files. -```py -!pip install safetensors -``` +## Diffusers format -Safetensors stores weights in a safetensors file. Diffusers loads safetensors files by default if they're available and the Safetensors library is installed. There are two ways safetensors files can be organized: +The Diffusers format stores each model (UNet, transformer, text encoder) in a separate subfolder. There are several benefits to storing models separately. -1. Diffusers-multifolder layout: there may be several separate safetensors files, one for each pipeline component (text encoder, UNet, VAE), organized in subfolders (check out the [stable-diffusion-v1-5/stable-diffusion-v1-5](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main) repository as an example) -2. single-file layout: all the model weights may be saved in a single file (check out the [WarriorMama777/OrangeMixs](https://hf.co/WarriorMama777/OrangeMixs/tree/main/Models/AbyssOrangeMix) repository as an example) +- Faster overall pipeline initialization because you can load the individual model you need or load them all in parallel. +- Reduced memory usage because you don't need to load all the pipeline components if you only need one model. [Reuse](./loading#reusing-models-in-multiple-pipelines) a model that is shared between multiple pipelines. +- Lower storage requirements because common models shared between multiple pipelines are only downloaded once. +- Flexibility to use new or improved models in a pipeline. - - +## Single file format -Use the [`~DiffusionPipeline.from_pretrained`] method to load a model with safetensors files stored in multiple folders. - -```py -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", - use_safetensors=True -) -``` +A single-file format stores *all* the model (UNet, transformer, text encoder) weights in a single file. Benefits of single-file formats include the following. - - +- Greater compatibility with [ComfyUI](https://github.com/comfyanonymous/ComfyUI) or [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui). +- Easier to download and share a single file. -Use the [`~loaders.FromSingleFileMixin.from_single_file`] method to load a model with all the weights stored in a single safetensors file. +Use [`~loaders.FromSingleFileMixin.from_single_file`] to load a single file. 
```py -from diffusers import StableDiffusionPipeline +import torch +from diffusers import StableDiffusionXLPipeline -pipeline = StableDiffusionPipeline.from_single_file( - "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" +pipeline = StableDiffusionXLPipeline.from_single_file( + "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", + torch_dtype=torch.float16, + device_map="cuda" ) ``` - - - -#### LoRAs - -[LoRAs](../tutorials/using_peft_for_inference) are lightweight checkpoints fine-tuned to generate images or video in a specific style. If you are using a checkpoint trained with a Diffusers training script, the LoRA configuration is automatically saved as metadata in a safetensors file. When the safetensors file is loaded, the metadata is parsed to correctly configure the LoRA and avoids missing or incorrect LoRA configurations. - -The easiest way to inspect the metadata, if available, is by clicking on the Safetensors logo next to the weights. - -
- -For LoRAs that aren't trained with Diffusers, you can still save metadata with the `transformer_lora_adapter_metadata` and `text_encoder_lora_adapter_metadata` arguments in [`~loaders.FluxLoraLoaderMixin.save_lora_weights`] as long as it is a safetensors file. +The [`~loaders.FromSingleFileMixin.from_single_file`] method also supports passing new models or schedulers. ```py import torch -from diffusers import FluxPipeline +from diffusers import FluxPipeline, FluxTransformer2DModel +transformer = FluxTransformer2DModel.from_single_file( + "https://huggingface.co/Kijai/flux-fp8/blob/main/flux1-dev-fp8.safetensors", torch_dtype=torch.bfloat16 +) pipeline = FluxPipeline.from_pretrained( - "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16 -).to("cuda") -pipeline.load_lora_weights("linoyts/yarn_art_Flux_LoRA") -pipeline.save_lora_weights( - transformer_lora_adapter_metadata={"r": 16, "lora_alpha": 16}, - text_encoder_lora_adapter_metadata={"r": 8, "lora_alpha": 8} + "black-forest-labs/FLUX.1-dev", + transformer=transformer, + torch_dtype=torch.bfloat16, + device_map="cuda" ) ``` -### ckpt - -> [!WARNING] -> Pickled files may be unsafe because they can be exploited to execute malicious code. It is recommended to use safetensors files instead where possible, or convert the weights to safetensors files. +### Configuration options -PyTorch's [torch.save](https://pytorch.org/docs/stable/generated/torch.save.html) function uses Python's [pickle](https://docs.python.org/3/library/pickle.html) utility to serialize and save models. These files are saved as a ckpt file and they contain the entire model's weights. +Diffusers format models have a `config.json` file in their repositories with important attributes such as the number of layers and attention heads. The [`~loaders.FromSingleFileMixin.from_single_file`] method automatically determines the appropriate config to use from `config.json`. This may fail in a few rare instances though, in which case, you should use the `config` argument. -Use the [`~loaders.FromSingleFileMixin.from_single_file`] method to directly load a ckpt file. +You should also use the `config` argument if the models in a pipeline are different from the original implementation or if it doesn't have the necessary metadata to determine the correct config. ```py -from diffusers import StableDiffusionPipeline - -pipeline = StableDiffusionPipeline.from_single_file( - "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned.ckpt" -) -``` - -## Storage layout - -There are two ways model files are organized, either in a Diffusers-multifolder layout or in a single-file layout. The Diffusers-multifolder layout is the default, and each component file (text encoder, UNet, VAE) is stored in a separate subfolder. Diffusers also supports loading models from a single-file layout where all the components are bundled together. +from diffusers import StableDiffusionXLPipeline -### Diffusers-multifolder +ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors" -The Diffusers-multifolder layout is the default storage layout for Diffusers. Each component's (text encoder, UNet, VAE) weights are stored in a separate subfolder. The weights can be stored as safetensors or ckpt files. +pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, config="segmind/SSD-1B") +``` -
-    <figcaption>multifolder layout</figcaption>
-    <figcaption>UNet subfolder</figcaption>
+Diffusers attempts to infer the pipeline components based on the signature types of the pipeline class when using `original_config` with `local_files_only=True`. It won't download the config files from a Hub repository to avoid backward breaking changes when you can't connect to the internet. This method isn't as reliable as providing a path to a local model with the `config` argument and may lead to errors. You should run the pipeline with `local_files_only=False` to download the config files to the local cache to avoid errors. -To load from Diffusers-multifolder layout, use the [`~DiffusionPipeline.from_pretrained`] method. +Override default configs by passing the arguments directly to [`~loaders.FromSingleFileMixin.from_single_file`]. The examples below demonstrate how to override the configs in a pipeline or model. ```py -from diffusers import DiffusionPipeline +from diffusers import StableDiffusionXLInstructPix2PixPipeline -pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - torch_dtype=torch.float16, - variant="fp16", - use_safetensors=True, -).to("cuda") +ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors" +pipeline = StableDiffusionXLInstructPix2PixPipeline.from_single_file( + ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True +) ``` -Benefits of using the Diffusers-multifolder layout include: - -1. Faster to load each component file individually or in parallel. -2. Reduced memory usage because you only load the components you need. For example, models like [SDXL Turbo](https://hf.co/stabilityai/sdxl-turbo), [SDXL Lightning](https://hf.co/ByteDance/SDXL-Lightning), and [Hyper-SD](https://hf.co/ByteDance/Hyper-SD) have the same components except for the UNet. You can reuse their shared components with the [`~DiffusionPipeline.from_pipe`] method without consuming any additional memory (take a look at the [Reuse a pipeline](./loading#reuse-a-pipeline) guide) and only load the UNet. This way, you don't need to download redundant components and unnecessarily use more memory. - - ```py - import torch - from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler - - # download one model - sdxl_pipeline = StableDiffusionXLPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - torch_dtype=torch.float16, - variant="fp16", - use_safetensors=True, - ).to("cuda") - - # switch UNet for another model - unet = UNet2DConditionModel.from_pretrained( - "stabilityai/sdxl-turbo", - subfolder="unet", - torch_dtype=torch.float16, - variant="fp16", - use_safetensors=True - ) - # reuse all the same components in new model except for the UNet - turbo_pipeline = StableDiffusionXLPipeline.from_pipe( - sdxl_pipeline, unet=unet, - ).to("cuda") - turbo_pipeline.scheduler = EulerDiscreteScheduler.from_config( - turbo_pipeline.scheduler.config, - timestep_spacing="trailing" - ) - image = turbo_pipeline( - "an astronaut riding a unicorn on mars", - num_inference_steps=1, - guidance_scale=0.0, - ).images[0] - image - ``` - -3. Reduced storage requirements because if a component, such as the SDXL [VAE](https://hf.co/madebyollin/sdxl-vae-fp16-fix), is shared across multiple models, you only need to download and store a single copy of it instead of downloading and storing it multiple times. For 10 SDXL models, this can save ~3.5GB of storage. 
The storage savings is even greater for newer models like PixArt Sigma, where the [text encoder](https://hf.co/PixArt-alpha/PixArt-Sigma-XL-2-1024-MS/tree/main/text_encoder) alone is ~19GB! -4. Flexibility to replace a component in the model with a newer or better version. - - ```py - from diffusers import DiffusionPipeline, AutoencoderKL - - vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True) - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - vae=vae, - torch_dtype=torch.float16, - variant="fp16", - use_safetensors=True, - ).to("cuda") - ``` - -5. More visibility and information about a model's components, which are stored in a [config.json](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/unet/config.json) file in each component subfolder. - -### Single-file - -The single-file layout stores all the model weights in a single file. All the model components (text encoder, UNet, VAE) weights are kept together instead of separately in subfolders. This can be a safetensors or ckpt file. - -
- -To load from a single-file layout, use the [`~loaders.FromSingleFileMixin.from_single_file`] method. - ```py -import torch -from diffusers import StableDiffusionXLPipeline +from diffusers import UNet2DConditionModel -pipeline = StableDiffusionXLPipeline.from_single_file( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", - torch_dtype=torch.float16, - variant="fp16", - use_safetensors=True, -).to("cuda") +ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" +model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True) ``` -Benefits of using a single-file layout include: - -1. Easy compatibility with diffusion interfaces such as [ComfyUI](https://github.com/comfyanonymous/ComfyUI) or [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) which commonly use a single-file layout. -2. Easier to manage (download and share) a single file. - -### DDUF - -> [!WARNING] -> DDUF is an experimental file format and APIs related to it can change in the future. - -DDUF (**D**DUF **D**iffusion **U**nified **F**ormat) is a file format designed to make storing, distributing, and using diffusion models much easier. Built on the ZIP file format, DDUF offers a standardized, efficient, and flexible way to package all parts of a diffusion model into a single, easy-to-manage file. It provides a balance between Diffusers multi-folder format and the widely popular single-file format. +### Local files -Learn more details about DDUF on the Hugging Face Hub [documentation](https://huggingface.co/docs/hub/dduf). +The [`~loaders.FromSingleFileMixin.from_single_file`] method attempts to configure a pipeline or model by inferring the model type from the keys in the checkpoint file. For example, any single file checkpoint based on the Stable Diffusion XL base model is configured from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0). -Pass a checkpoint to the `dduf_file` parameter to load it in [`DiffusionPipeline`]. +If you're working with local files, download the config files with the [`~huggingface_hub.snapshot_download`] method and the model checkpoint with [`~huggingface_hub.hf_hub_download`]. These files are downloaded to your [cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache), but you can download them to a specific directory with the `local_dir` argument. ```py -from diffusers import DiffusionPipeline -import torch - -pipe = DiffusionPipeline.from_pretrained( - "DDUF/FLUX.1-dev-DDUF", dduf_file="FLUX.1-dev.dduf", torch_dtype=torch.bfloat16 -).to("cuda") -image = pipe( - "photo a cat holding a sign that says Diffusers", num_inference_steps=50, guidance_scale=3.5 -).images[0] -image.save("cat.png") -``` - -To save a pipeline as a `.dduf` checkpoint, use the [`~huggingface_hub.export_folder_as_dduf`] utility, which takes care of all the necessary file-level validations. 
+from huggingface_hub import hf_hub_download, snapshot_download +from diffusers import StableDiffusionXLPipeline -```py -from huggingface_hub import export_folder_as_dduf -from diffusers import DiffusionPipeline -import torch +my_local_checkpoint_path = hf_hub_download( + repo_id="segmind/SSD-1B", + filename="SSD-1B.safetensors" +) -pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16) +my_local_config_path = snapshot_download( + repo_id="segmind/SSD-1B", + allow_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] +) -save_folder = "flux-dev" -pipe.save_pretrained("flux-dev") -export_folder_as_dduf("flux-dev.dduf", folder_path=save_folder) +pipeline = StableDiffusionXLPipeline.from_single_file( + my_local_checkpoint_path, config=my_local_config_path, local_files_only=True +) ``` -> [!TIP] -> Packaging and loading quantized checkpoints in the DDUF format is supported as long as they respect the multi-folder structure. - -## Convert layout and files - -Diffusers provides many scripts and methods to convert storage layouts and file formats to enable broader support across the diffusion ecosystem. +### Symlink -Take a look at the [diffusers/scripts](https://github.com/huggingface/diffusers/tree/main/scripts) collection to find a script that fits your conversion needs. +If you're working with a file system that does not support symlinking, download the checkpoint file to a local directory first with the `local_dir` parameter. Using the `local_dir` parameter automatically disables symlinks. -> [!TIP] -> Scripts that have "`to_diffusers`" appended at the end mean they convert a model to the Diffusers-multifolder layout. Each script has their own specific set of arguments for configuring the conversion, so make sure you check what arguments are available! +```py +from huggingface_hub import hf_hub_download, snapshot_download +from diffusers import StableDiffusionXLPipeline -For example, to convert a Stable Diffusion XL model stored in Diffusers-multifolder layout to a single-file layout, run the [convert_diffusers_to_original_sdxl.py](https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_sdxl.py) script. Provide the path to the model to convert, and the path to save the converted model to. You can optionally specify whether you want to save the model as a safetensors file and whether to save the model in half-precision. +my_local_checkpoint_path = hf_hub_download( + repo_id="segmind/SSD-1B", + filename="SSD-1B.safetensors" + local_dir="my_local_checkpoints", +) +print("My local checkpoint: ", my_local_checkpoint_path) -```bash -python convert_diffusers_to_original_sdxl.py --model_path path/to/model/to/convert --checkpoint_path path/to/save/model/to --use_safetensors +my_local_config_path = snapshot_download( + repo_id="segmind/SSD-1B", + allow_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] +) +print("My local config: ", my_local_config_path) ``` -You can also save a model to Diffusers-multifolder layout with the [`~DiffusionPipeline.save_pretrained`] method. This creates a directory for you if it doesn't already exist, and it also saves the files as a safetensors file by default. +Pass these paths to [`~loaders.FromSingleFileMixin.from_single_file`]. 
```py -from diffusers import StableDiffusionXLPipeline - pipeline = StableDiffusionXLPipeline.from_single_file( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", + my_local_checkpoint_path, config=my_local_config_path, local_files_only=True ) -pipeline.save_pretrained() ``` -Lastly, there are also Spaces, such as [SD To Diffusers](https://hf.co/spaces/diffusers/sd-to-diffusers) and [SD-XL To Diffusers](https://hf.co/spaces/diffusers/sdxl-to-diffusers), that provide a more user-friendly interface for converting models to Diffusers-multifolder layout. This is the easiest and most convenient option for converting layouts, and it'll open a PR on your model repository with the converted files. However, this option is not as reliable as running a script, and the Space may fail for more complicated models. +## File types -## Single-file layout usage +Models can be stored in several file types. Safetensors is the most common file type but you may encounter other file types on the Hub or diffusion community. -Now that you're familiar with the differences between the Diffusers-multifolder and single-file layout, this section shows you how to load models and pipeline components, customize configuration options for loading, and load local files with the [`~loaders.FromSingleFileMixin.from_single_file`] method. +### safetensors -### Load a pipeline or model +[Safetensors](https://hf.co/docs/safetensors) is a safe and fast file type for securely storing and loading tensors. It restricts the header size to limit certain types of attacks, supports lazy loading (useful for distributed setups), and generally loads faster. -Pass the file path of the pipeline or model to the [`~loaders.FromSingleFileMixin.from_single_file`] method to load it. +Diffusers loads safetensors file by default (a required dependency) if they are available and the Safetensors library is installed. - - +Use [`~DiffusionPipeline.from_pretrained`] or [`~loaders.FromSingleFileMixin.from_single_file`] to load safetensor files. ```py -from diffusers import StableDiffusionXLPipeline - -ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" -pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path) -``` - - - +import torch +from diffusers import DiffusionPipeline -```py -from diffusers import StableCascadeUNet +pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch.dtype=torch.float16, + device_map="cuda" +) -ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors" -model = StableCascadeUNet.from_single_file(ckpt_path) +pipeline = DiffusionPipeline.from_single_file( + "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", + torch_dtype=torch.float16, +) ``` - - +If you're using a checkpoint trained with a Diffusers training script, metadata such as the LoRA configuration, is automatically saved. When the file is loaded, the metadata is parsed to correctly configure the LoRA and avoid missing or incorrect LoRA configs. Inspect the metadata of a safetensors file by clicking on the safetensors logo logo next to the file on the Hub. -Customize components in the pipeline by passing them directly to the [`~loaders.FromSingleFileMixin.from_single_file`] method. For example, you can use a different scheduler in a pipeline. 
+Save the metadata for LoRAs that aren't trained with Diffusers with either `transformer_lora_adapter_metadata` or `unet_lora_adapter_metadata` depending on your model. For the text encoder, use the `text_encoder_lora_adapter_metadata` and `text_encoder_2_lora_adapter_metadata` arguments in [`~loaders.FluxLoraLoaderMixin.save_lora_weights`]. This is only supported for safetensors files. ```py -from diffusers import StableDiffusionXLPipeline, DDIMScheduler - -ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" -scheduler = DDIMScheduler() -pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, scheduler=scheduler) -``` - -Or you could use a ControlNet model in the pipeline. - -```py -from diffusers import StableDiffusionControlNetPipeline, ControlNetModel +import torch +from diffusers import FluxPipeline -ckpt_path = "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" -controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") -pipeline = StableDiffusionControlNetPipeline.from_single_file(ckpt_path, controlnet=controlnet) +pipeline = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16 +).to("cuda") +pipeline.load_lora_weights("linoyts/yarn_art_Flux_LoRA") +pipeline.save_lora_weights( + text_encoder_lora_adapter_metadata={"r": 8, "lora_alpha": 8}, + text_encoder_2_lora_adapter_metadata={"r": 8, "lora_alpha": 8} +) ``` -### Customize configuration options - -Models have a configuration file that define their attributes like the number of inputs in a UNet. Pipelines configuration options are available in the pipeline's class. For example, if you look at the [`StableDiffusionXLInstructPix2PixPipeline`] class, there is an option to scale the image latents with the `is_cosxl_edit` parameter. - -These configuration files can be found in the models Hub repository or another location from which the configuration file originated (for example, a GitHub repository or locally on your device). +### ckpt - - +Older model weights are commonly saved with Python's [pickle](https://docs.python.org/3/library/pickle.html) utility in a ckpt file. -> [!TIP] -> The [`~loaders.FromSingleFileMixin.from_single_file`] method automatically maps the checkpoint to the appropriate model repository, but there are cases where it is useful to use the `config` parameter. For example, if the model components in the checkpoint are different from the original checkpoint or if a checkpoint doesn't have the necessary metadata to correctly determine the configuration to use for the pipeline. +Pickled files may be unsafe because they can be exploited to execute malicious code. It is recommended to use safetensors files or convert the weights to safetensors files. -The [`~loaders.FromSingleFileMixin.from_single_file`] method automatically determines the configuration to use from the configuration file in the model repository. You could also explicitly specify the configuration to use by providing the repository id to the `config` parameter. +Use [`~loaders.FromSingleFileMixin.from_single_file`] to load a ckpt file. 
```py -from diffusers import StableDiffusionXLPipeline - -ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors" -repo_id = "segmind/SSD-1B" +from diffusers import DiffusionPipeline -pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=repo_id) +pipeline = DiffusionPipeline.from_single_file( + "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned.ckpt" +) ``` -The model loads the configuration file for the [UNet](https://huggingface.co/segmind/SSD-1B/blob/main/unet/config.json), [VAE](https://huggingface.co/segmind/SSD-1B/blob/main/vae/config.json), and [text encoder](https://huggingface.co/segmind/SSD-1B/blob/main/text_encoder/config.json) from their respective subfolders in the repository. - - - - -The [`~loaders.FromSingleFileMixin.from_single_file`] method can also load the original configuration file of a pipeline that is stored elsewhere. Pass a local path or URL of the original configuration file to the `original_config` parameter. - -```py -from diffusers import StableDiffusionXLPipeline - -ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" -original_config = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" - -pipeline = StableDiffusionXLPipeline.from_single_file(ckpt_path, original_config=original_config) -``` +### dduf > [!TIP] -> Diffusers attempts to infer the pipeline components based on the type signatures of the pipeline class when you use `original_config` with `local_files_only=True`, instead of fetching the configuration files from the model repository on the Hub. This prevents backward breaking changes in code that can't connect to the internet to fetch the necessary configuration files. -> -> This is not as reliable as providing a path to a local model repository with the `config` parameter, and might lead to errors during pipeline configuration. To avoid errors, run the pipeline with `local_files_only=False` once to download the appropriate pipeline configuration files to the local cache. - - - +> DDUF is an experimental file type and the API may change. Refer to the DDUF [docs](https://huggingface.co/docs/hub/dduf) to learn more. -While the configuration files specify the pipeline or models default parameters, you can override them by providing the parameters directly to the [`~loaders.FromSingleFileMixin.from_single_file`] method. Any parameter supported by the model or pipeline class can be configured in this way. +DDUF is a file type designed to unify different diffusion model distribution methods and weight-saving formats. It is a standardized and flexible method to package all components of a diffusion model into a single file, providing a balance between the Diffusers and single-file formats. - - +Use the `dduf_file` argument in [`~DiffusionPipeline.from_pretrained`] to load a DDUF file. You can also load quantized dduf files as long as they are stored in the Diffusers format. -For example, to scale the image latents in [`StableDiffusionXLInstructPix2PixPipeline`] pass the `is_cosxl_edit` parameter. 
- -```python -from diffusers import StableDiffusionXLInstructPix2PixPipeline +```py +import torch +from diffusers import DiffusionPipeline -ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors" -pipeline = StableDiffusionXLInstructPix2PixPipeline.from_single_file(ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True) +pipeline = DiffusionPipeline.from_pretrained( + "DDUF/FLUX.1-dev-DDUF", + dduf_file="FLUX.1-dev.dduf", + torch_dtype=torch.bfloat16, + device_map="cuda" +) ``` - - +To save a pipeline as a dduf file, use the [`~huggingface_hub.export_folder_as_dduf`] utility. -For example, to upcast the attention dimensions in a [`UNet2DConditionModel`] pass the `upcast_attention` parameter. +```py +import torch +from diffusers import DiffusionPipeline +from huggingface_hub import export_folder_as_dduf -```python -from diffusers import UNet2DConditionModel +pipeline = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16) -ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" -model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True) +save_folder = "flux-dev" +pipeline.save_pretrained("flux-dev") +export_folder_as_dduf("flux-dev.dduf", folder_path=save_folder) ``` - - - -### Local files - -In Diffusers>=v0.28.0, the [`~loaders.FromSingleFileMixin.from_single_file`] method attempts to configure a pipeline or model by inferring the model type from the keys in the checkpoint file. The inferred model type is used to determine the appropriate model repository on the Hugging Face Hub to configure the model or pipeline. +## Converting formats and files -For example, any single file checkpoint based on the Stable Diffusion XL base model will use the [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model repository to configure the pipeline. +Diffusers provides scripts and methods to convert format and files to enable broader support across the diffusion ecosystem. -But if you're working in an environment with restricted internet access, you should download the configuration files with the [`~huggingface_hub.snapshot_download`] function, and the model checkpoint with the [`~huggingface_hub.hf_hub_download`] function. By default, these files are downloaded to the Hugging Face Hub [cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache), but you can specify a preferred directory to download the files to with the `local_dir` parameter. +Take a look at the [diffusers/scripts](https://github.com/huggingface/diffusers/tree/main/scripts) folder to find a conversion script. Scripts with `"to_diffusers` appended at the end converts a model to the Diffusers format. Each script has a specific set of arguments for configuring the conversion. Make sure you check what arguments are available. -Pass the configuration and checkpoint paths to the [`~loaders.FromSingleFileMixin.from_single_file`] method to load locally. +The example below converts a model stored in Diffusers format to a single-file format. Provide the path to the model to convert and where to save the converted model. You can optionally specify what file type and data type to save the model as. 
- - - -```python -from huggingface_hub import hf_hub_download, snapshot_download - -my_local_checkpoint_path = hf_hub_download( - repo_id="segmind/SSD-1B", - filename="SSD-1B.safetensors" -) - -my_local_config_path = snapshot_download( - repo_id="segmind/SSD-1B", - allow_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] -) - -pipeline = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True) +```bash +python convert_diffusers_to_original_sdxl.py --model_path path/to/model/to/convert --checkpoint_path path/to/save/model/to --use_safetensors ``` - - - -```python -from huggingface_hub import hf_hub_download, snapshot_download +The [`~DiffusionPipeline.save_pretrained`] method also saves a model in Diffusers format and takes care of creating subfolders for each model. It saves the files as safetensor files by default. -my_local_checkpoint_path = hf_hub_download( - repo_id="segmind/SSD-1B", - filename="SSD-1B.safetensors" - local_dir="my_local_checkpoints" -) +```py +from diffusers import DiffusionPipeline -my_local_config_path = snapshot_download( - repo_id="segmind/SSD-1B", - allow_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] - local_dir="my_local_config" +pipeline = DiffusionPipeline.from_single_file( + "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", ) - -pipeline = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True) +pipeline.save_pretrained() ``` - - - -#### Local files without symlink - -> [!TIP] -> In huggingface_hub>=v0.23.0, the `local_dir_use_symlinks` argument isn't necessary for the [`~huggingface_hub.hf_hub_download`] and [`~huggingface_hub.snapshot_download`] functions. - -The [`~loaders.FromSingleFileMixin.from_single_file`] method relies on the [huggingface_hub](https://hf.co/docs/huggingface_hub/index) caching mechanism to fetch and store checkpoints and configuration files for models and pipelines. If you're working with a file system that does not support symlinking, you should download the checkpoint file to a local directory first, and disable symlinking with the `local_dir_use_symlink=False` parameter in the [`~huggingface_hub.hf_hub_download`] function and [`~huggingface_hub.snapshot_download`] functions. - -```python -from huggingface_hub import hf_hub_download, snapshot_download - -my_local_checkpoint_path = hf_hub_download( - repo_id="segmind/SSD-1B", - filename="SSD-1B.safetensors" - local_dir="my_local_checkpoints", - local_dir_use_symlinks=False -) -print("My local checkpoint: ", my_local_checkpoint_path) +Finally, you can use a Space like [SD To Diffusers](https://hf.co/spaces/diffusers/sd-to-diffusers) or [SD-XL To Diffusers](https://hf.co/spaces/diffusers/sdxl-to-diffusers) to convert models to the Diffusers format. It'll open a PR on your model repository with the converted files. This is the easiest way to convert a model, but it may fail for more complicated models. Using a conversion script is more reliable. -my_local_config_path = snapshot_download( - repo_id="segmind/SSD-1B", - allow_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] - local_dir_use_symlinks=False, -) -print("My local config: ", my_local_config_path) -``` +## Resources -Then you can pass the local paths to the `pretrained_model_link_or_path` and `config` parameters. 
+- Learn more about the design decisions and why safetensor files are preferred for saving and loading model weights in the [Safetensors audited as really safe and becoming the default](https://blog.eleuther.ai/safetensors-security-audit/) blog post. -```python -pipeline = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True) -``` From 76d4e416bc8952701e0b37c929bbb3253ff05f5f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 29 Sep 2025 11:42:34 -1000 Subject: [PATCH 17/69] [modular]some small fix (#12307) * fix * add mellon node registry * style * update docstring to include more info! * support custom node mellon * HTTPErrpr -> HfHubHTTPErrpr * up * Update src/diffusers/modular_pipelines/qwenimage/node_utils.py --- .../flux/modular_pipeline.py | 2 + .../modular_pipelines/mellon_node_utils.py | 763 ++++++++++++++++++ .../modular_pipelines/modular_pipeline.py | 16 +- src/diffusers/modular_pipelines/node_utils.py | 665 --------------- .../qwenimage/before_denoise.py | 15 +- .../modular_pipelines/qwenimage/encoders.py | 2 +- .../modular_pipelines/qwenimage/inputs.py | 12 + .../qwenimage/modular_blocks.py | 64 +- .../qwenimage/modular_pipeline.py | 4 + .../modular_pipelines/qwenimage/node_utils.py | 95 +++ .../stable_diffusion_xl/before_denoise.py | 20 +- .../stable_diffusion_xl/denoise.py | 8 +- .../stable_diffusion_xl/encoders.py | 8 +- .../stable_diffusion_xl/modular_blocks.py | 59 +- .../stable_diffusion_xl/modular_pipeline.py | 2 + .../stable_diffusion_xl/node_utils.py | 99 +++ .../modular_pipelines/wan/before_denoise.py | 4 +- .../modular_pipelines/wan/denoise.py | 4 +- .../modular_pipelines/wan/encoders.py | 4 +- .../modular_pipelines/wan/modular_pipeline.py | 2 + 20 files changed, 1107 insertions(+), 741 deletions(-) create mode 100644 src/diffusers/modular_pipelines/mellon_node_utils.py delete mode 100644 src/diffusers/modular_pipelines/node_utils.py create mode 100644 src/diffusers/modular_pipelines/qwenimage/node_utils.py create mode 100644 src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py diff --git a/src/diffusers/modular_pipelines/flux/modular_pipeline.py b/src/diffusers/modular_pipelines/flux/modular_pipeline.py index e97445d411e4..7d869041f2a9 100644 --- a/src/diffusers/modular_pipelines/flux/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/flux/modular_pipeline.py @@ -32,6 +32,8 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion """ + default_blocks_name = "FluxAutoBlocks" + @property def default_height(self): return self.default_sample_size * self.vae_scale_factor diff --git a/src/diffusers/modular_pipelines/mellon_node_utils.py b/src/diffusers/modular_pipelines/mellon_node_utils.py new file mode 100644 index 000000000000..a405aebee221 --- /dev/null +++ b/src/diffusers/modular_pipelines/mellon_node_utils.py @@ -0,0 +1,763 @@ +import json +import logging +import os + +# Simple typed wrapper for parameter overrides +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +from huggingface_hub import create_repo, hf_hub_download +from huggingface_hub.utils import ( + EntryNotFoundError, + HfHubHTTPError, + RepositoryNotFoundError, + RevisionNotFoundError, + validate_hf_hub_args, +) + +from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, PushToHubMixin, extract_commit_hash +from .modular_pipeline import ModularPipelineBlocks + + +logger = logging.getLogger(__name__) + + +SUPPORTED_NODE_TYPES = {"controlnet", 
"vae_encoder", "denoise", "text_encoder", "decoder"} + + +# Mellon Input Parameters (runtime parameters, not models) +MELLON_INPUT_PARAMS = { + # controlnet + "control_image": { + "label": "Control Image", + "type": "image", + "display": "input", + }, + "controlnet_conditioning_scale": { + "label": "Scale", + "type": "float", + "default": 0.5, + "min": 0, + "max": 1, + }, + "control_guidance_end": { + "label": "End", + "type": "float", + "default": 1.0, + "min": 0, + "max": 1, + }, + "control_guidance_start": { + "label": "Start", + "type": "float", + "default": 0.0, + "min": 0, + "max": 1, + }, + "controlnet": { + "label": "Controlnet", + "type": "custom_controlnet", + "display": "input", + }, + "embeddings": { + "label": "Text Embeddings", + "display": "input", + "type": "embeddings", + }, + "image": { + "label": "Image", + "type": "image", + "display": "input", + }, + "negative_prompt": { + "label": "Negative Prompt", + "type": "string", + "default": "", + "display": "textarea", + }, + "prompt": { + "label": "Prompt", + "type": "string", + "default": "", + "display": "textarea", + }, + "guidance_scale": { + "label": "Guidance Scale", + "type": "float", + "display": "slider", + "default": 5, + "min": 1.0, + "max": 30.0, + "step": 0.1, + }, + "height": { + "label": "Height", + "type": "int", + "default": 1024, + "min": 64, + "step": 8, + }, + "image_latents": { + "label": "Image Latents", + "type": "latents", + "display": "input", + "onChange": {False: ["height", "width"], True: ["strength"]}, + }, + "latents": { + "label": "Latents", + "type": "latents", + "display": "input", + }, + "num_inference_steps": { + "label": "Steps", + "type": "int", + "display": "slider", + "default": 25, + "min": 1, + "max": 100, + }, + "seed": { + "label": "Seed", + "type": "int", + "display": "random", + "default": 0, + "min": 0, + "max": 4294967295, + }, + "strength": { + "label": "Strength", + "type": "float", + "default": 0.5, + "min": 0.0, + "max": 1.0, + "step": 0.01, + }, + "width": { + "label": "Width", + "type": "int", + "default": 1024, + "min": 64, + "step": 8, + }, + "ip_adapter": { + "label": "IP Adapter", + "type": "custom_ip_adapter", + "display": "input", + }, +} + +# Mellon Model Parameters (diffusers_auto_model types) +MELLON_MODEL_PARAMS = { + "scheduler": { + "label": "Scheduler", + "display": "input", + "type": "diffusers_auto_model", + }, + "text_encoders": { + "label": "Text Encoders", + "type": "diffusers_auto_models", + "display": "input", + }, + "unet": { + "label": "Unet", + "display": "input", + "type": "diffusers_auto_model", + "onSignal": { + "action": "signal", + "target": "guider", + }, + }, + "guider": { + "label": "Guider", + "display": "input", + "type": "custom_guider", + "onChange": {False: ["guidance_scale"], True: []}, + }, + "vae": { + "label": "VAE", + "display": "input", + "type": "diffusers_auto_model", + }, + "controlnet": { + "label": "Controlnet Model", + "type": "diffusers_auto_model", + "display": "input", + }, +} + +# Mellon Output Parameters (display = "output") +MELLON_OUTPUT_PARAMS = { + "embeddings": { + "label": "Text Embeddings", + "display": "output", + "type": "embeddings", + }, + "images": { + "label": "Images", + "type": "image", + "display": "output", + }, + "image_latents": { + "label": "Image Latents", + "type": "latents", + "display": "output", + }, + "latents": { + "label": "Latents", + "type": "latents", + "display": "output", + }, + "latents_preview": { + "label": "Latents Preview", + "display": "output", + "type": "latent", + }, + 
"controlnet_out": { + "label": "Controlnet", + "display": "output", + "type": "controlnet", + }, +} + + +# Default param selections per supported node_type +# from MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS. +NODE_TYPE_PARAMS_MAP = { + "controlnet": { + "inputs": [ + "control_image", + "controlnet_conditioning_scale", + "control_guidance_start", + "control_guidance_end", + "height", + "width", + ], + "model_inputs": [ + "controlnet", + "vae", + ], + "outputs": [ + "controlnet", + ], + "block_names": ["controlnet_vae_encoder"], + }, + "denoise": { + "inputs": [ + "embeddings", + "width", + "height", + "seed", + "num_inference_steps", + "guidance_scale", + "image_latents", + "strength", + # custom adapters coming in as inputs + "controlnet", + # ip_adapter is optional and custom; include if available + "ip_adapter", + ], + "model_inputs": [ + "unet", + "guider", + "scheduler", + ], + "outputs": [ + "latents", + "latents_preview", + ], + "block_names": ["denoise"], + }, + "vae_encoder": { + "inputs": [ + "image", + "width", + "height", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "image_latents", + ], + "block_names": ["vae_encoder"], + }, + "text_encoder": { + "inputs": [ + "prompt", + "negative_prompt", + # optional image prompt input supported in embeddings node + "image", + ], + "model_inputs": [ + "text_encoders", + ], + "outputs": [ + "embeddings", + ], + "block_names": ["text_encoder"], + }, + "decoder": { + "inputs": [ + "latents", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "images", + ], + "block_names": ["decode"], + }, +} + + +@dataclass(frozen=True) +class MellonParam: + name: str + label: str + type: str + display: Optional[str] = None + default: Any = None + min: Optional[float] = None + max: Optional[float] = None + step: Optional[float] = None + options: Any = None + value: Any = None + fieldOptions: Optional[Dict[str, Any]] = None + onChange: Any = None + onSignal: Any = None + _map_to_input: Any = None # the block input name this parameter maps to + + def to_dict(self) -> Dict[str, Any]: + data = asdict(self) + return {k: v for k, v in data.items() if not k.startswith("_") and v is not None} + + +@dataclass +class MellonNodeConfig(PushToHubMixin): + """ + A MellonNodeConfig is a base class to build Mellon nodes UI with modular diffusers. + + + + This is an experimental feature and is likely to change in the future. 
+ + + """ + + inputs: List[Union[str, MellonParam]] + model_inputs: List[Union[str, MellonParam]] + outputs: List[Union[str, MellonParam]] + blocks_names: list[str] + node_type: str + config_name = "mellon_config.json" + + def __post_init__(self): + if isinstance(self.inputs, list): + self.inputs = self._resolve_params_list(self.inputs, MELLON_INPUT_PARAMS) + if isinstance(self.model_inputs, list): + self.model_inputs = self._resolve_params_list(self.model_inputs, MELLON_MODEL_PARAMS) + if isinstance(self.outputs, list): + self.outputs = self._resolve_params_list(self.outputs, MELLON_OUTPUT_PARAMS) + + @staticmethod + def _resolve_params_list( + params: List[Union[str, MellonParam]], default_map: Dict[str, Dict[str, Any]] + ) -> Dict[str, Dict[str, Any]]: + def _resolve_param( + param: Union[str, MellonParam], default_params_map: Dict[str, Dict[str, Any]] + ) -> Tuple[str, Dict[str, Any]]: + if isinstance(param, str): + if param not in default_params_map: + raise ValueError(f"Unknown param '{param}', please define a `MellonParam` object instead") + return param, default_params_map[param].copy() + elif isinstance(param, MellonParam): + param_dict = param.to_dict() + param_name = param_dict.pop("name") + return param_name, param_dict + else: + raise ValueError( + f"Unknown param type '{type(param)}', please use a string or a `MellonParam` object instead" + ) + + resolved = {} + for p in params: + logger.info(f" Resolving param: {p}") + name, cfg = _resolve_param(p, default_map) + if name in resolved: + raise ValueError(f"Duplicate param '{name}'") + resolved[name] = cfg + return resolved + + @classmethod + @validate_hf_hub_args + def load_mellon_config( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + return_unused_kwargs=False, + return_commit_hash=False, + **kwargs, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + r""" + Load a model or scheduler configuration. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with + [`~ConfigMixin.save_config`]. + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. 
It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + return_unused_kwargs (`bool`, *optional*, defaults to `False): + Whether unused keyword arguments of the config are returned. + return_commit_hash (`bool`, *optional*, defaults to `False): + Whether the `commit_hash` of the loaded configuration are returned. + + Returns: + `dict`: + A dictionary of all the parameters stored in a JSON configuration file. + + """ + cache_dir = kwargs.pop("cache_dir", None) + local_dir = kwargs.pop("local_dir", None) + local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto") + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + + if cls.config_name is None: + raise ValueError( + "`self.config_name` is not defined. Note that one should not load a config from " + "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" + ) + if os.path.isfile(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): + # Load from a PyTorch checkpoint + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) + else: + raise EnvironmentError( + f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." + ) + else: + try: + # Load from URL or cache if already cached + config_file = hf_hub_download( + pretrained_model_name_or_path, + filename=cls.config_name, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + ) + except RepositoryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier" + " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a" + " token having permission to this repo with `token` or log in with `hf auth login`." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for" + " this model name. Check the model page at" + f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." + ) + except EntryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}." 
+ ) + except HfHubHTTPError as err: + raise EnvironmentError( + "There was a specific connection error when trying to load" + f" {pretrained_model_name_or_path}:\n{err}" + ) + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" + f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" + f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to" + " run the library in offline mode at" + " 'https://huggingface.co/docs/diffusers/installation#offline-mode'." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing a {cls.config_name} file" + ) + try: + with open(config_file, "r", encoding="utf-8") as reader: + text = reader.read() + config_dict = json.loads(text) + + commit_hash = extract_commit_hash(config_file) + except (json.JSONDecodeError, UnicodeDecodeError): + raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.") + + if not (return_unused_kwargs or return_commit_hash): + return config_dict + + outputs = (config_dict,) + + if return_unused_kwargs: + outputs += (kwargs,) + + if return_commit_hash: + outputs += (commit_hash,) + + return outputs + + def save_mellon_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save the Mellon node definition to a JSON file. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file is saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_config` + output_config_file = os.path.join(save_directory, self.config_name) + + self.to_json_file(output_config_file) + logger.info(f"Mellon node definition saved in {output_config_file}") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", None) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + subfolder = kwargs.pop("subfolder", None) + + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + subfolder=subfolder, + ) + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save the Mellon schema dictionary to a JSON file. 
+ + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file to save a configuration instance's parameters. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string of the Mellon schema dict. + + Args: + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + + mellon_dict = self.to_mellon_dict() + return json.dumps(mellon_dict, indent=2, sort_keys=True) + "\n" + + def to_mellon_dict(self) -> Dict[str, Any]: + """Return a JSON-serializable dict focusing on the Mellon schema fields only. + + params is a single flat dict composed as: {**inputs, **model_inputs, **outputs}. + """ + # inputs/model_inputs/outputs are already normalized dicts + merged_params = {} + merged_params.update(self.inputs or {}) + merged_params.update(self.model_inputs or {}) + merged_params.update(self.outputs or {}) + + return { + "node_type": self.node_type, + "blocks_names": self.blocks_names, + "params": merged_params, + } + + @classmethod + def from_mellon_dict(cls, mellon_dict: Dict[str, Any]) -> "MellonNodeConfig": + """Create a config from a Mellon schema dict produced by to_mellon_dict(). + + Splits the flat params dict back into inputs/model_inputs/outputs using the known key spaces from + MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS. Unknown keys are treated as inputs by + default. + """ + flat_params = mellon_dict.get("params", {}) + + inputs: Dict[str, Any] = {} + model_inputs: Dict[str, Any] = {} + outputs: Dict[str, Any] = {} + + for param_name, param_dict in flat_params.items(): + if param_dict.get("display", "") == "output": + outputs[param_name] = param_dict + elif param_dict.get("type", "") in ("diffusers_auto_model", "diffusers_auto_models"): + model_inputs[param_name] = param_dict + else: + inputs[param_name] = param_dict + + return cls( + inputs=inputs, + model_inputs=model_inputs, + outputs=outputs, + blocks_names=mellon_dict.get("blocks_names", []), + node_type=mellon_dict.get("node_type"), + ) + + # YiYi Notes: not used yet + @classmethod + def from_blocks(cls, blocks: ModularPipelineBlocks, node_type: str) -> "MellonNodeConfig": + """ + Create an instance from a ModularPipeline object. If a preset exists in NODE_TYPE_PARAMS_MAP for the node_type, + use it; otherwise fall back to deriving lists from the pipeline's expected inputs/components/outputs. 
+ """ + if node_type not in NODE_TYPE_PARAMS_MAP: + raise ValueError(f"Node type {node_type} not supported") + + blocks_names = list(blocks.sub_blocks.keys()) + + default_node_config = NODE_TYPE_PARAMS_MAP[node_type] + inputs_list: List[Union[str, MellonParam]] = default_node_config.get("inputs", []) + model_inputs_list: List[Union[str, MellonParam]] = default_node_config.get("model_inputs", []) + outputs_list: List[Union[str, MellonParam]] = default_node_config.get("outputs", []) + + for required_input_name in blocks.required_inputs: + if required_input_name not in inputs_list: + inputs_list.append( + MellonParam( + name=required_input_name, label=required_input_name, type=required_input_name, display="input" + ) + ) + + for component_spec in blocks.expected_components: + if component_spec.name not in model_inputs_list: + model_inputs_list.append( + MellonParam( + name=component_spec.name, + label=component_spec.name, + type="diffusers_auto_model", + display="input", + ) + ) + + return cls( + inputs=inputs_list, + model_inputs=model_inputs_list, + outputs=outputs_list, + blocks_names=blocks_names, + node_type=node_type, + ) + + +# Minimal modular registry for Mellon node configs +class ModularMellonNodeRegistry: + """Registry mapping (pipeline class, blocks_name) -> list of MellonNodeConfig.""" + + def __init__(self): + self._registry = {} + self._initialized = False + + def register(self, pipeline_cls: type, node_params: Dict[str, MellonNodeConfig]): + if not self._initialized: + _initialize_registry(self) + self._registry[pipeline_cls] = node_params + + def get(self, pipeline_cls: type) -> MellonNodeConfig: + if not self._initialized: + _initialize_registry(self) + return self._registry.get(pipeline_cls, None) + + def get_all(self) -> Dict[type, Dict[str, MellonNodeConfig]]: + if not self._initialized: + _initialize_registry(self) + return self._registry + + +def _register_preset_node_types( + pipeline_cls, params_map: Dict[str, Dict[str, Any]], registry: ModularMellonNodeRegistry +): + """Register all node-type presets for a given pipeline class from a params map.""" + node_configs = {} + for node_type, spec in params_map.items(): + node_config = MellonNodeConfig( + inputs=spec.get("inputs", []), + model_inputs=spec.get("model_inputs", []), + outputs=spec.get("outputs", []), + blocks_names=spec.get("block_names", []), + node_type=node_type, + ) + node_configs[node_type] = node_config + registry.register(pipeline_cls, node_configs) + + +def _initialize_registry(registry: ModularMellonNodeRegistry): + """Initialize the registry and register all available pipeline configs.""" + print("Initializing registry") + + registry._initialized = True + + try: + from .qwenimage.modular_pipeline import QwenImageModularPipeline + from .qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP + + _register_preset_node_types(QwenImageModularPipeline, QwenImage_NODE_TYPES_PARAMS_MAP, registry) + except Exception: + raise Exception("Failed to register QwenImageModularPipeline") + + try: + from .stable_diffusion_xl.modular_pipeline import StableDiffusionXLModularPipeline + from .stable_diffusion_xl.node_utils import SDXL_NODE_TYPES_PARAMS_MAP + + _register_preset_node_types(StableDiffusionXLModularPipeline, SDXL_NODE_TYPES_PARAMS_MAP, registry) + except Exception: + raise Exception("Failed to register StableDiffusionXLModularPipeline") diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 74ffc6234894..206d19f17371 100644 --- 
a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -51,6 +51,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# map regular pipeline to modular pipeline class name MODULAR_PIPELINE_MAPPING = OrderedDict( [ ("stable-diffusion-xl", "StableDiffusionXLModularPipeline"), @@ -61,16 +62,6 @@ ] ) -MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict( - [ - ("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"), - ("WanModularPipeline", "WanAutoBlocks"), - ("FluxModularPipeline", "FluxAutoBlocks"), - ("QwenImageModularPipeline", "QwenImageAutoBlocks"), - ("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"), - ] -) - @dataclass class PipelineState: @@ -423,7 +414,7 @@ def set_block_state(self, state: PipelineState, block_state: BlockState): state.set(input_param.name, param, input_param.kwargs_type) elif input_param.kwargs_type: - # if it is a kwargs type, e.g. "guider_input_fields", it is likely to be a list of parameters + # if it is a kwargs type, e.g. "denoiser_input_fields", it is likely to be a list of parameters # we need to first find out which inputs are and loop through them. intermediate_kwargs = state.get_by_kwargs(input_param.kwargs_type) for param_name, current_value in intermediate_kwargs.items(): @@ -1454,6 +1445,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin): config_name = "modular_model_index.json" hf_device_map = None + default_blocks_name = None # YiYi TODO: add warning for passing multiple ComponentSpec/ConfigSpec with the same name def __init__( @@ -1514,7 +1506,7 @@ def __init__( `_blocks_class_name` in the config dict """ if blocks is None: - blocks_class_name = MODULAR_PIPELINE_BLOCKS_MAPPING.get(self.__class__.__name__) + blocks_class_name = self.default_blocks_name if blocks_class_name is not None: diffusers_module = importlib.import_module("diffusers") blocks_class = getattr(diffusers_module, blocks_class_name) diff --git a/src/diffusers/modular_pipelines/node_utils.py b/src/diffusers/modular_pipelines/node_utils.py deleted file mode 100644 index 5db860c7887d..000000000000 --- a/src/diffusers/modular_pipelines/node_utils.py +++ /dev/null @@ -1,665 +0,0 @@ -import json -import logging -import os -from pathlib import Path -from typing import List, Optional, Tuple, Union - -import numpy as np -import PIL -import torch - -from ..configuration_utils import ConfigMixin -from ..image_processor import PipelineImageInput -from .modular_pipeline import ModularPipelineBlocks, SequentialPipelineBlocks -from .modular_pipeline_utils import InputParam - - -logger = logging.getLogger(__name__) - -# YiYi Notes: this is actually for SDXL, put it here for now -SDXL_INPUTS_SCHEMA = { - "prompt": InputParam( - "prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation" - ), - "prompt_2": InputParam( - "prompt_2", - type_hint=Union[str, List[str]], - description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2", - ), - "negative_prompt": InputParam( - "negative_prompt", - type_hint=Union[str, List[str]], - description="The prompt or prompts not to guide the image generation", - ), - "negative_prompt_2": InputParam( - "negative_prompt_2", - type_hint=Union[str, List[str]], - description="The negative prompt or prompts for text_encoder_2", - ), - "cross_attention_kwargs": InputParam( - "cross_attention_kwargs", - type_hint=Optional[dict], - description="Kwargs dictionary passed to the AttentionProcessor", - ), - 
"clip_skip": InputParam( - "clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder" - ), - "image": InputParam( - "image", - type_hint=PipelineImageInput, - required=True, - description="The image(s) to modify for img2img or inpainting", - ), - "mask_image": InputParam( - "mask_image", - type_hint=PipelineImageInput, - required=True, - description="Mask image for inpainting, white pixels will be repainted", - ), - "generator": InputParam( - "generator", - type_hint=Optional[Union[torch.Generator, List[torch.Generator]]], - description="Generator(s) for deterministic generation", - ), - "height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"), - "width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"), - "num_images_per_prompt": InputParam( - "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt" - ), - "num_inference_steps": InputParam( - "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps" - ), - "timesteps": InputParam( - "timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process" - ), - "sigmas": InputParam( - "sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process" - ), - "denoising_end": InputParam( - "denoising_end", - type_hint=Optional[float], - description="Fraction of denoising process to complete before termination", - ), - # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999 - "strength": InputParam( - "strength", type_hint=float, default=0.3, description="How much to transform the reference image" - ), - "denoising_start": InputParam( - "denoising_start", type_hint=Optional[float], description="Starting point of the denoising process" - ), - "latents": InputParam( - "latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation" - ), - "padding_mask_crop": InputParam( - "padding_mask_crop", - type_hint=Optional[Tuple[int, int]], - description="Size of margin in crop for image and mask", - ), - "original_size": InputParam( - "original_size", - type_hint=Optional[Tuple[int, int]], - description="Original size of the image for SDXL's micro-conditioning", - ), - "target_size": InputParam( - "target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning" - ), - "negative_original_size": InputParam( - "negative_original_size", - type_hint=Optional[Tuple[int, int]], - description="Negative conditioning based on image resolution", - ), - "negative_target_size": InputParam( - "negative_target_size", - type_hint=Optional[Tuple[int, int]], - description="Negative conditioning based on target resolution", - ), - "crops_coords_top_left": InputParam( - "crops_coords_top_left", - type_hint=Tuple[int, int], - default=(0, 0), - description="Top-left coordinates for SDXL's micro-conditioning", - ), - "negative_crops_coords_top_left": InputParam( - "negative_crops_coords_top_left", - type_hint=Tuple[int, int], - default=(0, 0), - description="Negative conditioning crop coordinates", - ), - "aesthetic_score": InputParam( - "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image" - ), - "negative_aesthetic_score": InputParam( - "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score" 
- ), - "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"), - "output_type": InputParam( - "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)" - ), - "ip_adapter_image": InputParam( - "ip_adapter_image", - type_hint=PipelineImageInput, - required=True, - description="Image(s) to be used as IP adapter", - ), - "control_image": InputParam( - "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition" - ), - "control_guidance_start": InputParam( - "control_guidance_start", - type_hint=Union[float, List[float]], - default=0.0, - description="When ControlNet starts applying", - ), - "control_guidance_end": InputParam( - "control_guidance_end", - type_hint=Union[float, List[float]], - default=1.0, - description="When ControlNet stops applying", - ), - "controlnet_conditioning_scale": InputParam( - "controlnet_conditioning_scale", - type_hint=Union[float, List[float]], - default=1.0, - description="Scale factor for ControlNet outputs", - ), - "guess_mode": InputParam( - "guess_mode", - type_hint=bool, - default=False, - description="Enables ControlNet encoder to recognize input without prompts", - ), - "control_mode": InputParam( - "control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet" - ), -} - -SDXL_INTERMEDIATE_INPUTS_SCHEMA = { - "prompt_embeds": InputParam( - "prompt_embeds", - type_hint=torch.Tensor, - required=True, - description="Text embeddings used to guide image generation", - ), - "negative_prompt_embeds": InputParam( - "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings" - ), - "pooled_prompt_embeds": InputParam( - "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings" - ), - "negative_pooled_prompt_embeds": InputParam( - "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings" - ), - "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"), - "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"), - "preprocess_kwargs": InputParam( - "preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor" - ), - "latents": InputParam( - "latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process" - ), - "timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"), - "num_inference_steps": InputParam( - "num_inference_steps", type_hint=int, required=True, description="Number of denoising steps" - ), - "latent_timestep": InputParam( - "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep" - ), - "image_latents": InputParam( - "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image" - ), - "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"), - "masked_image_latents": InputParam( - "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting" - ), - "add_time_ids": InputParam( - "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning" - ), - "negative_add_time_ids": InputParam( - "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids" - ), - 
"timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"), - "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"), - "crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"), - "ip_adapter_embeds": InputParam( - "ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter" - ), - "negative_ip_adapter_embeds": InputParam( - "negative_ip_adapter_embeds", - type_hint=List[torch.Tensor], - description="Negative image embeddings for IP-Adapter", - ), - "images": InputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - required=True, - description="Generated images", - ), -} - -SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA} - - -DEFAULT_PARAM_MAPS = { - "prompt": { - "label": "Prompt", - "type": "string", - "default": "a bear sitting in a chair drinking a milkshake", - "display": "textarea", - }, - "negative_prompt": { - "label": "Negative Prompt", - "type": "string", - "default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality", - "display": "textarea", - }, - "num_inference_steps": { - "label": "Steps", - "type": "int", - "default": 25, - "min": 1, - "max": 1000, - }, - "seed": { - "label": "Seed", - "type": "int", - "default": 0, - "min": 0, - "display": "random", - }, - "width": { - "label": "Width", - "type": "int", - "display": "text", - "default": 1024, - "min": 8, - "max": 8192, - "step": 8, - "group": "dimensions", - }, - "height": { - "label": "Height", - "type": "int", - "display": "text", - "default": 1024, - "min": 8, - "max": 8192, - "step": 8, - "group": "dimensions", - }, - "images": { - "label": "Images", - "type": "image", - "display": "output", - }, - "image": { - "label": "Image", - "type": "image", - "display": "input", - }, -} - -DEFAULT_TYPE_MAPS = { - "int": { - "type": "int", - "default": 0, - "min": 0, - }, - "float": { - "type": "float", - "default": 0.0, - "min": 0.0, - }, - "str": { - "type": "string", - "default": "", - }, - "bool": { - "type": "boolean", - "default": False, - }, - "image": { - "type": "image", - }, -} - -DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"] -DEFAULT_CATEGORY = "Modular Diffusers" -DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"] -DEFAULT_PARAMS_GROUPS_KEYS = { - "text_encoders": ["text_encoder", "tokenizer"], - "ip_adapter_embeds": ["ip_adapter_embeds"], - "prompt_embeddings": ["prompt_embeds"], -} - - -def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS): - """ - Get the group name for a given parameter name, if not part of a group, return None e.g. "prompt_embeds" -> - "text_embeds", "text_encoder" -> "text_encoders", "prompt" -> None - """ - if name is None: - return None - for group_name, group_keys in group_params_keys.items(): - for group_key in group_keys: - if group_key in name: - return group_name - return None - - -class ModularNode(ConfigMixin): - """ - A ModularNode is a base class to build UI nodes using diffusers. Currently only supports Mellon. It is a wrapper - around a ModularPipelineBlocks object. - - - - This is an experimental feature and is likely to change in the future. 
- - - """ - - config_name = "node_config.json" - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - trust_remote_code: Optional[bool] = None, - **kwargs, - ): - blocks = ModularPipelineBlocks.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs - ) - return cls(blocks, **kwargs) - - def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs): - self.blocks = blocks - - if label is None: - label = self.blocks.__class__.__name__ - # blocks param name -> mellon param name - self.name_mapping = {} - - input_params = {} - # pass or create a default param dict for each input - # e.g. for prompt, - # prompt = { - # "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers - # "label": "Prompt", - # "type": "string", - # "default": "a bear sitting in a chair drinking a milkshake", - # "display": "textarea"} - # if type is not specified, it'll be a "custom" param of its own type - # e.g. you can pass ModularNode(scheduler = {name :"scheduler"}) - # it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}} - # name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}} - inputs = self.blocks.inputs + self.blocks.intermediate_inputs - for inp in inputs: - param = kwargs.pop(inp.name, None) - if param: - # user can pass a param dict for all inputs, e.g. ModularNode(prompt = {...}) - input_params[inp.name] = param - mellon_name = param.pop("name", inp.name) - if mellon_name != inp.name: - self.name_mapping[inp.name] = mellon_name - continue - - if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name): - continue - - if inp.name in DEFAULT_PARAM_MAPS: - # first check if it's in the default param map, if so, directly use that - param = DEFAULT_PARAM_MAPS[inp.name].copy() - elif get_group_name(inp.name): - param = get_group_name(inp.name) - if inp.name not in self.name_mapping: - self.name_mapping[inp.name] = param - else: - # if not, check if it's in the SDXL input schema, if so, - # 1. use the type hint to determine the type - # 2. use the default param dict for the type e.g. 
if "steps" is a "int" type, {"steps": {"type": "int", "default": 0, "min": 0}} - if inp.type_hint is not None: - type_str = str(inp.type_hint).lower() - else: - inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None) - type_str = str(inp_spec.type_hint).lower() if inp_spec else "" - for type_key, type_param in DEFAULT_TYPE_MAPS.items(): - if type_key in type_str: - param = type_param.copy() - param["label"] = inp.name - param["display"] = "input" - break - else: - param = inp.name - # add the param dict to the inp_params dict - input_params[inp.name] = param - - component_params = {} - for comp in self.blocks.expected_components: - param = kwargs.pop(comp.name, None) - if param: - component_params[comp.name] = param - mellon_name = param.pop("name", comp.name) - if mellon_name != comp.name: - self.name_mapping[comp.name] = mellon_name - continue - - to_exclude = False - for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS: - if exclude_key in comp.name: - to_exclude = True - break - if to_exclude: - continue - - if get_group_name(comp.name): - param = get_group_name(comp.name) - if comp.name not in self.name_mapping: - self.name_mapping[comp.name] = param - elif comp.name in DEFAULT_MODEL_KEYS: - param = {"label": comp.name, "type": "diffusers_auto_model", "display": "input"} - else: - param = comp.name - # add the param dict to the model_params dict - component_params[comp.name] = param - - output_params = {} - if isinstance(self.blocks, SequentialPipelineBlocks): - last_block_name = list(self.blocks.sub_blocks.keys())[-1] - outputs = self.blocks.sub_blocks[last_block_name].intermediate_outputs - else: - outputs = self.blocks.intermediate_outputs - - for out in outputs: - param = kwargs.pop(out.name, None) - if param: - output_params[out.name] = param - mellon_name = param.pop("name", out.name) - if mellon_name != out.name: - self.name_mapping[out.name] = mellon_name - continue - - if out.name in DEFAULT_PARAM_MAPS: - param = DEFAULT_PARAM_MAPS[out.name].copy() - param["display"] = "output" - else: - group_name = get_group_name(out.name) - if group_name: - param = group_name - if out.name not in self.name_mapping: - self.name_mapping[out.name] = param - else: - param = out.name - # add the param dict to the outputs dict - output_params[out.name] = param - - if len(kwargs) > 0: - logger.warning(f"Unused kwargs: {kwargs}") - - register_dict = { - "category": category, - "label": label, - "input_params": input_params, - "component_params": component_params, - "output_params": output_params, - "name_mapping": self.name_mapping, - } - self.register_to_config(**register_dict) - - def setup(self, components_manager, collection=None): - self.pipeline = self.blocks.init_pipeline(components_manager=components_manager, collection=collection) - self._components_manager = components_manager - - @property - def mellon_config(self): - return self._convert_to_mellon_config() - - def _convert_to_mellon_config(self): - node = {} - node["label"] = self.config.label - node["category"] = self.config.category - - node_param = {} - for inp_name, inp_param in self.config.input_params.items(): - if inp_name in self.name_mapping: - mellon_name = self.name_mapping[inp_name] - else: - mellon_name = inp_name - if isinstance(inp_param, str): - param = { - "label": inp_param, - "type": inp_param, - "display": "input", - } - else: - param = inp_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}") - - for 
comp_name, comp_param in self.config.component_params.items(): - if comp_name in self.name_mapping: - mellon_name = self.name_mapping[comp_name] - else: - mellon_name = comp_name - if isinstance(comp_param, str): - param = { - "label": comp_param, - "type": comp_param, - "display": "input", - } - else: - param = comp_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}") - - for out_name, out_param in self.config.output_params.items(): - if out_name in self.name_mapping: - mellon_name = self.name_mapping[out_name] - else: - mellon_name = out_name - if isinstance(out_param, str): - param = { - "label": out_param, - "type": out_param, - "display": "output", - } - else: - param = out_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}") - node["params"] = node_param - return node - - def save_mellon_config(self, file_path): - """ - Save the Mellon configuration to a JSON file. - - Args: - file_path (str or Path): Path where the JSON file will be saved - - Returns: - Path: Path to the saved config file - """ - file_path = Path(file_path) - - # Create directory if it doesn't exist - os.makedirs(file_path.parent, exist_ok=True) - - # Create a combined dictionary with module definition and name mapping - config = {"module": self.mellon_config, "name_mapping": self.name_mapping} - - # Save the config to file - with open(file_path, "w", encoding="utf-8") as f: - json.dump(config, f, indent=2) - - logger.info(f"Mellon config and name mapping saved to {file_path}") - - return file_path - - @classmethod - def load_mellon_config(cls, file_path): - """ - Load a Mellon configuration from a JSON file. 
- - Args: - file_path (str or Path): Path to the JSON file containing Mellon config - - Returns: - dict: The loaded combined configuration containing 'module' and 'name_mapping' - """ - file_path = Path(file_path) - - if not file_path.exists(): - raise FileNotFoundError(f"Config file not found: {file_path}") - - with open(file_path, "r", encoding="utf-8") as f: - config = json.load(f) - - logger.info(f"Mellon config loaded from {file_path}") - - return config - - def process_inputs(self, **kwargs): - params_components = {} - for comp_name, comp_param in self.config.component_params.items(): - logger.debug(f"component: {comp_name}") - mellon_comp_name = self.name_mapping.get(comp_name, comp_name) - if mellon_comp_name in kwargs: - if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]: - comp = kwargs[mellon_comp_name].pop(comp_name) - else: - comp = kwargs.pop(mellon_comp_name) - if comp: - params_components[comp_name] = self._components_manager.get_one(comp["model_id"]) - - params_run = {} - for inp_name, inp_param in self.config.input_params.items(): - logger.debug(f"input: {inp_name}") - mellon_inp_name = self.name_mapping.get(inp_name, inp_name) - if mellon_inp_name in kwargs: - if isinstance(kwargs[mellon_inp_name], dict) and inp_name in kwargs[mellon_inp_name]: - inp = kwargs[mellon_inp_name].pop(inp_name) - else: - inp = kwargs.pop(mellon_inp_name) - if inp is not None: - params_run[inp_name] = inp - - return_output_names = list(self.config.output_params.keys()) - - return params_components, params_run, return_output_names - - def execute(self, **kwargs): - params_components, params_run, return_output_names = self.process_inputs(**kwargs) - - self.pipeline.update_components(**params_components) - output = self.pipeline(**params_run, output=return_output_names) - return output diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 738a1e5d151d..606236cfe91b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -577,9 +577,8 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam(name="batch_size", required=True), - InputParam( - name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input" - ), + InputParam(name="image_height", required=True), + InputParam(name="image_width", required=True), InputParam(name="height", required=True), InputParam(name="width", required=True), InputParam(name="prompt_embeds_mask"), @@ -612,10 +611,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state = self.get_block_state(state) # for edit, image size can be different from the target size (height/width) - image = ( - block_state.resized_image[0] if isinstance(block_state.resized_image, list) else block_state.resized_image - ) - image_width, image_height = image.size block_state.img_shapes = [ [ @@ -624,7 +619,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state.height // components.vae_scale_factor // 2, block_state.width // components.vae_scale_factor // 2, ), - (1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2), + ( + 1, + block_state.image_height // components.vae_scale_factor // 2, + block_state.image_width // components.vae_scale_factor // 2, + ), ] ] * block_state.batch_size diff 
--git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 280fa6a152c4..2ab83a03ee55 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -496,7 +496,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): ) if components.requires_unconditional_embeds: - negative_prompt = block_state.negative_prompt or "" + negative_prompt = block_state.negative_prompt or " " block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit( components.text_encoder, components.processor, diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 2b787c823865..2b229c040b89 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -307,6 +307,13 @@ def inputs(self) -> List[InputParam]: return inputs + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="image_height", type_hint=int, description="The height of the image latents"), + OutputParam(name="image_width", type_hint=int, description="The width of the image latents"), + ] + @property def expected_components(self) -> List[ComponentSpec]: return [ @@ -327,6 +334,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state.height = block_state.height or height block_state.width = block_state.width or width + if not hasattr(block_state, "image_height"): + block_state.image_height = height + if not hasattr(block_state, "image_width"): + block_state.image_width = width + # 2. Patchify the image latent tensor image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py index a01c742fcf68..9126766cc202 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py @@ -511,17 +511,42 @@ def description(self): ) +class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = [ + QwenImageAutoInputStep, + QwenImageOptionalControlNetInputStep, + QwenImageAutoBeforeDenoiseStep, + QwenImageOptionalControlNetBeforeDenoiseStep, + QwenImageAutoDenoiseStep, + ] + block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" + + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n" + + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" + + " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n" + + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n" + + " - for image-to-image generation, you need to provide `image_latents`\n" + + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n" + + " - to run the controlnet workflow, you need to provide `control_image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings" + ) + + ## 1.10 QwenImage/auto block & presets AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageTextEncoderStep()), ("vae_encoder", QwenImageAutoVaeEncoderStep()), ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), - ("input", QwenImageAutoInputStep()), - ("controlnet_input", QwenImageOptionalControlNetInputStep()), - ("before_denoise", QwenImageAutoBeforeDenoiseStep()), - ("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()), - ("denoise", QwenImageAutoDenoiseStep()), + ("denoise", QwenImageCoreDenoiseStep()), ("decode", QwenImageAutoDecodeStep()), ] ) @@ -699,7 +724,7 @@ def description(self): class QwenImageEditAutoInputStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep] block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] @property def description(self): @@ -800,13 +825,34 @@ def description(self): ## 2.7 QwenImage-Edit/auto blocks & presets + +class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditAutoInputStep, + QwenImageEditAutoBeforeDenoiseStep, + QwenImageEditAutoDenoiseStep, + ] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n" + + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n" + + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" + ) + + EDIT_AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageEditVLEncoderStep()), ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), - ("input", QwenImageEditAutoInputStep()), - ("before_denoise", QwenImageEditAutoBeforeDenoiseStep()), - ("denoise", QwenImageEditAutoDenoiseStep()), + ("denoise", QwenImageEditCoreDenoiseStep()), ("decode", QwenImageAutoDecodeStep()), ] ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index fe9757f41bcc..3248d131590f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -104,6 +104,8 @@ class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ + default_blocks_name = "QwenImageAutoBlocks" + @property def default_height(self): return self.default_sample_size * self.vae_scale_factor @@ -158,6 +160,8 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ + default_blocks_name = "QwenImageEditAutoBlocks" + # YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step. @property def default_height(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/node_utils.py b/src/diffusers/modular_pipelines/qwenimage/node_utils.py new file mode 100644 index 000000000000..3230ece68abc --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/node_utils.py @@ -0,0 +1,95 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# mellon nodes +QwenImage_NODE_TYPES_PARAMS_MAP = { + "controlnet": { + "inputs": [ + "control_image", + "controlnet_conditioning_scale", + "control_guidance_start", + "control_guidance_end", + "height", + "width", + ], + "model_inputs": [ + "controlnet", + "vae", + ], + "outputs": [ + "controlnet_out", + ], + "block_names": ["controlnet_vae_encoder"], + }, + "denoise": { + "inputs": [ + "embeddings", + "width", + "height", + "seed", + "num_inference_steps", + "guidance_scale", + "image_latents", + "strength", + "controlnet", + ], + "model_inputs": [ + "unet", + "guider", + "scheduler", + ], + "outputs": [ + "latents", + "latents_preview", + ], + "block_names": ["denoise"], + }, + "vae_encoder": { + "inputs": [ + "image", + "width", + "height", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "image_latents", + ], + }, + "text_encoder": { + "inputs": [ + "prompt", + "negative_prompt", + ], + "model_inputs": [ + "text_encoders", + ], + "outputs": [ + "embeddings", + ], + }, + "decoder": { + "inputs": [ + "latents", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "images", + ], + }, +} diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py index fefa622f1a61..70cbf0c1c78d 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py @@ -262,37 +262,37 @@ def intermediate_outputs(self) -> List[str]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="text embeddings used to guide the image generation", ), OutputParam( "negative_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="negative text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="pooled text embeddings used to guide the image generation", ), OutputParam( "negative_pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="negative pooled text embeddings used to guide the image generation", ), OutputParam( "ip_adapter_embeds", type_hint=List[torch.Tensor], - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="image embeddings for IP-Adapter", ), OutputParam( "negative_ip_adapter_embeds", type_hint=List[torch.Tensor], - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for 
guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="negative image embeddings for IP-Adapter", ), ] @@ -1120,13 +1120,13 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( "add_time_ids", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="The time ids to condition the denoising process", ), OutputParam( "negative_add_time_ids", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="The negative time ids to condition the denoising process", ), OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"), @@ -1331,13 +1331,13 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( "add_time_ids", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="The time ids to condition the denoising process", ), OutputParam( "negative_add_time_ids", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="The negative time ids to condition the denoising process", ), OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"), diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py index a2e142059532..8a8025747332 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py @@ -183,14 +183,14 @@ def inputs(self) -> List[Tuple[str, Any]]: description="The guidance scale embedding to use for Latent Consistency Models(LCMs). Can be generated in prepare_additional_conditioning step.", ), InputParam( - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description=( "All conditional model inputs that need to be prepared with guider. " "It should contain prompt_embeds/negative_prompt_embeds, " "add_time_ids/negative_add_time_ids, " "pooled_prompt_embeds/negative_pooled_prompt_embeds, " "and ip_adapter_embeds/negative_ip_adapter_embeds (optional)." - "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" + "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" ), ), ] @@ -307,14 +307,14 @@ def inputs(self) -> List[Tuple[str, Any]]: description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), InputParam( - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description=( "All conditional model inputs that need to be prepared with guider. " "It should contain prompt_embeds/negative_prompt_embeds, " "add_time_ids/negative_add_time_ids, " "pooled_prompt_embeds/negative_pooled_prompt_embeds, " "and ip_adapter_embeds/negative_ip_adapter_embeds (optional)." 
- "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" + "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" ), ), InputParam( diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py index 1e8921d363c1..90b254b6f5d4 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py @@ -258,25 +258,25 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="text embeddings used to guide the image generation", ), OutputParam( "negative_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="negative text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="pooled text embeddings used to guide the image generation", ), OutputParam( "negative_pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="negative pooled text embeddings used to guide the image generation", ), ] diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py index c9033856bcc0..68b5e33755b5 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py @@ -82,19 +82,17 @@ def description(self): # before_denoise: text2img class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ - StableDiffusionXLInputStep, StableDiffusionXLSetTimestepsStep, StableDiffusionXLPrepareLatentsStep, StableDiffusionXLPrepareAdditionalConditioningStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"] + block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"] @property def description(self): return ( "Before denoise step that prepare the inputs for the denoise step.\n" + "This is a sequential pipeline blocks:\n" - + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n" + " - `StableDiffusionXLSetTimestepsStep` is used to set the timesteps\n" + " - `StableDiffusionXLPrepareLatentsStep` is used to prepare the latents\n" + " - `StableDiffusionXLPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n" @@ -104,19 +102,17 @@ def description(self): # before_denoise: img2img class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ - StableDiffusionXLInputStep, StableDiffusionXLImg2ImgSetTimestepsStep, StableDiffusionXLImg2ImgPrepareLatentsStep, StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"] + block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"] @property def description(self): return ( "Before denoise step that prepare the inputs for the denoise step for img2img task.\n" + "This is a sequential pipeline 
blocks:\n" - + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n" + " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n" + " - `StableDiffusionXLImg2ImgPrepareLatentsStep` is used to prepare the latents\n" + " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n" @@ -126,19 +122,17 @@ def description(self): # before_denoise: inpainting class StableDiffusionXLInpaintBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ - StableDiffusionXLInputStep, StableDiffusionXLImg2ImgSetTimestepsStep, StableDiffusionXLInpaintPrepareLatentsStep, StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"] + block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"] @property def description(self): return ( "Before denoise step that prepare the inputs for the denoise step for inpainting task.\n" + "This is a sequential pipeline blocks:\n" - + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n" + " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n" + " - `StableDiffusionXLInpaintPrepareLatentsStep` is used to prepare the latents\n" + " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n" @@ -255,25 +249,48 @@ def description(self): ) +class StableDiffusionXLCoreDenoiseStep(SequentialPipelineBlocks): + block_classes = [ + StableDiffusionXLInputStep, + StableDiffusionXLAutoBeforeDenoiseStep, + StableDiffusionXLAutoControlNetInputStep, + StableDiffusionXLAutoDenoiseStep, + ] + block_names = ["input", "before_denoise", "controlnet_input", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `StableDiffusionXLInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `StableDiffusionXLAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `StableDiffusionXLAutoControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" + + " - `StableDiffusionXLAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support text-to-image, image-to-image, inpainting, with or without controlnet/controlnet_union/ip_adapter for Stable Diffusion XL:\n" + + "- for image-to-image generation, you need to provide `image_latents`\n" + + "- for inpainting, you need to provide `mask_image` and `image_latents`\n" + + "- to run the controlnet workflow, you need to provide `control_image`\n" + + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n" + + "- to run the ip_adapter workflow, you need to load ip_adapter into your unet and provide `ip_adapter_embeds`\n" + + "- for text-to-image generation, all you need to provide is prompt embeddings\n" + ) + + # ip-adapter, controlnet, text2img, img2img, inpainting class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): block_classes = [ StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, StableDiffusionXLAutoVaeEncoderStep, - StableDiffusionXLAutoBeforeDenoiseStep, - StableDiffusionXLAutoControlNetInputStep, - StableDiffusionXLAutoDenoiseStep, + StableDiffusionXLCoreDenoiseStep, StableDiffusionXLAutoDecodeStep, ] block_names = [ "text_encoder", "ip_adapter", - "image_encoder", - "before_denoise", - "controlnet_input", + "vae_encoder", "denoise", - "decoder", + "decode", ] @property @@ -321,7 +338,7 @@ def description(self): IMAGE2IMAGE_BLOCKS = InsertableDict( [ ("text_encoder", StableDiffusionXLTextEncoderStep), - ("image_encoder", StableDiffusionXLVaeEncoderStep), + ("vae_encoder", StableDiffusionXLVaeEncoderStep), ("input", StableDiffusionXLInputStep), ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep), @@ -334,7 +351,7 @@ def description(self): INPAINT_BLOCKS = InsertableDict( [ ("text_encoder", StableDiffusionXLTextEncoderStep), - ("image_encoder", StableDiffusionXLInpaintVaeEncoderStep), + ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep), ("input", StableDiffusionXLInputStep), ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep), @@ -361,10 +378,8 @@ def description(self): [ ("text_encoder", StableDiffusionXLTextEncoderStep), ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ("image_encoder", StableDiffusionXLAutoVaeEncoderStep), - ("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep), - ("controlnet_input", StableDiffusionXLAutoControlNetInputStep), - ("denoise", StableDiffusionXLAutoDenoiseStep), + ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep), + ("denoise", StableDiffusionXLCoreDenoiseStep), ("decode", StableDiffusionXLAutoDecodeStep), ] ) diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py index e84f5cad1ab4..29a717f72e59 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py @@ -54,6 +54,8 @@ class StableDiffusionXLModularPipeline( """ + default_blocks_name = "StableDiffusionXLAutoBlocks" + @property 
def default_height(self): return self.default_sample_size * self.vae_scale_factor diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py new file mode 100644 index 000000000000..3e788bf94741 --- /dev/null +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py @@ -0,0 +1,99 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +SDXL_NODE_TYPES_PARAMS_MAP = { + "controlnet": { + "inputs": [ + "control_image", + "controlnet_conditioning_scale", + "control_guidance_start", + "control_guidance_end", + "height", + "width", + ], + "model_inputs": [ + "controlnet", + ], + "outputs": [ + "controlnet_out", + ], + "block_names": [None], + }, + "denoise": { + "inputs": [ + "embeddings", + "width", + "height", + "seed", + "num_inference_steps", + "guidance_scale", + "image_latents", + "strength", + # custom adapters coming in as inputs + "controlnet", + # ip_adapter is optional and custom; include if available + "ip_adapter", + ], + "model_inputs": [ + "unet", + "guider", + "scheduler", + ], + "outputs": [ + "latents", + "latents_preview", + ], + "block_names": ["denoise"], + }, + "vae_encoder": { + "inputs": [ + "image", + "width", + "height", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "image_latents", + ], + "block_names": ["vae_encoder"], + }, + "text_encoder": { + "inputs": [ + "prompt", + "negative_prompt", + ], + "model_inputs": [ + "text_encoders", + ], + "outputs": [ + "embeddings", + ], + "block_names": ["text_encoder"], + }, + "decoder": { + "inputs": [ + "latents", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "images", + ], + "block_names": ["decode"], + }, +} diff --git a/src/diffusers/modular_pipelines/wan/before_denoise.py b/src/diffusers/modular_pipelines/wan/before_denoise.py index 2b9889f8778a..d48f678edd59 100644 --- a/src/diffusers/modular_pipelines/wan/before_denoise.py +++ b/src/diffusers/modular_pipelines/wan/before_denoise.py @@ -146,13 +146,13 @@ def intermediate_outputs(self) -> List[str]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="text embeddings used to guide the image generation", ), OutputParam( "negative_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="negative text embeddings used to guide the image generation", ), ] diff --git a/src/diffusers/modular_pipelines/wan/denoise.py b/src/diffusers/modular_pipelines/wan/denoise.py index 5f578609c24f..66c51493bd6a 100644 --- 
a/src/diffusers/modular_pipelines/wan/denoise.py +++ b/src/diffusers/modular_pipelines/wan/denoise.py @@ -79,11 +79,11 @@ def intermediate_inputs(self) -> List[str]: description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), InputParam( - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description=( "All conditional model inputs that need to be prepared with guider. " "It should contain prompt_embeds/negative_prompt_embeds. " - "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" + "Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" ), ), ] diff --git a/src/diffusers/modular_pipelines/wan/encoders.py b/src/diffusers/modular_pipelines/wan/encoders.py index a0bf76b99b55..cb2fc242383c 100644 --- a/src/diffusers/modular_pipelines/wan/encoders.py +++ b/src/diffusers/modular_pipelines/wan/encoders.py @@ -89,13 +89,13 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="text embeddings used to guide the image generation", ), OutputParam( "negative_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="negative text embeddings used to guide the image generation", ), ] diff --git a/src/diffusers/modular_pipelines/wan/modular_pipeline.py b/src/diffusers/modular_pipelines/wan/modular_pipeline.py index 4d86e0d08e59..da4aada43839 100644 --- a/src/diffusers/modular_pipelines/wan/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/wan/modular_pipeline.py @@ -37,6 +37,8 @@ class WanModularPipeline( """ + default_blocks_name = "WanAutoBlocks" + @property def default_height(self): return self.default_sample_height * self.vae_scale_factor_spatial From 20fd00b14b6ad0ec3fa1b67e26323e135bf9fe8a Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 30 Sep 2025 09:58:34 +0200 Subject: [PATCH 18/69] [Tests] Add single file tester mixin for Models and remove unittest dependency (#12352) * update * update * update * update * update --- .../single_file/single_file_testing_utils.py | 91 +++++++++++++++++++ tests/single_file/test_lumina2_transformer.py | 41 +-------- .../test_model_autoencoder_dc_single_file.py | 32 +------ .../test_model_controlnet_single_file.py | 33 +------ ...test_model_flux_transformer_single_file.py | 38 +------- .../test_model_motion_adapter_single_file.py | 3 +- .../test_model_sd_cascade_unet_single_file.py | 9 +- .../single_file/test_model_vae_single_file.py | 32 +------ .../test_model_wan_autoencoder_single_file.py | 32 +------ ...est_model_wan_transformer3d_single_file.py | 58 +----------- tests/single_file/test_sana_transformer.py | 42 +-------- ...iffusion_controlnet_img2img_single_file.py | 9 +- ...iffusion_controlnet_inpaint_single_file.py | 14 ++- ...stable_diffusion_controlnet_single_file.py | 9 +- ...st_stable_diffusion_img2img_single_file.py | 17 ++-- ...st_stable_diffusion_inpaint_single_file.py | 22 ++--- .../test_stable_diffusion_single_file.py | 25 ++--- ...st_stable_diffusion_upscale_single_file.py | 9 +- ...stable_diffusion_xl_adapter_single_file.py | 9 +- ...ble_diffusion_xl_controlnet_single_file.py | 9 +- ...stable_diffusion_xl_img2img_single_file.py | 11 +-- 
...st_stable_diffusion_xl_instruct_pix2pix.py | 9 +- .../test_stable_diffusion_xl_single_file.py | 9 +- 23 files changed, 173 insertions(+), 390 deletions(-) diff --git a/tests/single_file/single_file_testing_utils.py b/tests/single_file/single_file_testing_utils.py index 3510d3371ca5..52fd2f5bfc7f 100644 --- a/tests/single_file/single_file_testing_utils.py +++ b/tests/single_file/single_file_testing_utils.py @@ -1,3 +1,4 @@ +import gc import tempfile from io import BytesIO @@ -9,7 +10,10 @@ from diffusers.models.attention_processor import AttnProcessor from ..testing_utils import ( + backend_empty_cache, + nightly, numpy_cosine_similarity_distance, + require_torch_accelerator, torch_device, ) @@ -47,6 +51,93 @@ def download_diffusers_config(repo_id, tmpdir): return path +@nightly +@require_torch_accelerator +class SingleFileModelTesterMixin: + def setup_method(self): + gc.collect() + backend_empty_cache(torch_device) + + def teardown_method(self): + gc.collect() + backend_empty_cache(torch_device) + + def test_single_file_model_config(self): + pretrained_kwargs = {} + single_file_kwargs = {} + + if hasattr(self, "subfolder") and self.subfolder: + pretrained_kwargs["subfolder"] = self.subfolder + + if hasattr(self, "torch_dtype") and self.torch_dtype: + pretrained_kwargs["torch_dtype"] = self.torch_dtype + single_file_kwargs["torch_dtype"] = self.torch_dtype + + model = self.model_class.from_pretrained(self.repo_id, **pretrained_kwargs) + model_single_file = self.model_class.from_single_file(self.ckpt_path, **single_file_kwargs) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert model.config[param_name] == param_value, ( + f"{param_name} differs between pretrained loading and single file loading" + ) + + def test_single_file_model_parameters(self): + pretrained_kwargs = {} + single_file_kwargs = {} + + if hasattr(self, "subfolder") and self.subfolder: + pretrained_kwargs["subfolder"] = self.subfolder + + if hasattr(self, "torch_dtype") and self.torch_dtype: + pretrained_kwargs["torch_dtype"] = self.torch_dtype + single_file_kwargs["torch_dtype"] = self.torch_dtype + + model = self.model_class.from_pretrained(self.repo_id, **pretrained_kwargs) + model_single_file = self.model_class.from_single_file(self.ckpt_path, **single_file_kwargs) + + state_dict = model.state_dict() + state_dict_single_file = model_single_file.state_dict() + + assert set(state_dict.keys()) == set(state_dict_single_file.keys()), ( + "Model parameters keys differ between pretrained and single file loading" + ) + + for key in state_dict.keys(): + param = state_dict[key] + param_single_file = state_dict_single_file[key] + + assert param.shape == param_single_file.shape, ( + f"Parameter shape mismatch for {key}: " + f"pretrained {param.shape} vs single file {param_single_file.shape}" + ) + + assert torch.allclose(param, param_single_file, rtol=1e-5, atol=1e-5), ( + f"Parameter values differ for {key}: " + f"max difference {torch.max(torch.abs(param - param_single_file)).item()}" + ) + + def test_checkpoint_altered_keys_loading(self): + # Test loading with checkpoints that have altered keys + if not hasattr(self, "alternate_keys_ckpt_paths") or not self.alternate_keys_ckpt_paths: + return + + for ckpt_path in self.alternate_keys_ckpt_paths: + backend_empty_cache(torch_device) + + single_file_kwargs = {} + if hasattr(self, "torch_dtype") and 
self.torch_dtype: + single_file_kwargs["torch_dtype"] = self.torch_dtype + + model = self.model_class.from_single_file(ckpt_path, **single_file_kwargs) + + del model + gc.collect() + backend_empty_cache(torch_device) + + class SDSingleFileTesterMixin: single_file_kwargs = {} diff --git a/tests/single_file/test_lumina2_transformer.py b/tests/single_file/test_lumina2_transformer.py index 99d9b71395c6..bb5a0bf473b6 100644 --- a/tests/single_file/test_lumina2_transformer.py +++ b/tests/single_file/test_lumina2_transformer.py @@ -13,26 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc -import unittest from diffusers import ( Lumina2Transformer2DModel, ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, - require_torch_accelerator, - torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@require_torch_accelerator -class Lumina2Transformer2DModelSingleFileTests(unittest.TestCase): +class TestLumina2Transformer2DModelSingleFile(SingleFileModelTesterMixin): model_class = Lumina2Transformer2DModel ckpt_path = "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/diffusion_models/lumina_2_model_bf16.safetensors" alternate_keys_ckpt_paths = [ @@ -40,34 +35,4 @@ class Lumina2Transformer2DModelSingleFileTests(unittest.TestCase): ] repo_id = "Alpha-VLLM/Lumina-Image-2.0" - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer") - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) - - def test_checkpoint_loading(self): - for ckpt_path in self.alternate_keys_ckpt_paths: - backend_empty_cache(torch_device) - model = self.model_class.from_single_file(ckpt_path) - - del model - gc.collect() - backend_empty_cache(torch_device) + subfolder = "transformer" diff --git a/tests/single_file/test_model_autoencoder_dc_single_file.py b/tests/single_file/test_model_autoencoder_dc_single_file.py index 5195f8e52f8d..444ca4046977 100644 --- a/tests/single_file/test_model_autoencoder_dc_single_file.py +++ b/tests/single_file/test_model_autoencoder_dc_single_file.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc -import unittest import torch @@ -23,38 +21,24 @@ ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, load_hf_numpy, numpy_cosine_similarity_distance, - require_torch_accelerator, - slow, torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@slow -@require_torch_accelerator -class AutoencoderDCSingleFileTests(unittest.TestCase): +class TestAutoencoderDCSingleFile(SingleFileModelTesterMixin): model_class = AutoencoderDC ckpt_path = "https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0/blob/main/model.safetensors" repo_id = "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers" main_input_name = "sample" base_precision = 1e-2 - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - def get_file_format(self, seed, shape): return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" @@ -80,18 +64,6 @@ def test_single_file_inference_same_as_pretrained(self): assert numpy_cosine_similarity_distance(output_slice_1, output_slice_2) < 1e-4 - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id) - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between pretrained loading and single file loading" - ) - def test_single_file_in_type_variant_components(self): # `in` variant checkpoints require passing in a `config` parameter # in order to set the scaling factor correctly. diff --git a/tests/single_file/test_model_controlnet_single_file.py b/tests/single_file/test_model_controlnet_single_file.py index e5214fe3f209..2fa81fe3ae55 100644 --- a/tests/single_file/test_model_controlnet_single_file.py +++ b/tests/single_file/test_model_controlnet_single_file.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc -import unittest import torch @@ -23,46 +21,19 @@ ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, - require_torch_accelerator, - slow, - torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@slow -@require_torch_accelerator -class ControlNetModelSingleFileTests(unittest.TestCase): +class TestControlNetModelSingleFile(SingleFileModelTesterMixin): model_class = ControlNetModel ckpt_path = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" repo_id = "lllyasviel/control_v11p_sd15_canny" - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id) - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) - def test_single_file_arguments(self): model_default = self.model_class.from_single_file(self.ckpt_path) diff --git a/tests/single_file/test_model_flux_transformer_single_file.py b/tests/single_file/test_model_flux_transformer_single_file.py index 8290c339b931..0642a71c5756 100644 --- a/tests/single_file/test_model_flux_transformer_single_file.py +++ b/tests/single_file/test_model_flux_transformer_single_file.py @@ -14,7 +14,6 @@ # limitations under the License. 
import gc -import unittest from diffusers import ( FluxTransformer2DModel, @@ -23,52 +22,21 @@ from ..testing_utils import ( backend_empty_cache, enable_full_determinism, - require_torch_accelerator, torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@require_torch_accelerator -class FluxTransformer2DModelSingleFileTests(unittest.TestCase): +class TestFluxTransformer2DModelSingleFile(SingleFileModelTesterMixin): model_class = FluxTransformer2DModel ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors" alternate_keys_ckpt_paths = ["https://huggingface.co/Comfy-Org/flux1-dev/blob/main/flux1-dev-fp8.safetensors"] repo_id = "black-forest-labs/FLUX.1-dev" - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer") - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) - - def test_checkpoint_loading(self): - for ckpt_path in self.alternate_keys_ckpt_paths: - backend_empty_cache(torch_device) - model = self.model_class.from_single_file(ckpt_path) - - del model - gc.collect() - backend_empty_cache(torch_device) + subfolder = "transformer" def test_device_map_cuda(self): backend_empty_cache(torch_device) diff --git a/tests/single_file/test_model_motion_adapter_single_file.py b/tests/single_file/test_model_motion_adapter_single_file.py index 7aaf4b577e4b..a047c81b47aa 100644 --- a/tests/single_file/test_model_motion_adapter_single_file.py +++ b/tests/single_file/test_model_motion_adapter_single_file.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest from diffusers import ( MotionAdapter, @@ -27,7 +26,7 @@ enable_full_determinism() -class MotionAdapterSingleFileTests(unittest.TestCase): +class MotionAdapterSingleFileTests: model_class = MotionAdapter def test_single_file_components_version_v1_5(self): diff --git a/tests/single_file/test_model_sd_cascade_unet_single_file.py b/tests/single_file/test_model_sd_cascade_unet_single_file.py index a5ec9dba30df..7472122710eb 100644 --- a/tests/single_file/test_model_sd_cascade_unet_single_file.py +++ b/tests/single_file/test_model_sd_cascade_unet_single_file.py @@ -14,7 +14,6 @@ # limitations under the License. 
import gc -import unittest import torch @@ -37,14 +36,12 @@ @slow @require_torch_accelerator -class StableCascadeUNetSingleFileTest(unittest.TestCase): - def setUp(self): - super().setUp() +class StableCascadeUNetSingleFileTest: + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_model_vae_single_file.py b/tests/single_file/test_model_vae_single_file.py index 3b9e619f13e6..9198d9b16337 100644 --- a/tests/single_file/test_model_vae_single_file.py +++ b/tests/single_file/test_model_vae_single_file.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc -import unittest import torch @@ -23,22 +21,18 @@ ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, load_hf_numpy, numpy_cosine_similarity_distance, - require_torch_accelerator, - slow, torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@slow -@require_torch_accelerator -class AutoencoderKLSingleFileTests(unittest.TestCase): +class TestAutoencoderKLSingleFile(SingleFileModelTesterMixin): model_class = AutoencoderKL ckpt_path = ( "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" @@ -47,16 +41,6 @@ class AutoencoderKLSingleFileTests(unittest.TestCase): main_input_name = "sample" base_precision = 1e-2 - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - def get_file_format(self, seed, shape): return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" @@ -84,18 +68,6 @@ def test_single_file_inference_same_as_pretrained(self): assert numpy_cosine_similarity_distance(output_slice_1, output_slice_2) < 1e-4 - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id) - model_single_file = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between pretrained loading and single file loading" - ) - def test_single_file_arguments(self): model_default = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id) diff --git a/tests/single_file/test_model_wan_autoencoder_single_file.py b/tests/single_file/test_model_wan_autoencoder_single_file.py index a1f7155c1072..0babf302348f 100644 --- a/tests/single_file/test_model_wan_autoencoder_single_file.py +++ b/tests/single_file/test_model_wan_autoencoder_single_file.py @@ -13,50 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc -import unittest from diffusers import ( AutoencoderKLWan, ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, - require_torch_accelerator, - torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@require_torch_accelerator -class AutoencoderKLWanSingleFileTests(unittest.TestCase): +class TestAutoencoderKLWanSingleFile(SingleFileModelTesterMixin): model_class = AutoencoderKLWan ckpt_path = ( "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors" ) repo_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id, subfolder="vae") - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) + subfolder = "vae" diff --git a/tests/single_file/test_model_wan_transformer3d_single_file.py b/tests/single_file/test_model_wan_transformer3d_single_file.py index d7c758d3d933..b76909206073 100644 --- a/tests/single_file/test_model_wan_transformer3d_single_file.py +++ b/tests/single_file/test_model_wan_transformer3d_single_file.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc -import unittest import torch @@ -23,72 +21,26 @@ ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, require_big_accelerator, - require_torch_accelerator, - torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@require_torch_accelerator -class WanTransformer3DModelText2VideoSingleFileTest(unittest.TestCase): +class TestWanTransformer3DModelText2VideoSingleFile(SingleFileModelTesterMixin): model_class = WanTransformer3DModel ckpt_path = "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors" repo_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer") - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) + subfolder = "transformer" @require_big_accelerator -@require_torch_accelerator -class WanTransformer3DModelImage2VideoSingleFileTest(unittest.TestCase): +class TestWanTransformer3DModelImage2VideoSingleFile(SingleFileModelTesterMixin): model_class = WanTransformer3DModel ckpt_path = "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_i2v_480p_14B_fp8_e4m3fn.safetensors" repo_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" torch_dtype = torch.float8_e4m3fn - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer", torch_dtype=self.torch_dtype) - model_single_file = self.model_class.from_single_file(self.ckpt_path, torch_dtype=self.torch_dtype) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) + subfolder = "transformer" diff --git a/tests/single_file/test_sana_transformer.py b/tests/single_file/test_sana_transformer.py index c1543ba17137..9e2adb93bf2b 100644 --- a/tests/single_file/test_sana_transformer.py +++ b/tests/single_file/test_sana_transformer.py @@ -1,23 +1,17 @@ -import gc -import unittest - from diffusers import ( SanaTransformer2DModel, ) from ..testing_utils import ( - backend_empty_cache, enable_full_determinism, - require_torch_accelerator, - torch_device, ) +from .single_file_testing_utils import SingleFileModelTesterMixin enable_full_determinism() -@require_torch_accelerator -class SanaTransformer2DModelSingleFileTests(unittest.TestCase): +class TestSanaTransformer2DModelSingleFile(SingleFileModelTesterMixin): model_class = SanaTransformer2DModel ckpt_path = ( 
"https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px/blob/main/checkpoints/Sana_1600M_1024px.pth" @@ -27,34 +21,4 @@ class SanaTransformer2DModelSingleFileTests(unittest.TestCase): ] repo_id = "Efficient-Large-Model/Sana_1600M_1024px_diffusers" - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_single_file_components(self): - model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer") - model_single_file = self.model_class.from_single_file(self.ckpt_path) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] - for param_name, param_value in model_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert model.config[param_name] == param_value, ( - f"{param_name} differs between single file loading and pretrained loading" - ) - - def test_checkpoint_loading(self): - for ckpt_path in self.alternate_keys_ckpt_paths: - backend_empty_cache(torch_device) - model = self.model_class.from_single_file(ckpt_path) - - del model - gc.collect() - backend_empty_cache(torch_device) + subfolder = "transformer" diff --git a/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py index e558eeaf6f47..141748b084a0 100644 --- a/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py +++ b/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py @@ -1,6 +1,5 @@ import gc import tempfile -import unittest import torch @@ -29,7 +28,7 @@ @slow @require_torch_accelerator -class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionControlNetPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionControlNetPipeline ckpt_path = ( "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" @@ -39,13 +38,11 @@ class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SD ) repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py index 54224f51a9b5..8238866cbfb3 100644 --- a/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py +++ b/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py @@ -1,7 +1,7 @@ import gc import tempfile -import unittest +import pytest import torch from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline @@ -29,19 +29,17 @@ @slow @require_torch_accelerator -class StableDiffusionControlNetInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionControlNetInpaintPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionControlNetInpaintPipeline ckpt_path = "https://huggingface.co/botp/stable-diffusion-v1-5-inpainting/blob/main/sd-v1-5-inpainting.ckpt" original_config = 
"https://raw.githubusercontent.com/runwayml/stable-diffusion/main/configs/stable-diffusion/v1-inpainting-inference.yaml" repo_id = "stable-diffusion-v1-5/stable-diffusion-inpainting" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) @@ -115,7 +113,7 @@ def test_single_file_components_local_files_only(self): super()._compare_component_configs(pipe, pipe_single_file) - @unittest.skip("runwayml original config repo does not exist") + @pytest.mark.skip(reason="runwayml original config repo does not exist") def test_single_file_components_with_original_config(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) @@ -125,7 +123,7 @@ def test_single_file_components_with_original_config(self): super()._compare_component_configs(pipe, pipe_single_file) - @unittest.skip("runwayml original config repo does not exist") + @pytest.mark.skip(reason="runwayml original config repo does not exist") def test_single_file_components_with_original_config_local_files_only(self): controlnet = ControlNetModel.from_pretrained( "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16" diff --git a/tests/single_file/test_stable_diffusion_controlnet_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_single_file.py index e90e648a9de9..80ef6c2574c2 100644 --- a/tests/single_file/test_stable_diffusion_controlnet_single_file.py +++ b/tests/single_file/test_stable_diffusion_controlnet_single_file.py @@ -1,6 +1,5 @@ import gc import tempfile -import unittest import torch @@ -29,7 +28,7 @@ @slow @require_torch_accelerator -class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionControlNetPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionControlNetPipeline ckpt_path = ( "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" @@ -39,13 +38,11 @@ class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SD ) repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_img2img_single_file.py b/tests/single_file/test_stable_diffusion_img2img_single_file.py index 387f09471dd7..e76846c800a8 100644 --- a/tests/single_file/test_stable_diffusion_img2img_single_file.py +++ b/tests/single_file/test_stable_diffusion_img2img_single_file.py @@ -1,5 +1,4 @@ import gc -import unittest import torch @@ -23,7 +22,7 @@ @slow @require_torch_accelerator -class StableDiffusionImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionImg2ImgPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionImg2ImgPipeline ckpt_path = ( "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" @@ -33,13 +32,11 @@ class StableDiffusionImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSin ) repo_id = 
"stable-diffusion-v1-5/stable-diffusion-v1-5" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) @@ -66,19 +63,17 @@ def test_single_file_format_inference_is_same_as_pretrained(self): @slow @require_torch_accelerator -class StableDiffusion21Img2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusion21Img2ImgPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionImg2ImgPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors" original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" repo_id = "stabilityai/stable-diffusion-2-1" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_inpaint_single_file.py b/tests/single_file/test_stable_diffusion_inpaint_single_file.py index 84636ec0f0fa..6e5d27cdffef 100644 --- a/tests/single_file/test_stable_diffusion_inpaint_single_file.py +++ b/tests/single_file/test_stable_diffusion_inpaint_single_file.py @@ -1,6 +1,6 @@ import gc -import unittest +import pytest import torch from diffusers import ( @@ -23,19 +23,17 @@ @slow @require_torch_accelerator -class StableDiffusionInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionInpaintPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionInpaintPipeline ckpt_path = "https://huggingface.co/botp/stable-diffusion-v1-5-inpainting/blob/main/sd-v1-5-inpainting.ckpt" original_config = "https://raw.githubusercontent.com/runwayml/stable-diffusion/main/configs/stable-diffusion/v1-inpainting-inference.yaml" repo_id = "botp/stable-diffusion-v1-5-inpainting" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) @@ -70,18 +68,18 @@ def test_single_file_loading_4_channel_unet(self): assert pipe.unet.config.in_channels == 4 - @unittest.skip("runwayml original config has been removed") + @pytest.mark.skip(reason="runwayml original config has been removed") def test_single_file_components_with_original_config(self): return - @unittest.skip("runwayml original config has been removed") + @pytest.mark.skip(reason="runwayml original config has been removed") def test_single_file_components_with_original_config_local_files_only(self): return @slow @require_torch_accelerator -class StableDiffusion21InpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusion21InpaintPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionInpaintPipeline ckpt_path = ( "https://huggingface.co/stabilityai/stable-diffusion-2-inpainting/blob/main/512-inpainting-ema.safetensors" @@ -89,13 +87,11 @@ class StableDiffusion21InpaintPipelineSingleFileSlowTests(unittest.TestCase, SDS original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inpainting-inference.yaml" repo_id = 
"stabilityai/stable-diffusion-2-inpainting" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_single_file.py b/tests/single_file/test_stable_diffusion_single_file.py index 4601b75c3ab6..377dedbc5731 100644 --- a/tests/single_file/test_stable_diffusion_single_file.py +++ b/tests/single_file/test_stable_diffusion_single_file.py @@ -1,6 +1,5 @@ import gc import tempfile -import unittest import torch @@ -28,7 +27,7 @@ @slow @require_torch_accelerator -class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionPipeline ckpt_path = ( "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" @@ -38,13 +37,11 @@ class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFile ) repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) @@ -90,19 +87,17 @@ def test_single_file_legacy_scaling_factor(self): @slow -class StableDiffusion21PipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusion21PipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors" original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" repo_id = "stabilityai/stable-diffusion-2-1" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) @@ -125,7 +120,7 @@ def test_single_file_format_inference_is_same_as_pretrained(self): @nightly @slow @require_torch_accelerator -class StableDiffusionInstructPix2PixPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionInstructPix2PixPipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionInstructPix2PixPipeline ckpt_path = "https://huggingface.co/timbrooks/instruct-pix2pix/blob/main/instruct-pix2pix-00-22000.safetensors" original_config = ( @@ -134,13 +129,11 @@ class StableDiffusionInstructPix2PixPipelineSingleFileSlowTests(unittest.TestCas repo_id = "timbrooks/instruct-pix2pix" single_file_kwargs = {"extract_ema": True} - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_upscale_single_file.py b/tests/single_file/test_stable_diffusion_upscale_single_file.py index 39ec7b0194a6..ba4819fadf85 100644 --- a/tests/single_file/test_stable_diffusion_upscale_single_file.py +++ b/tests/single_file/test_stable_diffusion_upscale_single_file.py @@ -1,5 +1,4 @@ import gc -import unittest import pytest import torch @@ -25,19 +24,17 
@@ @slow @require_torch_accelerator -class StableDiffusionUpscalePipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): +class TestStableDiffusionUpscalePipelineSingleFileSlow(SDSingleFileTesterMixin): pipeline_class = StableDiffusionUpscalePipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/x4-upscaler-ema.safetensors" original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml" repo_id = "stabilityai/stable-diffusion-x4-upscaler" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py b/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py index 3de9ee736417..3d124fa8c23c 100644 --- a/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py @@ -1,6 +1,5 @@ import gc import tempfile -import unittest import torch @@ -32,7 +31,7 @@ @slow @require_torch_accelerator -class StableDiffusionXLAdapterPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): +class TestStableDiffusionXLAdapterPipelineSingleFileSlow(SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLAdapterPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" repo_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -40,13 +39,11 @@ class StableDiffusionXLAdapterPipelineSingleFileSlowTests(unittest.TestCase, SDX "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" ) - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py b/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py index a0a1aba1030f..6f503702610a 100644 --- a/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py @@ -1,6 +1,5 @@ import gc import tempfile -import unittest import torch @@ -28,7 +27,7 @@ @slow @require_torch_accelerator -class StableDiffusionXLControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): +class TestStableDiffusionXLControlNetPipelineSingleFileSlow(SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLControlNetPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" repo_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -36,13 +35,11 @@ class StableDiffusionXLControlNetPipelineSingleFileSlowTests(unittest.TestCase, "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" ) - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py 
b/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py index 810f412f8def..56657f37d912 100644 --- a/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py @@ -1,5 +1,4 @@ import gc -import unittest import torch @@ -25,7 +24,7 @@ @slow @require_torch_accelerator -class StableDiffusionXLImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): +class TestStableDiffusionXLImg2ImgPipelineSingleFileSlow(SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLImg2ImgPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" repo_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -33,13 +32,11 @@ class StableDiffusionXLImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDX "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" ) - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) @@ -66,7 +63,7 @@ def test_single_file_format_inference_is_same_as_pretrained(self): @slow @require_torch_accelerator -class StableDiffusionXLImg2ImgRefinerPipelineSingleFileSlowTests(unittest.TestCase): +class StableDiffusionXLImg2ImgRefinerPipelineSingleFileSlowTests: pipeline_class = StableDiffusionXLImg2ImgPipeline ckpt_path = ( "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors" diff --git a/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py b/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py index 011d59222a5b..d755b7010516 100644 --- a/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py +++ b/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py @@ -1,5 +1,4 @@ import gc -import unittest import torch @@ -19,19 +18,17 @@ @slow @require_torch_accelerator -class StableDiffusionXLInstructPix2PixPipeline(unittest.TestCase): +class StableDiffusionXLInstructPix2PixPipeline: pipeline_class = StableDiffusionXLInstructPix2PixPipeline ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors" original_config = None repo_id = "diffusers/sdxl-instructpix2pix-768" - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) diff --git a/tests/single_file/test_stable_diffusion_xl_single_file.py b/tests/single_file/test_stable_diffusion_xl_single_file.py index 0ad180de17db..4e5319ca25c7 100644 --- a/tests/single_file/test_stable_diffusion_xl_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_single_file.py @@ -1,5 +1,4 @@ import gc -import unittest import torch @@ -22,7 +21,7 @@ @slow @require_torch_accelerator -class StableDiffusionXLPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): +class TestStableDiffusionXLPipelineSingleFileSlow(SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" repo_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -30,13 +29,11 @@ class StableDiffusionXLPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingle 
"https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" ) - def setUp(self): - super().setUp() + def setup_method(self): gc.collect() backend_empty_cache(torch_device) - def tearDown(self): - super().tearDown() + def teardown_method(self): gc.collect() backend_empty_cache(torch_device) From 0e12ba74542c6ecb02719ec3e5c6e993b85556e3 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Tue, 30 Sep 2025 01:37:48 -0700 Subject: [PATCH 19/69] fix 3 xpu failures uts w/ latest pytorch (#12408) fix xpu ut failures w/ latest pytorch Signed-off-by: Yao, Matrix --- .../test_controlnet_hunyuandit.py | 2 +- tests/pipelines/flux/test_pipeline_flux.py | 11 ++-- tests/quantization/gguf/test_gguf.py | 50 +++++++++---------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py index 961984377901..bf31f2abcffb 100644 --- a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py +++ b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py @@ -155,7 +155,7 @@ def test_controlnet_hunyuandit(self): if torch_device == "xpu": expected_slice = np.array( - [0.6376953, 0.84375, 0.58691406, 0.48046875, 0.43652344, 0.5517578, 0.54248047, 0.5644531, 0.48217773] + [0.6948242, 0.89160156, 0.59375, 0.5078125, 0.57910156, 0.6035156, 0.58447266, 0.53564453, 0.52246094] ) else: expected_slice = np.array( diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index c3e8517d6407..1ddbd4ba3df8 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -15,6 +15,7 @@ ) from ...testing_utils import ( + Expectations, backend_empty_cache, nightly, numpy_cosine_similarity_distance, @@ -276,10 +277,14 @@ def test_flux_inference(self): image = pipe(**inputs).images[0] image_slice = image[0, :10, :10] # fmt: off - expected_slice = np.array( - [0.3242, 0.3203, 0.3164, 0.3164, 0.3125, 0.3125, 0.3281, 0.3242, 0.3203, 0.3301, 0.3262, 0.3242, 0.3281, 0.3242, 0.3203, 0.3262, 0.3262, 0.3164, 0.3262, 0.3281, 0.3184, 0.3281, 0.3281, 0.3203, 0.3281, 0.3281, 0.3164, 0.3320, 0.3320, 0.3203], - dtype=np.float32, + + expected_slices = Expectations( + { + ("cuda", None): np.array([0.3242, 0.3203, 0.3164, 0.3164, 0.3125, 0.3125, 0.3281, 0.3242, 0.3203, 0.3301, 0.3262, 0.3242, 0.3281, 0.3242, 0.3203, 0.3262, 0.3262, 0.3164, 0.3262, 0.3281, 0.3184, 0.3281, 0.3281, 0.3203, 0.3281, 0.3281, 0.3164, 0.3320, 0.3320, 0.3203], dtype=np.float32,), + ("xpu", 3): np.array([0.3301, 0.3281, 0.3359, 0.3203, 0.3203, 0.3281, 0.3281, 0.3301, 0.3340, 0.3281, 0.3320, 0.3359, 0.3281, 0.3301, 0.3320, 0.3242, 0.3301, 0.3281, 0.3242, 0.3320, 0.3320, 0.3281, 0.3320, 0.3320, 0.3262, 0.3320, 0.3301, 0.3301, 0.3359, 0.3320], dtype=np.float32,), + } ) + expected_slice = expected_slices.get_expectation() # fmt: on max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten()) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 38322459e761..0f4fd408a7c1 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -360,33 +360,33 @@ def test_pipeline_inference(self): { ("xpu", 3): np.array( [ - 0.1953125, - 0.3125, - 0.31445312, - 0.13085938, - 0.30664062, - 0.29296875, - 0.11523438, - 0.2890625, + 0.16796875, + 0.27929688, 0.28320312, - 0.16601562, - 0.3046875, - 
0.328125, - 0.140625, - 0.31640625, + 0.11328125, + 0.27539062, + 0.26171875, + 0.10742188, + 0.26367188, + 0.26171875, + 0.1484375, + 0.2734375, + 0.296875, + 0.13476562, + 0.2890625, + 0.30078125, + 0.1171875, + 0.28125, + 0.28125, + 0.16015625, + 0.31445312, + 0.30078125, + 0.15625, 0.32421875, - 0.12304688, - 0.3046875, - 0.3046875, - 0.17578125, - 0.3359375, - 0.3203125, - 0.16601562, - 0.34375, - 0.31640625, - 0.15429688, - 0.328125, - 0.31054688, + 0.296875, + 0.14453125, + 0.30859375, + 0.2890625, ] ), ("cuda", 7): np.array( From b59654544bbaf5c6040e670397351abe0e543a75 Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 30 Sep 2025 13:32:33 +0200 Subject: [PATCH 20/69] Install latest prerelease from huggingface_hub when installing transformers from main (#12395) * Allow prerelease when installing transformers from main * maybe better * maybe better * and now? * just bored * should be better * works now --- .github/workflows/pr_modular_tests.yml | 5 ++--- .github/workflows/pr_tests.yml | 12 +++++------- .github/workflows/pr_tests_gpu.yml | 13 +++++-------- tests/models/test_modeling_common.py | 8 +++++--- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml index c6e87e642dc5..75258771e4dc 100644 --- a/.github/workflows/pr_modular_tests.yml +++ b/.github/workflows/pr_modular_tests.yml @@ -110,9 +110,8 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - # Stopping this update temporarily until the Hub RC is fully shipped and integrated. - # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git + pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index ebfe9f442f30..1543b264b0cc 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -116,9 +116,8 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - # Stopping this update temporarily until the Hub RC is fully shipped and integrated. 
- # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git + pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | @@ -254,10 +253,9 @@ jobs: python -m uv pip install -e [quality,test] # TODO (sayakpaul, DN6): revisit `--no-deps` python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps - # Stopping this update temporarily until the Hub RC is fully shipped and integrated. - # python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - # python -m uv pip install -U tokenizers - # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + python -m uv pip install -U tokenizers + pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git - name: Environment run: | diff --git a/.github/workflows/pr_tests_gpu.yml b/.github/workflows/pr_tests_gpu.yml index 1a8d5f6b815e..89b6abe20d1e 100644 --- a/.github/workflows/pr_tests_gpu.yml +++ b/.github/workflows/pr_tests_gpu.yml @@ -132,9 +132,8 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - # Stopping this update temporarily until the Hub RC is fully shipped and integrated. - # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git - # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git + pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git - name: Environment run: | @@ -204,9 +203,8 @@ jobs: python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install peft@git+https://github.com/huggingface/peft.git - # Stopping this update temporarily until the Hub RC is fully shipped and integrated. 
- # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git - # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git + pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git - name: Environment run: | @@ -268,8 +266,7 @@ jobs: - name: Install dependencies run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - # Stopping this update temporarily until the Hub RC is fully shipped and integrated. - # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + pip uninstall transformers -y && pip uninstall huggingface_hub -y && python -m uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git python -m uv pip install -e [quality,test,training] - name: Environment diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 90ded6a7ecb2..9b1c6b50dc8f 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -243,8 +243,8 @@ def load_model(path): else: _ = load_model(repo_id) - warning_message = str(warning.warnings[0].message) - self.assertIn("This serialization format is now deprecated to standardize the serialization", warning_message) + warning_messages = " ".join(str(w.message) for w in warning.warnings) + self.assertIn("This serialization format is now deprecated to standardize the serialization", warning_messages) # Local tests are already covered down below. 
@parameterized.expand( @@ -298,11 +298,13 @@ def test_local_files_only_with_sharded_checkpoint(self): raise_for_status=mock.Mock(side_effect=HfHubHTTPError("Server down", response=mock.Mock())), json=mock.Mock(return_value={}), ) + client_mock = mock.Mock() + client_mock.get.return_value = error_response with tempfile.TemporaryDirectory() as tmpdir: model = FluxTransformer2DModel.from_pretrained(repo_id, subfolder="transformer", cache_dir=tmpdir) - with mock.patch("requests.Session.get", return_value=error_response): + with mock.patch("huggingface_hub.hf_api.get_session", return_value=client_mock): # Should fail with local_files_only=False (network required) # We would make a network call with model_info with self.assertRaises(OSError): From d7a1a0363feb7694568695394a7edfdd44a18bca Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:33:41 -0700 Subject: [PATCH 21/69] [docs] CP (#12331) * init * feedback * feedback * feedback * feedback * feedback * feedback --- docs/source/en/_toctree.yml | 4 +- docs/source/en/api/parallel.md | 2 +- .../en/training/distributed_inference.md | 64 +++++++++++++++++-- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ada5e3889581..fb4fdf2098e6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -70,8 +70,6 @@ title: Reduce memory usage - local: optimization/speed-memory-optims title: Compiling and offloading quantized models - - local: api/parallel - title: Parallel inference - title: Community optimizations sections: - local: optimization/pruna @@ -282,6 +280,8 @@ title: Outputs - local: api/quantization title: Quantization + - local: api/parallel + title: Parallel inference - title: Modular sections: - local: api/modular_diffusers/pipeline diff --git a/docs/source/en/api/parallel.md b/docs/source/en/api/parallel.md index e38ffe571eac..f2a6bee3910e 100644 --- a/docs/source/en/api/parallel.md +++ b/docs/source/en/api/parallel.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # Parallelism -Parallelism strategies help speed up diffusion transformers by distributing computations across multiple devices, allowing for faster inference/training times. +Parallelism strategies help speed up diffusion transformers by distributing computations across multiple devices, allowing for faster inference/training times. Refer to the [Distributed inferece](../training/distributed_inference) guide to learn more. ## ParallelConfig diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 58ec77f75bf3..586f765709b7 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -226,8 +226,64 @@ with torch.no_grad(): image[0].save("split_transformer.png") ``` -## Resources +By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs. -- Take a look at this [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for a minimal example of distributed inference with Accelerate. -- For more details, check out Accelerate's [Distributed inference](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide. 
-- The `device_map` argument assign models or an entire pipeline to devices. Refer to the [device placement](../using-diffusers/loading#device-placement) docs for more information.
\ No newline at end of file
+
+## Context parallelism
+
+[Context parallelism](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=context_parallelism) splits input sequences across multiple GPUs to reduce memory usage. Each GPU processes its own slice of the sequence.
+
+Use [`~ModelMixin.set_attention_backend`] to switch to a more optimized attention backend. Refer to this [table](../optimization/attention_backends#available-backends) for a complete list of available backends.
+
+### Ring Attention
+
+Key (K) and value (V) representations are exchanged between devices using [Ring Attention](https://huggingface.co/papers/2310.01889). This ensures each split sees every other token's K/V. Each GPU computes attention for its local K/V and passes it to the next GPU in the ring. No single GPU holds the full sequence, which reduces communication latency.
+
+Pass a [`ContextParallelConfig`] to the `parallel_config` argument of the transformer model. The config supports the `ring_degree` argument that determines how many devices to use for Ring Attention.
+
+```py
+import torch
+from diffusers import AutoModel, QwenImagePipeline, ContextParallelConfig
+
+try:
+    torch.distributed.init_process_group("nccl")
+    rank = torch.distributed.get_rank()
+    device = torch.device("cuda", rank % torch.cuda.device_count())
+    torch.cuda.set_device(device)
+
+    transformer = AutoModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, parallel_config=ContextParallelConfig(ring_degree=2))
+    pipeline = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", transformer=transformer, torch_dtype=torch.bfloat16, device_map="cuda")
+    pipeline.transformer.set_attention_backend("flash")
+
+    prompt = """
+    cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
+    highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
+    """
+
+    # Must specify generator so all ranks start with same latents (or pass your own)
+    generator = torch.Generator().manual_seed(42)
+    image = pipeline(prompt, num_inference_steps=50, generator=generator).images[0]
+
+    if rank == 0:
+        image.save("output.png")
+
+except Exception as e:
+    print(f"An error occurred: {e}")
+    torch.distributed.breakpoint()
+    raise
+
+finally:
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+```
+
+### Ulysses Attention
+
+[Ulysses Attention](https://huggingface.co/papers/2309.14509) splits a sequence across GPUs and performs an *all-to-all* communication (every device sends/receives data to every other device). Each GPU ends up with all tokens for only a subset of attention heads. Each GPU computes attention locally on all tokens for its heads, then performs another all-to-all to regroup results by tokens for the next layer.
+
+[`ContextParallelConfig`] supports Ulysses Attention through the `ulysses_degree` argument. This determines how many devices to use for Ulysses Attention.
+
+Pass the [`ContextParallelConfig`] to [`~ModelMixin.enable_parallelism`].
+ +```py +pipeline.transformer.enable_parallelism(config=ContextParallelConfig(ulysses_degree=2)) +``` \ No newline at end of file From cc5b31ffc985efff096735ec290876617e0cabbb Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 30 Sep 2025 10:11:19 -0700 Subject: [PATCH 22/69] [docs] Migrate syntax (#12390) * change syntax * make style --- docs/source/en/api/configuration.md | 7 +- docs/source/en/api/loaders/ip_adapter.md | 7 +- docs/source/en/api/loaders/lora.md | 7 +- docs/source/en/api/loaders/peft.md | 7 +- .../en/api/loaders/textual_inversion.md | 7 +- docs/source/en/api/loaders/transformer_sd3.md | 7 +- docs/source/en/api/loaders/unet.md | 7 +- .../en/api/models/consistency_decoder_vae.md | 7 +- docs/source/en/api/models/transformer2d.md | 7 +- docs/source/en/api/outputs.md | 7 +- docs/source/en/api/pipelines/allegro.md | 7 +- docs/source/en/api/pipelines/animatediff.md | 21 +- .../en/api/pipelines/attend_and_excite.md | 7 +- docs/source/en/api/pipelines/audioldm.md | 7 +- docs/source/en/api/pipelines/audioldm2.md | 7 +- docs/source/en/api/pipelines/aura_flow.md | 7 +- .../source/en/api/pipelines/blip_diffusion.md | 7 +- docs/source/en/api/pipelines/chroma.md | 7 +- docs/source/en/api/pipelines/cogview3.md | 7 +- docs/source/en/api/pipelines/cogview4.md | 7 +- docs/source/en/api/pipelines/consisid.md | 7 +- .../en/api/pipelines/control_flux_inpaint.md | 7 +- docs/source/en/api/pipelines/controlnet.md | 7 +- .../en/api/pipelines/controlnet_flux.md | 7 +- .../en/api/pipelines/controlnet_hunyuandit.md | 7 +- .../source/en/api/pipelines/controlnet_sd3.md | 7 +- .../en/api/pipelines/controlnet_sdxl.md | 14 +- docs/source/en/api/pipelines/controlnetxs.md | 7 +- .../en/api/pipelines/controlnetxs_sdxl.md | 14 +- docs/source/en/api/pipelines/cosmos.md | 7 +- .../en/api/pipelines/dance_diffusion.md | 7 +- docs/source/en/api/pipelines/ddpm.md | 7 +- docs/source/en/api/pipelines/dit.md | 7 +- docs/source/en/api/pipelines/flux.md | 23 +- docs/source/en/api/pipelines/framepack.md | 7 +- docs/source/en/api/pipelines/hidream.md | 7 +- docs/source/en/api/pipelines/hunyuandit.md | 14 +- docs/source/en/api/pipelines/i2vgenxl.md | 7 +- docs/source/en/api/pipelines/kandinsky.md | 14 +- docs/source/en/api/pipelines/kandinsky3.md | 14 +- docs/source/en/api/pipelines/kandinsky_v22.md | 14 +- docs/source/en/api/pipelines/kolors.md | 14 +- .../en/api/pipelines/latent_diffusion.md | 7 +- docs/source/en/api/pipelines/latte.md | 7 +- docs/source/en/api/pipelines/ledits_pp.md | 14 +- docs/source/en/api/pipelines/lumina.md | 7 +- docs/source/en/api/pipelines/lumina2.md | 7 +- docs/source/en/api/pipelines/marigold.md | 49 +- docs/source/en/api/pipelines/mochi.md | 19 +- docs/source/en/api/pipelines/musicldm.md | 7 +- docs/source/en/api/pipelines/omnigen.md | 7 +- docs/source/en/api/pipelines/overview.md | 15 +- docs/source/en/api/pipelines/pag.md | 7 +- .../en/api/pipelines/paint_by_example.md | 7 +- docs/source/en/api/pipelines/panorama.md | 7 +- docs/source/en/api/pipelines/pia.md | 14 +- docs/source/en/api/pipelines/pix2pix.md | 7 +- docs/source/en/api/pipelines/pixart.md | 21 +- docs/source/en/api/pipelines/pixart_sigma.md | 28 +- docs/source/en/api/pipelines/qwenimage.md | 14 +- docs/source/en/api/pipelines/sana.md | 14 +- docs/source/en/api/pipelines/sana_sprint.md | 7 +- .../api/pipelines/self_attention_guidance.md | 7 +- .../pipelines/semantic_stable_diffusion.md | 7 +- docs/source/en/api/pipelines/shap_e.md | 7 +- .../source/en/api/pipelines/stable_cascade.md 
| 15 +- .../pipelines/stable_diffusion/depth2img.md | 11 +- .../api/pipelines/stable_diffusion/gligen.md | 11 +- .../stable_diffusion/image_variation.md | 7 +- .../api/pipelines/stable_diffusion/img2img.md | 7 +- .../api/pipelines/stable_diffusion/inpaint.md | 11 +- .../stable_diffusion/latent_upscale.md | 11 +- .../stable_diffusion/ldm3d_diffusion.md | 7 +- .../pipelines/stable_diffusion/sdxl_turbo.md | 11 +- .../stable_diffusion/stable_diffusion_2.md | 11 +- .../stable_diffusion/stable_diffusion_3.md | 21 +- .../stable_diffusion/stable_diffusion_safe.md | 7 +- .../stable_diffusion/stable_diffusion_xl.md | 11 +- .../en/api/pipelines/stable_diffusion/svd.md | 15 +- .../pipelines/stable_diffusion/text2img.md | 11 +- .../api/pipelines/stable_diffusion/upscale.md | 11 +- docs/source/en/api/pipelines/stable_unclip.md | 14 +- docs/source/en/api/pipelines/text_to_video.md | 7 +- .../en/api/pipelines/text_to_video_zero.md | 7 +- docs/source/en/api/pipelines/unclip.md | 7 +- docs/source/en/api/pipelines/unidiffuser.md | 14 +- .../en/api/pipelines/value_guided_sampling.md | 14 +- docs/source/en/api/quantization.md | 7 +- docs/source/en/api/schedulers/ddim.md | 7 +- docs/source/en/api/schedulers/score_sde_vp.md | 7 +- docs/source/en/conceptual/evaluation.md | 58 +- docs/source/en/optimization/coreml.md | 7 +- docs/source/en/optimization/fp16.md | 7 +- docs/source/en/optimization/mps.md | 7 +- docs/source/en/optimization/neuron.md | 7 +- docs/source/en/optimization/onnx.md | 7 +- docs/source/en/optimization/xformers.md | 14 +- docs/source/en/quantization/bitsandbytes.md | 21 +- docs/source/en/training/controlnet.md | 21 +- docs/source/en/training/create_dataset.md | 14 +- docs/source/en/training/custom_diffusion.md | 38 +- .../en/training/distributed_inference.md | 3 + docs/source/en/training/dreambooth.md | 64 +- docs/source/en/training/instructpix2pix.md | 29 +- docs/source/en/training/kandinsky.md | 35 +- docs/source/en/training/lcm_distill.md | 14 +- docs/source/en/training/lora.md | 35 +- docs/source/en/training/sdxl.md | 28 +- docs/source/en/training/t2i_adapters.md | 21 +- docs/source/en/training/text2image.md | 28 +- docs/source/en/training/text_inversion.md | 21 +- .../en/training/unconditional_training.md | 21 +- docs/source/en/training/wuerstchen.md | 21 +- docs/source/en/tutorials/basic_training.md | 21 +- .../conditional_image_generation.md | 21 +- .../using-diffusers/controlling_generation.md | 20 +- docs/source/en/using-diffusers/diffedit.md | 7 +- docs/source/en/using-diffusers/img2img.md | 21 +- .../inference_with_tcd_lora.md | 5 +- docs/source/en/using-diffusers/inpaint.md | 14 +- docs/source/en/using-diffusers/kandinsky.md | 47 +- docs/source/en/using-diffusers/pag.md | 7 +- docs/source/en/using-diffusers/sdxl.md | 36 +- docs/source/en/using-diffusers/shap-e.md | 7 +- .../unconditional_image_generation.md | 7 +- .../en/using-diffusers/weighted_prompts.md | 7 +- .../en/using-diffusers/write_own_pipeline.md | 29 +- docs/source/ja/installation.md | 7 +- docs/source/ja/quicktour.md | 28 +- docs/source/ja/stable_diffusion.md | 14 +- docs/source/ja/tutorials/autopipeline.md | 7 +- .../stable_diffusion/stable_diffusion_xl.md | 14 +- docs/source/ko/conceptual/evaluation.md | 44 +- docs/source/ko/installation.md | 7 +- docs/source/ko/optimization/coreml.md | 7 +- docs/source/ko/optimization/fp16.md | 22 +- docs/source/ko/optimization/mps.md | 7 +- docs/source/ko/optimization/xformers.md | 14 +- docs/source/ko/quicktour.md | 28 +- docs/source/ko/stable_diffusion.md | 14 +- 
docs/source/ko/training/controlnet.md | 7 +- docs/source/ko/training/create_dataset.md | 14 +- .../ko/training/distributed_inference.md | 5 +- docs/source/ko/training/dreambooth.md | 14 +- docs/source/ko/training/lora.md | 21 +- docs/source/ko/training/text2image.md | 7 +- docs/source/ko/training/text_inversion.md | 36 +- .../ko/training/unconditional_training.md | 7 +- docs/source/ko/tutorials/basic_training.md | 7 +- .../using-diffusers/controlling_generation.md | 20 +- .../custom_pipeline_overview.md | 7 +- docs/source/ko/using-diffusers/diffedit.md | 7 +- docs/source/ko/using-diffusers/img2img.md | 7 +- docs/source/ko/using-diffusers/inpaint.md | 7 +- docs/source/ko/using-diffusers/kandinsky.md | 47 +- docs/source/ko/using-diffusers/loading.md | 14 +- .../ko/using-diffusers/loading_adapters.md | 39 +- .../ko/using-diffusers/other-formats.md | 7 +- docs/source/ko/using-diffusers/schedulers.md | 13 +- docs/source/ko/using-diffusers/shap-e.md | 7 +- .../unconditional_image_generation.md | 7 +- .../ko/using-diffusers/write_own_pipeline.md | 29 +- docs/source/pt/installation.md | 7 +- docs/source/pt/quicktour.md | 28 +- docs/source/zh/conceptual/evaluation.md | 44 +- docs/source/zh/installation.md | 7 +- docs/source/zh/optimization/coreml.md | 7 +- docs/source/zh/optimization/fp16.md | 7 +- docs/source/zh/optimization/mps.md | 7 +- docs/source/zh/optimization/neuron.md | 7 +- docs/source/zh/optimization/onnx.md | 7 +- docs/source/zh/optimization/xformers.md | 14 +- docs/source/zh/quicktour.md | 31 +- docs/source/zh/stable_diffusion.md | 522 +++++++------- docs/source/zh/training/controlnet.md | 28 +- .../zh/training/distributed_inference.md | 7 +- docs/source/zh/training/dreambooth.md | 64 +- docs/source/zh/training/instructpix2pix.md | 29 +- docs/source/zh/training/kandinsky.md | 39 +- docs/source/zh/training/lora.md | 35 +- docs/source/zh/training/text2image.md | 35 +- docs/source/zh/training/text_inversion.md | 21 +- docs/source/zh/training/wuerstchen.md | 21 +- examples/community/matryoshka.py | 21 +- .../pipeline_stable_diffusion_boxdiff.py | 14 +- .../pipeline_stable_diffusion_pag.py | 10 +- examples/model_search/pipeline_easy.py | 54 +- src/diffusers/guiders/guider_utils.py | 10 +- src/diffusers/loaders/lora_base.py | 12 +- src/diffusers/loaders/lora_pipeline.py | 36 +- src/diffusers/models/attention.py | 6 +- src/diffusers/models/attention_processor.py | 6 +- src/diffusers/models/auto_model.py | 10 +- .../models/autoencoders/autoencoder_kl.py | 12 +- .../models/controlnets/controlnet_sd3.py | 12 +- .../models/controlnets/controlnet_xs.py | 12 +- src/diffusers/models/modeling_flax_utils.py | 12 +- src/diffusers/models/modeling_utils.py | 18 +- .../transformers/auraflow_transformer_2d.py | 12 +- .../transformers/cogvideox_transformer_3d.py | 12 +- .../transformers/hunyuan_transformer_2d.py | 12 +- .../transformers/pixart_transformer_2d.py | 12 +- .../models/transformers/transformer_sd3.py | 12 +- .../models/unets/unet_2d_condition.py | 12 +- .../models/unets/unet_3d_condition.py | 12 +- src/diffusers/models/unets/unet_i2vgen_xl.py | 12 +- .../models/unets/unet_motion_model.py | 12 +- .../modular_pipelines/components_manager.py | 6 +- .../flux/modular_pipeline.py | 6 +- .../modular_pipelines/modular_pipeline.py | 38 +- src/diffusers/modular_pipelines/node_utils.py | 661 ++++++++++++++++++ .../qwenimage/modular_pipeline.py | 12 +- .../stable_diffusion_xl/modular_pipeline.py | 6 +- .../modular_pipelines/wan/modular_pipeline.py | 6 +- src/diffusers/pipelines/auto_pipeline.py | 24 +- 
.../controlnet/pipeline_controlnet_inpaint.py | 13 +- .../controlnet/pipeline_flax_controlnet.py | 8 +- .../versatile_diffusion/modeling_text_unet.py | 12 +- .../pag/pipeline_pag_controlnet_sd_inpaint.py | 14 +- .../pipeline_paint_by_example.py | 6 +- .../pipelines/pipeline_flax_utils.py | 8 +- src/diffusers/pipelines/pipeline_utils.py | 67 +- .../pipeline_flax_stable_diffusion.py | 8 +- .../pipeline_flax_stable_diffusion_img2img.py | 8 +- .../pipeline_flax_stable_diffusion_inpaint.py | 14 +- .../pipeline_stable_diffusion_diffedit.py | 6 +- .../pipeline_stable_diffusion_k_diffusion.py | 6 +- .../deprecated/scheduling_karras_ve.py | 10 +- .../scheduling_consistency_models.py | 6 +- .../scheduling_cosine_dpmsolver_multistep.py | 8 +- .../scheduling_dpmsolver_multistep.py | 8 +- .../scheduling_dpmsolver_multistep_inverse.py | 8 +- .../scheduling_dpmsolver_singlestep.py | 8 +- .../scheduling_edm_dpmsolver_multistep.py | 8 +- .../schedulers/scheduling_sasolver.py | 8 +- src/diffusers/schedulers/scheduling_utils.py | 10 +- .../schedulers/scheduling_utils_flax.py | 17 +- src/diffusers/utils/dynamic_modules_utils.py | 24 +- src/diffusers/utils/outputs.py | 8 +- 239 files changed, 1948 insertions(+), 2657 deletions(-) create mode 100644 src/diffusers/modular_pipelines/node_utils.py diff --git a/docs/source/en/api/configuration.md b/docs/source/en/api/configuration.md index bc58e190b8da..328e109e1e4c 100644 --- a/docs/source/en/api/configuration.md +++ b/docs/source/en/api/configuration.md @@ -14,11 +14,8 @@ specific language governing permissions and limitations under the License. Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from [`ModelMixin`] inherit from [`ConfigMixin`] which stores all the parameters that are passed to their respective `__init__` methods in a JSON-configuration file. - - -To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf auth login`. - - +> [!TIP] +> To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf auth login`. ## ConfigMixin diff --git a/docs/source/en/api/loaders/ip_adapter.md b/docs/source/en/api/loaders/ip_adapter.md index 0c94bcb2208b..c2c45bee1022 100644 --- a/docs/source/en/api/loaders/ip_adapter.md +++ b/docs/source/en/api/loaders/ip_adapter.md @@ -14,11 +14,8 @@ specific language governing permissions and limitations under the License. [IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder. - - -Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading](../../using-diffusers/loading_adapters#ip-adapter) guide, and you can see how to use it in the [usage](../../using-diffusers/ip_adapter) guide. - - +> [!TIP] +> Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading](../../using-diffusers/loading_adapters#ip-adapter) guide, and you can see how to use it in the [usage](../../using-diffusers/ip_adapter) guide. 
## IPAdapterMixin diff --git a/docs/source/en/api/loaders/lora.md b/docs/source/en/api/loaders/lora.md index da5c3842c641..bf22a32d74f3 100644 --- a/docs/source/en/api/loaders/lora.md +++ b/docs/source/en/api/loaders/lora.md @@ -33,11 +33,8 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi - [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwen) - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more. - - -To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide. - - +> [!TIP] +> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide. ## LoraBaseMixin diff --git a/docs/source/en/api/loaders/peft.md b/docs/source/en/api/loaders/peft.md index a371ab9c8ea3..5508509c8823 100644 --- a/docs/source/en/api/loaders/peft.md +++ b/docs/source/en/api/loaders/peft.md @@ -14,11 +14,8 @@ specific language governing permissions and limitations under the License. Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter. - - -Refer to the [Inference with PEFT](../../tutorials/using_peft_for_inference.md) tutorial for an overview of how to use PEFT in Diffusers for inference. - - +> [!TIP] +> Refer to the [Inference with PEFT](../../tutorials/using_peft_for_inference.md) tutorial for an overview of how to use PEFT in Diffusers for inference. ## PeftAdapterMixin diff --git a/docs/source/en/api/loaders/textual_inversion.md b/docs/source/en/api/loaders/textual_inversion.md index 30d8f5b8d57a..2cb54ce4ea3a 100644 --- a/docs/source/en/api/loaders/textual_inversion.md +++ b/docs/source/en/api/loaders/textual_inversion.md @@ -16,11 +16,8 @@ Textual Inversion is a training method for personalizing models by learning new [`TextualInversionLoaderMixin`] provides a function for loading Textual Inversion embeddings from Diffusers and Automatic1111 into the text encoder and loading a special token to activate the embeddings. - - -To learn more about how to load Textual Inversion embeddings, see the [Textual Inversion](../../using-diffusers/loading_adapters#textual-inversion) loading guide. - - +> [!TIP] +> To learn more about how to load Textual Inversion embeddings, see the [Textual Inversion](../../using-diffusers/loading_adapters#textual-inversion) loading guide. ## TextualInversionLoaderMixin diff --git a/docs/source/en/api/loaders/transformer_sd3.md b/docs/source/en/api/loaders/transformer_sd3.md index 0e7664cdd16e..cc3ec0da145e 100644 --- a/docs/source/en/api/loaders/transformer_sd3.md +++ b/docs/source/en/api/loaders/transformer_sd3.md @@ -16,11 +16,8 @@ This class is useful when *only* loading weights into a [`SD3Transformer2DModel` The [`SD3Transformer2DLoadersMixin`] class currently only loads IP-Adapter weights, but will be used in the future to save weights and load LoRAs. - - -To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide. - - +> [!TIP] +> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide. 
## SD3Transformer2DLoadersMixin diff --git a/docs/source/en/api/loaders/unet.md b/docs/source/en/api/loaders/unet.md index 51b4c4ef48d9..7450e03e580b 100644 --- a/docs/source/en/api/loaders/unet.md +++ b/docs/source/en/api/loaders/unet.md @@ -16,11 +16,8 @@ Some training methods - like LoRA and Custom Diffusion - typically target the UN The [`UNet2DConditionLoadersMixin`] class provides functions for loading and saving weights, fusing and unfusing LoRAs, disabling and enabling LoRAs, and setting and deleting adapters. - - -To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide. - - +> [!TIP] +> To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide. ## UNet2DConditionLoadersMixin diff --git a/docs/source/en/api/models/consistency_decoder_vae.md b/docs/source/en/api/models/consistency_decoder_vae.md index cf4955a07462..fe039df7f9bf 100644 --- a/docs/source/en/api/models/consistency_decoder_vae.md +++ b/docs/source/en/api/models/consistency_decoder_vae.md @@ -16,11 +16,8 @@ Consistency decoder can be used to decode the latents from the denoising UNet in The original codebase can be found at [openai/consistencydecoder](https://github.com/openai/consistencydecoder). - - -Inference is only supported for 2 iterations as of now. - - +> [!WARNING] +> Inference is only supported for 2 iterations as of now. The pipeline could not have been contributed without the help of [madebyollin](https://github.com/madebyollin) and [mrsteyk](https://github.com/mrsteyk) from [this issue](https://github.com/openai/consistencydecoder/issues/1). diff --git a/docs/source/en/api/models/transformer2d.md b/docs/source/en/api/models/transformer2d.md index 16ae6ace97db..d8e0a858b0e7 100644 --- a/docs/source/en/api/models/transformer2d.md +++ b/docs/source/en/api/models/transformer2d.md @@ -22,11 +22,8 @@ When the input is **continuous**: When the input is **discrete**: - - -It is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised image don't contain a prediction for the masked pixel because the unnoised image cannot be masked. - - +> [!TIP] +> It is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised image don't contain a prediction for the masked pixel because the unnoised image cannot be masked. 1. Convert input (classes of latent pixels) to embeddings and apply positional embeddings. 2. Apply the Transformer blocks in the standard way. diff --git a/docs/source/en/api/outputs.md b/docs/source/en/api/outputs.md index a13bc89f2bea..0fba1ab2fae8 100644 --- a/docs/source/en/api/outputs.md +++ b/docs/source/en/api/outputs.md @@ -39,11 +39,8 @@ For instance, retrieving an image by indexing into it returns the tuple `(output outputs[:1] ``` - - -To check a specific pipeline or model output, refer to its corresponding API documentation. - - +> [!TIP] +> To check a specific pipeline or model output, refer to its corresponding API documentation. 
## BaseOutput diff --git a/docs/source/en/api/pipelines/allegro.md b/docs/source/en/api/pipelines/allegro.md index 09313c2db093..a981fb1f94f7 100644 --- a/docs/source/en/api/pipelines/allegro.md +++ b/docs/source/en/api/pipelines/allegro.md @@ -17,11 +17,8 @@ The abstract from the paper is: *Significant advancements have been made in the field of video generation, with the open-source community contributing a wealth of research papers and tools for training high-quality models. However, despite these efforts, the available information and resources remain insufficient for achieving commercial-level performance. In this report, we open the black box and introduce Allegro, an advanced video generation model that excels in both quality and temporal consistency. We also highlight the current limitations in the field and present a comprehensive methodology for training high-performance, commercial-level video generation models, addressing key aspects such as data, model architecture, training pipeline, and evaluation. Our user study shows that Allegro surpasses existing open-source models and most commercial models, ranking just behind Hailuo and Kling. Code: https://github.com/rhymes-ai/Allegro , Model: https://huggingface.co/rhymes-ai/Allegro , Gallery: https://rhymes.ai/allegro_gallery .* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## Quantization diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index aeec3254ca46..f0188f3c36fb 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -102,11 +102,8 @@ Here are some sample outputs: - - -AnimateDiff tends to work better with finetuned Stable Diffusion models. If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the AnimateDiff checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`. - - +> [!TIP] +> AnimateDiff tends to work better with finetuned Stable Diffusion models. If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the AnimateDiff checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`. ### AnimateDiffControlNetPipeline @@ -799,17 +796,11 @@ frames = output.frames[0] export_to_gif(frames, "animation.gif") ``` - - -FreeInit is not really free - the improved quality comes at the cost of extra computation. It requires sampling a few extra times depending on the `num_iters` parameter that is set when enabling it. 
Setting the `use_fast_sampling` parameter to `True` can improve the overall performance (at the cost of lower quality compared to when `use_fast_sampling=False` but still better results than vanilla video generation models). - - - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. +> [!WARNING] +> FreeInit is not really free - the improved quality comes at the cost of extra computation. It requires sampling a few extra times depending on the `num_iters` parameter that is set when enabling it. Setting the `use_fast_sampling` parameter to `True` can improve the overall performance (at the cost of lower quality compared to when `use_fast_sampling=False` but still better results than vanilla video generation models). - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. diff --git a/docs/source/en/api/pipelines/attend_and_excite.md b/docs/source/en/api/pipelines/attend_and_excite.md index b5ce3bb767c3..e7d1e1d2b87c 100644 --- a/docs/source/en/api/pipelines/attend_and_excite.md +++ b/docs/source/en/api/pipelines/attend_and_excite.md @@ -23,11 +23,8 @@ The abstract from the paper is: You can find additional information about Attend-and-Excite on the [project page](https://attendandexcite.github.io/Attend-and-Excite/), the [original codebase](https://github.com/AttendAndExcite/Attend-and-Excite), or try it out in a [demo](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionAttendAndExcitePipeline diff --git a/docs/source/en/api/pipelines/audioldm.md b/docs/source/en/api/pipelines/audioldm.md index 6b143d299037..c8073a14ef0a 100644 --- a/docs/source/en/api/pipelines/audioldm.md +++ b/docs/source/en/api/pipelines/audioldm.md @@ -38,11 +38,8 @@ During inference: * The _quality_ of the predicted audio sample can be controlled by the `num_inference_steps` argument; higher steps give higher quality audio at the expense of slower inference. * The _length_ of the predicted audio sample can be controlled by varying the `audio_length_in_s` argument. 
- - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## AudioLDMPipeline [[autodoc]] AudioLDMPipeline diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md index 1a196099d712..45a9002ea070 100644 --- a/docs/source/en/api/pipelines/audioldm2.md +++ b/docs/source/en/api/pipelines/audioldm2.md @@ -58,11 +58,8 @@ See table below for details on the three checkpoints: The following example demonstrates how to construct good music and speech generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## AudioLDM2Pipeline [[autodoc]] AudioLDM2Pipeline diff --git a/docs/source/en/api/pipelines/aura_flow.md b/docs/source/en/api/pipelines/aura_flow.md index 1d6002335ce3..67951859b962 100644 --- a/docs/source/en/api/pipelines/aura_flow.md +++ b/docs/source/en/api/pipelines/aura_flow.md @@ -16,11 +16,8 @@ AuraFlow is inspired by [Stable Diffusion 3](../pipelines/stable_diffusion/stabl It was developed by the Fal team and more details about it can be found in [this blog post](https://blog.fal.ai/auraflow/). - - -AuraFlow can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. - - +> [!TIP] +> AuraFlow can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. 
## Quantization diff --git a/docs/source/en/api/pipelines/blip_diffusion.md b/docs/source/en/api/pipelines/blip_diffusion.md index d94281a4a91a..b9c6ed7b5fbf 100644 --- a/docs/source/en/api/pipelines/blip_diffusion.md +++ b/docs/source/en/api/pipelines/blip_diffusion.md @@ -26,11 +26,8 @@ The original codebase can be found at [salesforce/LAVIS](https://github.com/sale `BlipDiffusionPipeline` and `BlipDiffusionControlNetPipeline` were contributed by [`ayushtues`](https://github.com/ayushtues/). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## BlipDiffusionPipeline diff --git a/docs/source/en/api/pipelines/chroma.md b/docs/source/en/api/pipelines/chroma.md index 40e290e4bdd6..df03fbb325d7 100644 --- a/docs/source/en/api/pipelines/chroma.md +++ b/docs/source/en/api/pipelines/chroma.md @@ -21,11 +21,8 @@ Chroma is a text to image generation model based on Flux. Original model checkpoints for Chroma can be found [here](https://huggingface.co/lodestones/Chroma). - - -Chroma can use all the same optimizations as Flux. - - +> [!TIP] +> Chroma can use all the same optimizations as Flux. ## Inference diff --git a/docs/source/en/api/pipelines/cogview3.md b/docs/source/en/api/pipelines/cogview3.md index 0180fee3002d..5ee02e1a7039 100644 --- a/docs/source/en/api/pipelines/cogview3.md +++ b/docs/source/en/api/pipelines/cogview3.md @@ -21,11 +21,8 @@ The abstract from the paper is: *Recent advancements in text-to-image generative systems have been largely driven by diffusion models. However, single-stage text-to-image diffusion models still face challenges, in terms of computational efficiency and the refinement of image details. To tackle the issue, we propose CogView3, an innovative cascaded framework that enhances the performance of text-to-image diffusion. CogView3 is the first model implementing relay diffusion in the realm of text-to-image generation, executing the task by first creating low-resolution images and subsequently applying relay-based super-resolution. This methodology not only results in competitive text-to-image outputs but also greatly reduces both training and inference costs. Our experimental results demonstrate that CogView3 outperforms SDXL, the current state-of-the-art open-source text-to-image diffusion model, by 77.0% in human evaluations, all while requiring only about 1/2 of the inference time. The distilled variant of CogView3 achieves comparable performance while only utilizing 1/10 of the inference time by SDXL.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM). diff --git a/docs/source/en/api/pipelines/cogview4.md b/docs/source/en/api/pipelines/cogview4.md index 50ba5baa6210..7857dc8c9476 100644 --- a/docs/source/en/api/pipelines/cogview4.md +++ b/docs/source/en/api/pipelines/cogview4.md @@ -15,11 +15,8 @@ # CogView4 - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM). diff --git a/docs/source/en/api/pipelines/consisid.md b/docs/source/en/api/pipelines/consisid.md index db6b5e59aca3..bba047292413 100644 --- a/docs/source/en/api/pipelines/consisid.md +++ b/docs/source/en/api/pipelines/consisid.md @@ -25,11 +25,8 @@ The abstract from the paper is: *Identity-preserving text-to-video (IPT2V) generation aims to create high-fidelity videos with consistent human identity. It is an important task in video generation but remains an open problem for generative models. This paper pushes the technical frontier of IPT2V in two directions that have not been resolved in the literature: (1) A tuning-free pipeline without tedious case-by-case finetuning, and (2) A frequency-aware heuristic identity-preserving Diffusion Transformer (DiT)-based control scheme. To achieve these goals, we propose **ConsisID**, a tuning-free DiT-based controllable IPT2V model to keep human-**id**entity **consis**tent in the generated video. Inspired by prior findings in frequency analysis of vision/diffusion transformers, it employs identity-control signals in the frequency domain, where facial features can be decomposed into low-frequency global features (e.g., profile, proportions) and high-frequency intrinsic features (e.g., identity markers that remain unaffected by pose changes). First, from a low-frequency perspective, we introduce a global facial extractor, which encodes the reference image and facial key points into a latent space, generating features enriched with low-frequency information. These features are then integrated into the shallow layers of the network to alleviate training challenges associated with DiT. 
Second, from a high-frequency perspective, we design a local facial extractor to capture high-frequency details and inject them into the transformer blocks, enhancing the model's ability to preserve fine-grained features. To leverage the frequency information for identity preservation, we propose a hierarchical training strategy, transforming a vanilla pre-trained video generation model into an IPT2V model. Extensive experiments demonstrate that our frequency-aware heuristic scheme provides an optimal control solution for DiT-based models. Thanks to this scheme, our **ConsisID** achieves excellent results in generating high-quality, identity-preserving videos, making strides towards more effective IPT2V. The model weight of ConsID is publicly available at https://github.com/PKU-YuanGroup/ConsisID.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. This pipeline was contributed by [SHYuanBest](https://github.com/SHYuanBest). The original codebase can be found [here](https://github.com/PKU-YuanGroup/ConsisID). The original weights can be found under [hf.co/BestWishYsh](https://huggingface.co/BestWishYsh). diff --git a/docs/source/en/api/pipelines/control_flux_inpaint.md b/docs/source/en/api/pipelines/control_flux_inpaint.md index 03a4fbebb8ba..4b087f20efcd 100644 --- a/docs/source/en/api/pipelines/control_flux_inpaint.md +++ b/docs/source/en/api/pipelines/control_flux_inpaint.md @@ -26,11 +26,8 @@ FLUX.1 Depth and Canny [dev] is a 12 billion parameter rectified flow transforme | Canny | [Black Forest Labs](https://huggingface.co/black-forest-labs) | [Link](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev) | - - -Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c). - - +> [!TIP] +> Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c). 
```python import torch diff --git a/docs/source/en/api/pipelines/controlnet.md b/docs/source/en/api/pipelines/controlnet.md index 2a654a37357f..afc0a4653e07 100644 --- a/docs/source/en/api/pipelines/controlnet.md +++ b/docs/source/en/api/pipelines/controlnet.md @@ -28,11 +28,8 @@ This model was contributed by [takuma104](https://huggingface.co/takuma104). ❤ The original codebase can be found at [lllyasviel/ControlNet](https://github.com/lllyasviel/ControlNet), and you can find official ControlNet checkpoints on [lllyasviel's](https://huggingface.co/lllyasviel) Hub profile. - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionControlNetPipeline [[autodoc]] StableDiffusionControlNetPipeline diff --git a/docs/source/en/api/pipelines/controlnet_flux.md b/docs/source/en/api/pipelines/controlnet_flux.md index 9feb73652306..ff38ca3f2c2e 100644 --- a/docs/source/en/api/pipelines/controlnet_flux.md +++ b/docs/source/en/api/pipelines/controlnet_flux.md @@ -44,11 +44,8 @@ XLabs ControlNets are also supported, which was contributed by the [XLabs team]( | HED | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-hed-diffusers) | - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## FluxControlNetPipeline [[autodoc]] FluxControlNetPipeline diff --git a/docs/source/en/api/pipelines/controlnet_hunyuandit.md b/docs/source/en/api/pipelines/controlnet_hunyuandit.md index c79b2dbf650e..88dc2de10a64 100644 --- a/docs/source/en/api/pipelines/controlnet_hunyuandit.md +++ b/docs/source/en/api/pipelines/controlnet_hunyuandit.md @@ -24,11 +24,8 @@ The abstract from the paper is: This code is implemented by Tencent Hunyuan Team. You can find pre-trained checkpoints for Hunyuan-DiT ControlNets on [Tencent Hunyuan](https://huggingface.co/Tencent-Hunyuan). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## HunyuanDiTControlNetPipeline [[autodoc]] HunyuanDiTControlNetPipeline diff --git a/docs/source/en/api/pipelines/controlnet_sd3.md b/docs/source/en/api/pipelines/controlnet_sd3.md index 067c1c6b01cb..8cdada9edf43 100644 --- a/docs/source/en/api/pipelines/controlnet_sd3.md +++ b/docs/source/en/api/pipelines/controlnet_sd3.md @@ -38,11 +38,8 @@ This controlnet code is mainly implemented by [The InstantX Team](https://huggin | Inpainting | [The AlimamaCreative Team](https://huggingface.co/alimama-creative) | [link](https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting) | - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusion3ControlNetPipeline [[autodoc]] StableDiffusion3ControlNetPipeline diff --git a/docs/source/en/api/pipelines/controlnet_sdxl.md b/docs/source/en/api/pipelines/controlnet_sdxl.md index cb0554a1cc8e..89fc1c389798 100644 --- a/docs/source/en/api/pipelines/controlnet_sdxl.md +++ b/docs/source/en/api/pipelines/controlnet_sdxl.md @@ -26,19 +26,13 @@ The abstract from the paper is: You can find additional smaller Stable Diffusion XL (SDXL) ControlNet checkpoints from the 🤗 [Diffusers](https://huggingface.co/diffusers) Hub organization, and browse [community-trained](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet) checkpoints on the Hub. - - -🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! - - +> [!WARNING] +> 🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! If you don't see a checkpoint you're interested in, you can train your own SDXL ControlNet with our [training script](../../../../../examples/controlnet/README_sdxl). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionXLControlNetPipeline [[autodoc]] StableDiffusionXLControlNetPipeline diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index aea8cb2e867f..d44fb0cf0fdf 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -31,11 +31,8 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionControlNetXSPipeline [[autodoc]] StableDiffusionControlNetXSPipeline diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index 76937b16c54c..7ae0e2a2a178 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -27,17 +27,11 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ - - -🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! - - - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. +> [!WARNING] +> 🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
## StableDiffusionXLControlNetXSPipeline [[autodoc]] StableDiffusionXLControlNetXSPipeline diff --git a/docs/source/en/api/pipelines/cosmos.md b/docs/source/en/api/pipelines/cosmos.md index dba807c5cee9..fb9453480e74 100644 --- a/docs/source/en/api/pipelines/cosmos.md +++ b/docs/source/en/api/pipelines/cosmos.md @@ -18,11 +18,8 @@ *Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## Loading original format checkpoints diff --git a/docs/source/en/api/pipelines/dance_diffusion.md b/docs/source/en/api/pipelines/dance_diffusion.md index 5805561e4916..0434f6319592 100644 --- a/docs/source/en/api/pipelines/dance_diffusion.md +++ b/docs/source/en/api/pipelines/dance_diffusion.md @@ -20,11 +20,8 @@ specific language governing permissions and limitations under the License. Dance Diffusion is the first in a suite of generative audio tools for producers and musicians released by [Harmonai](https://github.com/Harmonai-org). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## DanceDiffusionPipeline [[autodoc]] DanceDiffusionPipeline diff --git a/docs/source/en/api/pipelines/ddpm.md b/docs/source/en/api/pipelines/ddpm.md index 716cf7327577..63c2fcaf8923 100644 --- a/docs/source/en/api/pipelines/ddpm.md +++ b/docs/source/en/api/pipelines/ddpm.md @@ -20,11 +20,8 @@ The abstract from the paper is: The original codebase can be found at [hohonathanho/diffusion](https://github.com/hojonathanho/diffusion). 
- - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. # DDPMPipeline [[autodoc]] DDPMPipeline diff --git a/docs/source/en/api/pipelines/dit.md b/docs/source/en/api/pipelines/dit.md index e87058899b97..16d0c999619d 100644 --- a/docs/source/en/api/pipelines/dit.md +++ b/docs/source/en/api/pipelines/dit.md @@ -20,11 +20,8 @@ The abstract from the paper is: The original codebase can be found at [facebookresearch/dit](https://github.com/facebookresearch/dit). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## DiTPipeline [[autodoc]] DiTPipeline diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md index bb7275822247..1a89de98e48f 100644 --- a/docs/source/en/api/pipelines/flux.md +++ b/docs/source/en/api/pipelines/flux.md @@ -21,13 +21,10 @@ Flux is a series of text-to-image generation models based on diffusion transform Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux). - - -Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c). - -[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs. - - +> [!TIP] +> Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. 
For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c). +> +> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs. Flux comes in the following variants: @@ -420,11 +417,8 @@ When unloading the Control LoRA weights, call `pipe.unload_lora_weights(reset_to ## IP-Adapter - - -Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work. - - +> [!TIP] +> Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work. An IP-Adapter lets you prompt Flux with images, in addition to the text prompt. This is especially useful when describing complex concepts that are difficult to articulate through text alone and you have reference images. @@ -604,9 +598,8 @@ image.save("flux.png") The `FluxTransformer2DModel` supports loading checkpoints in the original format shipped by Black Forest Labs. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community. - -`FP8` inference can be brittle depending on the GPU type, CUDA version, and `torch` version that you are using. It is recommended that you use the `optimum-quanto` library in order to run FP8 inference on your machine. - +> [!TIP] +> `FP8` inference can be brittle depending on the GPU type, CUDA version, and `torch` version that you are using. It is recommended that you use the `optimum-quanto` library in order to run FP8 inference on your machine. The following example demonstrates how to run Flux with less than 16GB of VRAM. diff --git a/docs/source/en/api/pipelines/framepack.md b/docs/source/en/api/pipelines/framepack.md index ba7b2d0dc0f1..a25cfe24a4ba 100644 --- a/docs/source/en/api/pipelines/framepack.md +++ b/docs/source/en/api/pipelines/framepack.md @@ -22,11 +22,8 @@ *We present a neural network structure, FramePack, to train next-frame (or next-frame-section) prediction models for video generation. The FramePack compresses input frames to make the transformer context length a fixed number regardless of the video length. As a result, we are able to process a large number of frames using video diffusion with computation bottleneck similar to image diffusion. This also makes the training video batch sizes significantly higher (batch sizes become comparable to image diffusion training). We also propose an anti-drifting sampling method that generates frames in inverted temporal order with early-established endpoints to avoid exposure bias (error accumulation over iterations). Finally, we show that existing video diffusion models can be finetuned with FramePack, and their visual quality may be improved because the next-frame prediction supports more balanced diffusion schedulers with less extreme flow shift timesteps.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## Available models diff --git a/docs/source/en/api/pipelines/hidream.md b/docs/source/en/api/pipelines/hidream.md index 9848612c3300..acfcef93e0ad 100644 --- a/docs/source/en/api/pipelines/hidream.md +++ b/docs/source/en/api/pipelines/hidream.md @@ -16,11 +16,8 @@ [HiDream-I1](https://huggingface.co/HiDream-ai) by HiDream.ai - - -[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs. - - +> [!TIP] +> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs. ## Available models diff --git a/docs/source/en/api/pipelines/hunyuandit.md b/docs/source/en/api/pipelines/hunyuandit.md index 07e869ba95ae..3f4db66c6c94 100644 --- a/docs/source/en/api/pipelines/hunyuandit.md +++ b/docs/source/en/api/pipelines/hunyuandit.md @@ -28,17 +28,11 @@ HunyuanDiT has the following components: * It uses a diffusion transformer as the backbone * It combines two text encoders, a bilingual CLIP and a multilingual T5 encoder - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - - - - -You can further improve generation quality by passing the generated image from [`HungyuanDiTPipeline`] to the [SDXL refiner](../../using-diffusers/sdxl#base-to-refiner-model) model. - - +> [!TIP] +> You can further improve generation quality by passing the generated image from [`HungyuanDiTPipeline`] to the [SDXL refiner](../../using-diffusers/sdxl#base-to-refiner-model) model. ## Optimization diff --git a/docs/source/en/api/pipelines/i2vgenxl.md b/docs/source/en/api/pipelines/i2vgenxl.md index 76a51a6cd57a..711a5625f99c 100644 --- a/docs/source/en/api/pipelines/i2vgenxl.md +++ b/docs/source/en/api/pipelines/i2vgenxl.md @@ -23,11 +23,8 @@ The abstract from the paper is: The original codebase can be found [here](https://github.com/ali-vilab/i2vgen-xl/). The model checkpoints can be found [here](https://huggingface.co/ali-vilab/). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section [here](../../using-diffusers/svd#reduce-memory-usage). 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section [here](../../using-diffusers/svd#reduce-memory-usage). Sample output with I2VGenXL: diff --git a/docs/source/en/api/pipelines/kandinsky.md b/docs/source/en/api/pipelines/kandinsky.md index 90c76954ab96..7717f2db69a5 100644 --- a/docs/source/en/api/pipelines/kandinsky.md +++ b/docs/source/en/api/pipelines/kandinsky.md @@ -17,17 +17,11 @@ The description from it's GitHub page is: The original codebase can be found at [ai-forever/Kandinsky-2](https://github.com/ai-forever/Kandinsky-2). - +> [!TIP] +> Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting. -Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting. - - - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## KandinskyPriorPipeline diff --git a/docs/source/en/api/pipelines/kandinsky3.md b/docs/source/en/api/pipelines/kandinsky3.md index 1727387c4a26..f08afa887904 100644 --- a/docs/source/en/api/pipelines/kandinsky3.md +++ b/docs/source/en/api/pipelines/kandinsky3.md @@ -28,17 +28,11 @@ Its architecture includes 3 main components: The original codebase can be found at [ai-forever/Kandinsky-3](https://github.com/ai-forever/Kandinsky-3). - +> [!TIP] +> Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting. -Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting. - - - - - -Make sure to check out the schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## Kandinsky3Pipeline diff --git a/docs/source/en/api/pipelines/kandinsky_v22.md b/docs/source/en/api/pipelines/kandinsky_v22.md index e68c094e23f0..0e0ed80db61c 100644 --- a/docs/source/en/api/pipelines/kandinsky_v22.md +++ b/docs/source/en/api/pipelines/kandinsky_v22.md @@ -17,17 +17,11 @@ The description from it's GitHub page is: The original codebase can be found at [ai-forever/Kandinsky-2](https://github.com/ai-forever/Kandinsky-2). - +> [!TIP] +> Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting. -Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting. - - - - - -Make sure to check out the schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## KandinskyV22PriorPipeline diff --git a/docs/source/en/api/pipelines/kolors.md b/docs/source/en/api/pipelines/kolors.md index 048f6c1de980..b4c83fe134f5 100644 --- a/docs/source/en/api/pipelines/kolors.md +++ b/docs/source/en/api/pipelines/kolors.md @@ -50,17 +50,11 @@ image.save("kolors_sample.png") Kolors needs a different IP Adapter to work, and it uses [Openai-CLIP-336](https://huggingface.co/openai/clip-vit-large-patch14-336) as an image encoder. - +> [!TIP] +> Using an IP Adapter with Kolors requires more than 24GB of VRAM. To use it, we recommend using [`~DiffusionPipeline.enable_model_cpu_offload`] on consumer GPUs. -Using an IP Adapter with Kolors requires more than 24GB of VRAM. To use it, we recommend using [`~DiffusionPipeline.enable_model_cpu_offload`] on consumer GPUs. - - - - - -While Kolors is integrated in Diffusers, you need to load the image encoder from a revision to use the safetensor files. You can still use the main branch of the original repository if you're comfortable loading pickle checkpoints. - - +> [!TIP] +> While Kolors is integrated in Diffusers, you need to load the image encoder from a revision to use the safetensor files. You can still use the main branch of the original repository if you're comfortable loading pickle checkpoints. 
```python import torch diff --git a/docs/source/en/api/pipelines/latent_diffusion.md b/docs/source/en/api/pipelines/latent_diffusion.md index 5489d673f557..cefed90e86a5 100644 --- a/docs/source/en/api/pipelines/latent_diffusion.md +++ b/docs/source/en/api/pipelines/latent_diffusion.md @@ -20,11 +20,8 @@ The abstract from the paper is: The original codebase can be found at [CompVis/latent-diffusion](https://github.com/CompVis/latent-diffusion). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## LDMTextToImagePipeline [[autodoc]] LDMTextToImagePipeline diff --git a/docs/source/en/api/pipelines/latte.md b/docs/source/en/api/pipelines/latte.md index 9d4d12dd4e02..c8438c668a44 100644 --- a/docs/source/en/api/pipelines/latte.md +++ b/docs/source/en/api/pipelines/latte.md @@ -26,11 +26,8 @@ The abstract from the paper is: This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ### Inference diff --git a/docs/source/en/api/pipelines/ledits_pp.md b/docs/source/en/api/pipelines/ledits_pp.md index 7c08971aa8d9..103bcf379890 100644 --- a/docs/source/en/api/pipelines/ledits_pp.md +++ b/docs/source/en/api/pipelines/ledits_pp.md @@ -22,16 +22,12 @@ The abstract from the paper is: *Text-to-image diffusion models have recently received increasing interest for their astonishing ability to produce high-fidelity images from solely text inputs. Subsequent research efforts aim to exploit and apply their capabilities to real image editing. However, existing image-to-image methods are often inefficient, imprecise, and of limited versatility. They either require time-consuming fine-tuning, deviate unnecessarily strongly from the input image, and/or lack support for multiple, simultaneous edits. To address these issues, we introduce LEDITS++, an efficient yet versatile and precise textual image manipulation technique. LEDITS++'s novel inversion approach requires no tuning nor optimization and produces high-fidelity results with a few diffusion steps. 
Second, our methodology supports multiple simultaneous edits and is architecture-agnostic. Third, we use a novel implicit masking technique that limits changes to relevant image regions. We propose the novel TEdBench++ benchmark as part of our exhaustive evaluation. Our results demonstrate the capabilities of LEDITS++ and its improvements over previous methods. The project page is available at https://leditsplusplus-project.static.hf.space .* - +> [!TIP] +> You can find additional information about LEDITS++ on the [project page](https://leditsplusplus-project.static.hf.space/index.html) and try it out in a [demo](https://huggingface.co/spaces/editing-images/leditsplusplus). -You can find additional information about LEDITS++ on the [project page](https://leditsplusplus-project.static.hf.space/index.html) and try it out in a [demo](https://huggingface.co/spaces/editing-images/leditsplusplus). - - - - -Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion. -This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp). - +> [!WARNING] +> Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion. +> This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp). We provide two distinct pipelines based on different pre-trained models. diff --git a/docs/source/en/api/pipelines/lumina.md b/docs/source/en/api/pipelines/lumina.md index 3bd3d9f8e07c..0a236d213d6c 100644 --- a/docs/source/en/api/pipelines/lumina.md +++ b/docs/source/en/api/pipelines/lumina.md @@ -45,11 +45,8 @@ Lumina-T2X has the following components: This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
### Inference (Text-to-Image) diff --git a/docs/source/en/api/pipelines/lumina2.md b/docs/source/en/api/pipelines/lumina2.md index 092d7cde2ebb..0c4e793404fe 100644 --- a/docs/source/en/api/pipelines/lumina2.md +++ b/docs/source/en/api/pipelines/lumina2.md @@ -24,11 +24,8 @@ The abstract from the paper is: *We introduce Lumina-Image 2.0, an advanced text-to-image model that surpasses previous state-of-the-art methods across multiple benchmarks, while also shedding light on its potential to evolve into a generalist vision intelligence model. Lumina-Image 2.0 exhibits three key properties: (1) Unification – it adopts a unified architecture that treats text and image tokens as a joint sequence, enabling natural cross-modal interactions and facilitating task expansion. Besides, since high-quality captioners can provide semantically better-aligned text-image training pairs, we introduce a unified captioning system, UniCaptioner, which generates comprehensive and precise captions for the model. This not only accelerates model convergence but also enhances prompt adherence, variable-length prompt handling, and task generalization via prompt templates. (2) Efficiency – to improve the efficiency of the unified architecture, we develop a set of optimization techniques that improve semantic learning and fine-grained texture generation during training while incorporating inference-time acceleration strategies without compromising image quality. (3) Transparency – we open-source all training details, code, and models to ensure full reproducibility, aiming to bridge the gap between well-resourced closed-source research teams and independent developers.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## Using Single File loading with Lumina Image 2.0 diff --git a/docs/source/en/api/pipelines/marigold.md b/docs/source/en/api/pipelines/marigold.md index e9ca0df067ba..81e103afeb64 100644 --- a/docs/source/en/api/pipelines/marigold.md +++ b/docs/source/en/api/pipelines/marigold.md @@ -45,14 +45,11 @@ This work expanded Marigold to support new modalities such as **Surface Normals* (IID), introduced a training protocol for **Latent Consistency Models** (LCM), and demonstrated **High-Resolution** (HR) processing capability. - - -The early Marigold models (`v1-0` and earlier) were optimized for best results with at least 10 inference steps. -LCM models were later developed to enable high-quality inference in just 1 to 4 steps. -Marigold models `v1-1` and later use the DDIM scheduler to achieve optimal -results in as few as 1 to 4 steps. - - +> [!TIP] +> The early Marigold models (`v1-0` and earlier) were optimized for best results with at least 10 inference steps. +> LCM models were later developed to enable high-quality inference in just 1 to 4 steps. +> Marigold models `v1-1` and later use the DDIM scheduler to achieve optimal +> results in as few as 1 to 4 steps. 
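As a rough sketch of few-step inference with a `v1-1` depth checkpoint (the model id, `fp16` variant, and input image URL are illustrative):

```python
import torch
import diffusers
from diffusers.utils import load_image

# Illustrative checkpoint id and variant; any Marigold v1-1 or LCM depth checkpoint behaves similarly.
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
    "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")

# 1-4 steps is enough for v1-1 / LCM checkpoints; leaving num_inference_steps unset
# falls back to the default stored in the checkpoint.
depth = pipe(image, num_inference_steps=4)

vis = pipe.image_processor.visualize_depth(depth.prediction)
vis[0].save("einstein_depth.png")
```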
## Available Pipelines @@ -80,27 +77,21 @@ The following is a summary of the recommended checkpoints, all of which produce | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity. | | [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1) | Intrinsics | HyperSim decomposition of an image  \\(I\\)  is comprised of Albedo  \\(A\\), Diffuse shading  \\(S\\), and Non-diffuse residual  \\(R\\):  \\(I = A*S+R\\). | - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff -between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to -efficiently load the same components into multiple pipelines. -Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section -[here](../../using-diffusers/svd#reduce-memory-usage). - - - - - -Marigold pipelines were designed and tested with the scheduler embedded in the model checkpoint. -The optimal number of inference steps varies by scheduler, with no universal value that works best across all cases. -To accommodate this, the `num_inference_steps` parameter in the pipeline's `__call__` method defaults to `None` (see the -API reference). -Unless set explicitly, it inherits the value from the `default_denoising_steps` field in the checkpoint configuration -file (`model_index.json`). -This ensures high-quality predictions when invoking the pipeline with only the `image` argument. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff +> between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to +> efficiently load the same components into multiple pipelines. +> Also, to know more about reducing the memory usage of this pipeline, refer to the ["Reduce memory usage"] section +> [here](../../using-diffusers/svd#reduce-memory-usage). + +> [!WARNING] +> Marigold pipelines were designed and tested with the scheduler embedded in the model checkpoint. +> The optimal number of inference steps varies by scheduler, with no universal value that works best across all cases. +> To accommodate this, the `num_inference_steps` parameter in the pipeline's `__call__` method defaults to `None` (see the +> API reference). +> Unless set explicitly, it inherits the value from the `default_denoising_steps` field in the checkpoint configuration +> file (`model_index.json`). +> This ensures high-quality predictions when invoking the pipeline with only the `image` argument. See also Marigold [usage examples](../../using-diffusers/marigold_usage). diff --git a/docs/source/en/api/pipelines/mochi.md b/docs/source/en/api/pipelines/mochi.md index f1260b07b077..f19a9bd575c1 100644 --- a/docs/source/en/api/pipelines/mochi.md +++ b/docs/source/en/api/pipelines/mochi.md @@ -121,15 +121,13 @@ export_to_video(frames, "mochi.mp4", fps=30) The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. 
The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the original implementation, please refer to the following example. - -The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder. - -When enabling `force_zeros_for_empty_prompt`, it is recommended to run the text encoding step outside the autocast context in full precision. - +> [!TIP] +> The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder. +> +> When enabling `force_zeros_for_empty_prompt`, it is recommended to run the text encoding step outside the autocast context in full precision. - -Decoding the latents in full precision is very memory intensive. You will need at least 70GB VRAM to generate the 163 frames in this example. To reduce memory, either reduce the number of frames or run the decoding step in `torch.bfloat16`. - +> [!TIP] +> Decoding the latents in full precision is very memory intensive. You will need at least 70GB VRAM to generate the 163 frames in this example. To reduce memory, either reduce the number of frames or run the decoding step in `torch.bfloat16`. ```python import torch @@ -231,9 +229,8 @@ export_to_video(frames, "output.mp4", fps=30) You can use `from_single_file` to load the Mochi transformer in its original format. - -Diffusers currently doesn't support using the FP8 scaled versions of the Mochi single file checkpoints. - +> [!TIP] +> Diffusers currently doesn't support using the FP8 scaled versions of the Mochi single file checkpoints. ```python import torch diff --git a/docs/source/en/api/pipelines/musicldm.md b/docs/source/en/api/pipelines/musicldm.md index c2297162f737..1a83e5932ed4 100644 --- a/docs/source/en/api/pipelines/musicldm.md +++ b/docs/source/en/api/pipelines/musicldm.md @@ -43,11 +43,8 @@ During inference: * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1 to enable. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly. * The _length_ of the generated audio sample can be controlled by varying the `audio_length_in_s` argument. - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
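A minimal sketch of how the `audio_length_in_s` and `num_waveforms_per_prompt` arguments described above are used; the checkpoint id and the 16 kHz sampling rate are assumptions:

```python
import scipy
import torch
from diffusers import MusicLDMPipeline

# Assumed checkpoint id; adjust to the MusicLDM weights you are using.
pipe = MusicLDMPipeline.from_pretrained("ucsd-reach/musicldm", torch_dtype=torch.float16).to("cuda")

audios = pipe(
    "techno beat with warm synth pads",
    num_inference_steps=200,
    audio_length_in_s=10.0,        # length of each generated clip in seconds
    num_waveforms_per_prompt=3,    # candidates are automatically ranked best to worst
).audios

# The first waveform is the highest ranked; 16 kHz is assumed for the output rate.
scipy.io.wavfile.write("musicldm.wav", rate=16000, data=audios[0])
```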
## MusicLDMPipeline [[autodoc]] MusicLDMPipeline diff --git a/docs/source/en/api/pipelines/omnigen.md b/docs/source/en/api/pipelines/omnigen.md index 074e7b8f0115..4fac5c789a25 100644 --- a/docs/source/en/api/pipelines/omnigen.md +++ b/docs/source/en/api/pipelines/omnigen.md @@ -21,11 +21,8 @@ The abstract from the paper is: *The emergence of Large Language Models (LLMs) has unified language generation tasks and revolutionized human-machine interaction. However, in the realm of image generation, a unified model capable of handling various tasks within a single framework remains largely unexplored. In this work, we introduce OmniGen, a new diffusion model for unified image generation. OmniGen is characterized by the following features: 1) Unification: OmniGen not only demonstrates text-to-image generation capabilities but also inherently supports various downstream tasks, such as image editing, subject-driven generation, and visual conditional generation. 2) Simplicity: The architecture of OmniGen is highly simplified, eliminating the need for additional plugins. Moreover, compared to existing diffusion models, it is more user-friendly and can complete complex tasks end-to-end through instructions without the need for extra intermediate steps, greatly simplifying the image generation workflow. 3) Knowledge Transfer: Benefit from learning in a unified format, OmniGen effectively transfers knowledge across different tasks, manages unseen tasks and domains, and exhibits novel capabilities. We also explore the model’s reasoning capabilities and potential applications of the chain-of-thought mechanism. This work represents the first attempt at a general-purpose image generation model, and we will release our resources at https://github.com/VectorSpaceLab/OmniGen to foster future advancements.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. This pipeline was contributed by [staoxiao](https://github.com/staoxiao). The original codebase can be found [here](https://github.com/VectorSpaceLab/OmniGen). The original weights can be found under [hf.co/shitao](https://huggingface.co/Shitao/OmniGen-v1). diff --git a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md index d3cc318a5459..ce883931df21 100644 --- a/docs/source/en/api/pipelines/overview.md +++ b/docs/source/en/api/pipelines/overview.md @@ -16,15 +16,12 @@ Pipelines provide a simple way to run state-of-the-art diffusion models in infer All pipelines are built from the base [`DiffusionPipeline`] class which provides basic functionality for loading, downloading, and saving all the components. Specific pipeline types (for example [`StableDiffusionPipeline`]) loaded with [`~DiffusionPipeline.from_pretrained`] are automatically detected and the pipeline components are loaded and passed to the `__init__` function of the pipeline. 
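For example, loading a Stable Diffusion checkpoint through the base class returns the concrete pipeline type; the checkpoint id below is only an example:

```python
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
print(type(pipe).__name__)  # StableDiffusionPipeline, detected from the repo's model_index.json
```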
- - -You shouldn't use the [`DiffusionPipeline`] class for training. Individual components (for example, [`UNet2DModel`] and [`UNet2DConditionModel`]) of diffusion pipelines are usually trained individually, so we suggest directly working with them instead. -
- -Pipelines do not offer any training functionality. You'll notice PyTorch's autograd is disabled by decorating the [`~DiffusionPipeline.__call__`] method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should not be used for training. If you're interested in training, please take a look at the [Training](../../training/overview) guides instead! -
+> [!WARNING] +> You shouldn't use the [`DiffusionPipeline`] class for training. Individual components (for example, [`UNet2DModel`] and [`UNet2DConditionModel`]) of diffusion pipelines are usually trained individually, so we suggest directly working with them instead. +> +>
+> +> Pipelines do not offer any training functionality. You'll notice PyTorch's autograd is disabled by decorating the [`~DiffusionPipeline.__call__`] method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should not be used for training. If you're interested in training, please take a look at the [Training](../../training/overview) guides instead! The table below lists all the pipelines currently available in 🤗 Diffusers and the tasks they support. Click on a pipeline to view its abstract and published paper. diff --git a/docs/source/en/api/pipelines/pag.md b/docs/source/en/api/pipelines/pag.md index 7b87e58a87e2..35004b6ad39c 100644 --- a/docs/source/en/api/pipelines/pag.md +++ b/docs/source/en/api/pipelines/pag.md @@ -31,11 +31,8 @@ PAG can be used by specifying the `pag_applied_layers` as a parameter when insta - Partial identifier as a RegEx: `down_blocks.2`, or `attn1` - List of identifiers (can be combo of strings and ReGex): `["blocks.1", "blocks.(14|20)", r"down_blocks\.(2,3)"]` - - -Since RegEx is supported as a way for matching layer identifiers, it is crucial to use it correctly otherwise there might be unexpected behaviour. The recommended way to use PAG is by specifying layers as `blocks.{layer_index}` and `blocks.({layer_index_1|layer_index_2|...})`. Using it in any other way, while doable, may bypass our basic validation checks and give you unexpected results. - - +> [!WARNING] +> Since RegEx is supported as a way for matching layer identifiers, it is crucial to use it correctly otherwise there might be unexpected behaviour. The recommended way to use PAG is by specifying layers as `blocks.{layer_index}` and `blocks.({layer_index_1|layer_index_2|...})`. Using it in any other way, while doable, may bypass our basic validation checks and give you unexpected results. ## AnimateDiffPAGPipeline [[autodoc]] AnimateDiffPAGPipeline diff --git a/docs/source/en/api/pipelines/paint_by_example.md b/docs/source/en/api/pipelines/paint_by_example.md index 362c26de68a4..02bf6db7265d 100644 --- a/docs/source/en/api/pipelines/paint_by_example.md +++ b/docs/source/en/api/pipelines/paint_by_example.md @@ -27,11 +27,8 @@ The original codebase can be found at [Fantasy-Studio/Paint-by-Example](https:// Paint by Example is supported by the official [Fantasy-Studio/Paint-by-Example](https://huggingface.co/Fantasy-Studio/Paint-by-Example) checkpoint. The checkpoint is warm-started from [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) to inpaint partly masked images conditioned on example and reference images. - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
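A short sketch of example-guided inpainting with that checkpoint; the image URLs are placeholders for your own source image, mask, and reference image:

```python
import torch
from diffusers import PaintByExamplePipeline
from diffusers.utils import load_image

pipe = PaintByExamplePipeline.from_pretrained(
    "Fantasy-Studio/Paint-by-Example", torch_dtype=torch.float16
).to("cuda")

init_image = load_image("https://example.com/room.png")       # image to edit
mask_image = load_image("https://example.com/room_mask.png")  # white pixels mark the region to replace
example_image = load_image("https://example.com/sofa.png")    # reference for what to paint into the mask

image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
image.save("paint_by_example.png")
```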
## PaintByExamplePipeline [[autodoc]] PaintByExamplePipeline diff --git a/docs/source/en/api/pipelines/panorama.md b/docs/source/en/api/pipelines/panorama.md index 9f61388dd57a..b65e05dd0b51 100644 --- a/docs/source/en/api/pipelines/panorama.md +++ b/docs/source/en/api/pipelines/panorama.md @@ -42,11 +42,8 @@ For example, without circular padding, there is a stitching artifact (default): But with circular padding, the right and the left parts are matching (`circular_padding=True`): ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/indoor_%20circular_padding.png) - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionPanoramaPipeline [[autodoc]] StableDiffusionPanoramaPipeline diff --git a/docs/source/en/api/pipelines/pia.md b/docs/source/en/api/pipelines/pia.md index 7bd480b49a75..eebfa4d4f8a6 100644 --- a/docs/source/en/api/pipelines/pia.md +++ b/docs/source/en/api/pipelines/pia.md @@ -87,11 +87,8 @@ Here are some sample outputs:
- - -If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the PIA checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`. - - +> [!TIP] +> If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the PIA checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`. ## Using FreeInit @@ -149,11 +146,8 @@ export_to_gif(frames, "pia-freeinit-animation.gif") - - -FreeInit is not really free - the improved quality comes at the cost of extra computation. It requires sampling a few extra times depending on the `num_iters` parameter that is set when enabling it. Setting the `use_fast_sampling` parameter to `True` can improve the overall performance (at the cost of lower quality compared to when `use_fast_sampling=False` but still better results than vanilla video generation models). - - +> [!WARNING] +> FreeInit is not really free - the improved quality comes at the cost of extra computation. It requires sampling a few extra times depending on the `num_iters` parameter that is set when enabling it. Setting the `use_fast_sampling` parameter to `True` can improve the overall performance (at the cost of lower quality compared to when `use_fast_sampling=False` but still better results than vanilla video generation models). ## PIAPipeline diff --git a/docs/source/en/api/pipelines/pix2pix.md b/docs/source/en/api/pipelines/pix2pix.md index 20a74577c164..84eb0cb5e5d3 100644 --- a/docs/source/en/api/pipelines/pix2pix.md +++ b/docs/source/en/api/pipelines/pix2pix.md @@ -24,11 +24,8 @@ The abstract from the paper is: You can find additional information about InstructPix2Pix on the [project page](https://www.timothybrooks.com/instruct-pix2pix), [original codebase](https://github.com/timothybrooks/instruct-pix2pix), and try it out in a [demo](https://huggingface.co/spaces/timbrooks/instruct-pix2pix). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionInstructPix2PixPipeline [[autodoc]] StableDiffusionInstructPix2PixPipeline diff --git a/docs/source/en/api/pipelines/pixart.md b/docs/source/en/api/pipelines/pixart.md index a36a2a4b7a96..dbdc89857e5e 100644 --- a/docs/source/en/api/pipelines/pixart.md +++ b/docs/source/en/api/pipelines/pixart.md @@ -29,11 +29,8 @@ Some notes about this pipeline: * It is good at producing high-resolution images at different aspect ratios. 
To get the best results, the authors recommend some size brackets which can be found [here](https://github.com/PixArt-alpha/PixArt-alpha/blob/08fbbd281ec96866109bdd2cdb75f2f58fb17610/diffusion/data/datasets/utils.py). * It rivals the quality of state-of-the-art text-to-image generation systems (as of this writing) such as Stable Diffusion XL, Imagen, and DALL-E 2, while being more efficient than them. - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## Inference with under 8GB GPU VRAM @@ -112,11 +109,8 @@ del pipe.transformer flush() ``` - - -Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. - - +> [!TIP] +> Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. Once the latents are computed, pass it off to the VAE to decode into a real image: @@ -133,11 +127,8 @@ By deleting components you aren't using and flushing the GPU VRAM, you should be If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e). - - -Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit. - - +> [!WARNING] +> Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit. While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB. diff --git a/docs/source/en/api/pipelines/pixart_sigma.md b/docs/source/en/api/pipelines/pixart_sigma.md index dded4ea2d771..06b54de43bbc 100644 --- a/docs/source/en/api/pipelines/pixart_sigma.md +++ b/docs/source/en/api/pipelines/pixart_sigma.md @@ -31,17 +31,11 @@ Some notes about this pipeline: * It shows the ability of generating super high resolution images, such as 2048px or even 4K. * It shows that text-to-image models can grow from a weak model to a stronger one through several improvements (VAEs, datasets, and so on.) - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - - - - -You can further improve generation quality by passing the generated image from [`PixArtSigmaPipeline`] to the [SDXL refiner](../../using-diffusers/sdxl#base-to-refiner-model) model. - - +> [!TIP] +> You can further improve generation quality by passing the generated image from [`PixArtSigmaPipeline`] to the [SDXL refiner](../../using-diffusers/sdxl#base-to-refiner-model) model. ## Inference with under 8GB GPU VRAM @@ -119,11 +113,8 @@ del pipe.transformer flush() ``` - - -Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. - - +> [!TIP] +> Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. Once the latents are computed, pass it off to the VAE to decode into a real image: @@ -140,11 +131,8 @@ By deleting components you aren't using and flushing the GPU VRAM, you should be If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e). - - -Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit. - - +> [!WARNING] +> Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit. While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB. diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md index 4c999bca35f9..27cc4802f601 100644 --- a/docs/source/en/api/pipelines/qwenimage.md +++ b/docs/source/en/api/pipelines/qwenimage.md @@ -28,11 +28,8 @@ Qwen-Image comes in the following variants: | Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) | | Qwen-Image-Edit Plus | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) | - - -[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs. - - +> [!TIP] +> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs. ## LoRA for faster inference @@ -91,11 +88,8 @@ image.save("qwen_fewsteps.png") - - -The `guidance_scale` parameter in the pipeline is there to support future guidance-distilled models when they come up. Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should enable classifier-free guidance computations. - - +> [!TIP] +> The `guidance_scale` parameter in the pipeline is there to support future guidance-distilled models when they come up. Note that passing `guidance_scale` to the pipeline is ineffective. 
To enable classifier-free guidance, please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should enable classifier-free guidance computations. ## Multi-image reference with QwenImageEditPlusPipeline diff --git a/docs/source/en/api/pipelines/sana.md b/docs/source/en/api/pipelines/sana.md index 7491689fd83d..a948620f96cb 100644 --- a/docs/source/en/api/pipelines/sana.md +++ b/docs/source/en/api/pipelines/sana.md @@ -25,11 +25,8 @@ The abstract from the paper is: *We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096×4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4) Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024×1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. This pipeline was contributed by [lawrence-cj](https://github.com/lawrence-cj) and [chenjy2003](https://github.com/chenjy2003). The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model](https://huggingface.co/Efficient-Large-Model). @@ -49,11 +46,8 @@ Refer to [this](https://huggingface.co/collections/Efficient-Large-Model/sana-67 Note: The recommended dtype mentioned is for the transformer weights. The text encoder and VAE weights must stay in `torch.bfloat16` or `torch.float32` for the model to work correctly. Please refer to the inference example below to see how to load the model with the recommended dtype. - - -Make sure to pass the `variant` argument for downloaded checkpoints to use lower disk space. Set it to `"fp16"` for models with recommended dtype as `torch.float16`, and `"bf16"` for models with recommended dtype as `torch.bfloat16`. 
By default, `torch.float32` weights are downloaded, which use twice the amount of disk storage. Additionally, `torch.float32` weights can be downcasted on-the-fly by specifying the `torch_dtype` argument. Read about it in the [docs](https://huggingface.co/docs/diffusers/v0.31.0/en/api/pipelines/overview#diffusers.DiffusionPipeline.from_pretrained). - - +> [!TIP] +> Make sure to pass the `variant` argument for downloaded checkpoints to use lower disk space. Set it to `"fp16"` for models with recommended dtype as `torch.float16`, and `"bf16"` for models with recommended dtype as `torch.bfloat16`. By default, `torch.float32` weights are downloaded, which use twice the amount of disk storage. Additionally, `torch.float32` weights can be downcasted on-the-fly by specifying the `torch_dtype` argument. Read about it in the [docs](https://huggingface.co/docs/diffusers/v0.31.0/en/api/pipelines/overview#diffusers.DiffusionPipeline.from_pretrained). ## Quantization diff --git a/docs/source/en/api/pipelines/sana_sprint.md b/docs/source/en/api/pipelines/sana_sprint.md index 93ab9fe418c1..357d7e406dd4 100644 --- a/docs/source/en/api/pipelines/sana_sprint.md +++ b/docs/source/en/api/pipelines/sana_sprint.md @@ -24,11 +24,8 @@ The abstract from the paper is: *This paper presents SANA-Sprint, an efficient diffusion model for ultra-fast text-to-image (T2I) generation. SANA-Sprint is built on a pre-trained foundation model and augmented with hybrid distillation, dramatically reducing inference steps from 20 to 1-4. We introduce three key innovations: (1) We propose a training-free approach that transforms a pre-trained flow-matching model for continuous-time consistency distillation (sCM), eliminating costly training from scratch and achieving high training efficiency. Our hybrid distillation strategy combines sCM with latent adversarial distillation (LADD): sCM ensures alignment with the teacher model, while LADD enhances single-step generation fidelity. (2) SANA-Sprint is a unified step-adaptive model that achieves high-quality generation in 1-4 steps, eliminating step-specific training and improving efficiency. (3) We integrate ControlNet with SANA-Sprint for real-time interactive image generation, enabling instant visual feedback for user interaction. SANA-Sprint establishes a new Pareto frontier in speed-quality tradeoffs, achieving state-of-the-art performance with 7.59 FID and 0.74 GenEval in only 1 step — outperforming FLUX-schnell (7.94 FID / 0.71 GenEval) while being 10× faster (0.1s vs 1.1s on H100). It also achieves 0.1s (T2I) and 0.25s (ControlNet) latency for 1024×1024 images on H100, and 0.31s (T2I) on an RTX 4090, showcasing its exceptional efficiency and potential for AI-powered consumer applications (AIPC). Code and pre-trained models will be open-sourced.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
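As a sketch of the few-step generation described above (the checkpoint id is illustrative, and any step count between 1 and 4 works the same way):

```python
import torch
from diffusers import SanaSprintPipeline

# Illustrative checkpoint id; pick the SANA-Sprint variant that matches your resolution needs.
pipe = SanaSprintPipeline.from_pretrained(
    "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers", torch_dtype=torch.bfloat16
).to("cuda")

image = pipe(
    prompt="a tiny astronaut hatching from an egg on the moon",
    num_inference_steps=2,  # SANA-Sprint is trained for 1-4 step inference
).images[0]
image.save("sana_sprint.png")
```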
This pipeline was contributed by [lawrence-cj](https://github.com/lawrence-cj), [shuchen Xue](https://github.com/scxue) and [Enze Xie](https://github.com/xieenze). The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model](https://huggingface.co/Efficient-Large-Model/). diff --git a/docs/source/en/api/pipelines/self_attention_guidance.md b/docs/source/en/api/pipelines/self_attention_guidance.md index 5578fdfa637d..8d411598ae6d 100644 --- a/docs/source/en/api/pipelines/self_attention_guidance.md +++ b/docs/source/en/api/pipelines/self_attention_guidance.md @@ -23,11 +23,8 @@ The abstract from the paper is: You can find additional information about Self-Attention Guidance on the [project page](https://ku-cvlab.github.io/Self-Attention-Guidance), [original codebase](https://github.com/KU-CVLAB/Self-Attention-Guidance), and try it out in a [demo](https://huggingface.co/spaces/susunghong/Self-Attention-Guidance) or [notebook](https://colab.research.google.com/github/SusungHong/Self-Attention-Guidance/blob/main/SAG_Stable.ipynb). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## StableDiffusionSAGPipeline [[autodoc]] StableDiffusionSAGPipeline diff --git a/docs/source/en/api/pipelines/semantic_stable_diffusion.md b/docs/source/en/api/pipelines/semantic_stable_diffusion.md index 1ce44cf2de79..dda428e80f8f 100644 --- a/docs/source/en/api/pipelines/semantic_stable_diffusion.md +++ b/docs/source/en/api/pipelines/semantic_stable_diffusion.md @@ -22,11 +22,8 @@ The abstract from the paper is: *Text-to-image diffusion models have recently received a lot of interest for their astonishing ability to produce high-fidelity images from text only. However, achieving one-shot generation that aligns with the user's intent is nearly impossible, yet small changes to the input prompt often result in very different images. This leaves the user with little semantic control. To put the user in control, we show how to interact with the diffusion process to flexibly steer it along semantic directions. This semantic guidance (SEGA) generalizes to any generative architecture using classifier-free guidance. More importantly, it allows for subtle and extensive edits, changes in composition and style, as well as optimizing the overall artistic conception. 
We demonstrate SEGA's effectiveness on both latent and pixel-based diffusion models such as Stable Diffusion, Paella, and DeepFloyd-IF using a variety of tasks, thus providing strong evidence for its versatility, flexibility, and improvements over existing methods.* - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## SemanticStableDiffusionPipeline [[autodoc]] SemanticStableDiffusionPipeline diff --git a/docs/source/en/api/pipelines/shap_e.md b/docs/source/en/api/pipelines/shap_e.md index 5e5af0656a63..3e505894ca80 100644 --- a/docs/source/en/api/pipelines/shap_e.md +++ b/docs/source/en/api/pipelines/shap_e.md @@ -17,11 +17,8 @@ The abstract from the paper is: The original codebase can be found at [openai/shap-e](https://github.com/openai/shap-e). - - -See the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> See the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## ShapEPipeline [[autodoc]] ShapEPipeline diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index b47f9ae3372b..70de6776e98f 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -41,15 +41,12 @@ The Stage C model operates on the small 24 x 24 latents and denoises the latents The Stage B and Stage A models are used with the `StableCascadeDecoderPipeline` and are responsible for generating the final image given the small 24 x 24 latents. - - -There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead. - -In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. - -If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`. - - +> [!WARNING] +> There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead. 
+> +> In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. +> +> If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`. ## Usage example diff --git a/docs/source/en/api/pipelines/stable_diffusion/depth2img.md b/docs/source/en/api/pipelines/stable_diffusion/depth2img.md index e198eaa9524d..aa43cf7db903 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/depth2img.md +++ b/docs/source/en/api/pipelines/stable_diffusion/depth2img.md @@ -18,13 +18,10 @@ specific language governing permissions and limitations under the License. The Stable Diffusion model can also infer depth based on an image using [MiDaS](https://github.com/isl-org/MiDaS). This allows you to pass a text prompt and an initial image to condition the generation of new images as well as a `depth_map` to preserve the image structure. - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - -If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! +> +> If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! ## StableDiffusionDepth2ImgPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/gligen.md b/docs/source/en/api/pipelines/stable_diffusion/gligen.md index e9704fc1de4c..c8297fb7b3de 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/gligen.md +++ b/docs/source/en/api/pipelines/stable_diffusion/gligen.md @@ -21,13 +21,10 @@ The abstract from the [paper](https://huggingface.co/papers/2301.07093) is: *Large-scale text-to-image diffusion models have made amazing advances. However, the status quo is to use text input alone, which can impede controllability. In this work, we propose GLIGEN, Grounded-Language-to-Image Generation, a novel approach that builds upon and extends the functionality of existing pre-trained text-to-image diffusion models by enabling them to also be conditioned on grounding inputs. To preserve the vast concept knowledge of the pre-trained model, we freeze all of its weights and inject the grounding information into new trainable layers via a gated mechanism. Our model achieves open-world grounded text2img generation with caption and bounding box condition inputs, and the grounding ability generalizes well to novel spatial configurations and concepts. 
GLIGEN’s zeroshot performance on COCO and LVIS outperforms existing supervised layout-to-image baselines by a large margin.* - - -Make sure to check out the Stable Diffusion [Tips](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality and how to reuse pipeline components efficiently! - -If you want to use one of the official checkpoints for a task, explore the [gligen](https://huggingface.co/gligen) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality and how to reuse pipeline components efficiently! +> +> If you want to use one of the official checkpoints for a task, explore the [gligen](https://huggingface.co/gligen) Hub organizations! [`StableDiffusionGLIGENPipeline`] was contributed by [Nikhil Gajendrakumar](https://github.com/nikhil-masterful) and [`StableDiffusionGLIGENTextImagePipeline`] was contributed by [Nguyễn Công Tú Anh](https://github.com/tuanh123789). diff --git a/docs/source/en/api/pipelines/stable_diffusion/image_variation.md b/docs/source/en/api/pipelines/stable_diffusion/image_variation.md index 7a50971fdfa7..b1b7146b336f 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/image_variation.md +++ b/docs/source/en/api/pipelines/stable_diffusion/image_variation.md @@ -16,11 +16,8 @@ The Stable Diffusion model can also generate variations from an input image. It The original codebase can be found at [LambdaLabsML/lambda-diffusers](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) and additional official checkpoints for image variation can be found at [lambdalabs/sd-image-variations-diffusers](https://huggingface.co/lambdalabs/sd-image-variations-diffusers). - - -Make sure to check out the Stable Diffusion [Tips](./overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](./overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! ## StableDiffusionImageVariationPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/img2img.md b/docs/source/en/api/pipelines/stable_diffusion/img2img.md index bec67b4f4e89..f9e4476427de 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/img2img.md +++ b/docs/source/en/api/pipelines/stable_diffusion/img2img.md @@ -24,11 +24,8 @@ The abstract from the paper is: *Guided image synthesis enables everyday users to create and edit photo-realistic images with minimum effort. The key challenge is balancing faithfulness to the user input (e.g., hand-drawn colored strokes) and realism of the synthesized image. Existing GAN-based methods attempt to achieve such balance using either conditional GANs or GAN inversions, which are challenging and often require additional training data or loss functions for individual applications. To address these issues, we introduce a new image synthesis and editing method, Stochastic Differential Editing (SDEdit), based on a diffusion model generative prior, which synthesizes realistic images by iteratively denoising through a stochastic differential equation (SDE). 
Given an input image with user guide of any type, SDEdit first adds noise to the input, then subsequently denoises the resulting image through the SDE prior to increase its realism. SDEdit does not require task-specific training or inversions and can naturally achieve the balance between realism and faithfulness. SDEdit significantly outperforms state-of-the-art GAN-based methods by up to 98.09% on realism and 91.72% on overall satisfaction scores, according to a human perception study, on multiple tasks, including stroke-based image synthesis and editing as well as image compositing.* - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! ## StableDiffusionImg2ImgPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/inpaint.md b/docs/source/en/api/pipelines/stable_diffusion/inpaint.md index 0b558b2fc01d..84cc31e15897 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/inpaint.md +++ b/docs/source/en/api/pipelines/stable_diffusion/inpaint.md @@ -25,13 +25,10 @@ as [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable text-to-image Stable Diffusion checkpoints, such as [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) are also compatible but they might be less performant. - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - -If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! +> +> If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! ## StableDiffusionInpaintPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.md b/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.md index d5a15cb002cf..4f0521740cab 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.md +++ b/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.md @@ -14,13 +14,10 @@ specific language governing permissions and limitations under the License. The Stable Diffusion latent upscaler model was created by [Katherine Crowson](https://github.com/crowsonkb/k-diffusion) in collaboration with [Stability AI](https://stability.ai/). It is used to enhance the output image resolution by a factor of 2 (see this demo [notebook](https://colab.research.google.com/drive/1o1qYJcFeywzCIdkfKJy7cTpgZTCM2EI4) for a demonstration of the original implementation). 
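A minimal sketch of chaining the latent upscaler after a base Stable Diffusion pipeline (the checkpoint ids, seed, and step counts below are illustrative assumptions):

```py
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionLatentUpscalePipeline

base = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
    "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
).to("cuda")

prompt = "a photo of an astronaut riding a horse on mars, highly detailed"
generator = torch.manual_seed(33)

# Keep the base output in latent space so the upscaler can consume it directly.
low_res_latents = base(prompt, generator=generator, output_type="latent").images

upscaled_image = upscaler(
    prompt=prompt,
    image=low_res_latents,
    num_inference_steps=20,
    guidance_scale=0,
    generator=generator,
).images[0]
upscaled_image.save("astronaut_2x.png")
```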
- - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - -If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! +> +> If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! ## StableDiffusionLatentUpscalePipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md index 4c52ed90f0e3..15f9f1db851f 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md +++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md @@ -30,11 +30,8 @@ The abstract from the paper is: *This research paper proposes a Latent Diffusion Model for 3D (LDM3D) that generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. The LDM3D model is fine-tuned on a dataset of tuples containing an RGB image, depth map and caption, and validated through extensive experiments. We also develop an application called DepthFusion, which uses the generated RGB images and depth maps to create immersive and interactive 360-degree-view experiences using TouchDesigner. This technology has the potential to transform a wide range of industries, from entertainment and gaming to architecture and design. Overall, this paper presents a significant contribution to the field of generative AI and computer vision, and showcases the potential of LDM3D and DepthFusion to revolutionize content creation and digital experiences. A short video summarizing the approach can be found at [this url](https://t.ly/tdi2).* - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! ## StableDiffusionLDM3DPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md index aac3a3d870c8..7964db4c9d7e 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md +++ b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md @@ -26,10 +26,7 @@ The abstract from the paper is: - SDXL Turbo has been trained to generate images of size 512x512. - SDXL Turbo is open-access, but not open-source meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more. 
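As a quick sketch of the points above, single-step generation at the native 512x512 resolution with guidance disabled looks roughly like this (the prompt and output path are placeholders):

```py
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# SDXL Turbo is distilled for very few steps and is used without classifier-free guidance.
image = pipe(
    prompt="A cinematic shot of a baby racoon wearing an intricate italian priest robe.",
    num_inference_steps=1,
    guidance_scale=0.0,
    height=512,
    width=512,
).images[0]
image.save("sdxl_turbo.png")
```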
- - -To learn how to use SDXL Turbo for various tasks, how to optimize performance, and other usage examples, take a look at the [SDXL Turbo](../../../using-diffusers/sdxl_turbo) guide. - -Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints! - - +> [!TIP] +> To learn how to use SDXL Turbo for various tasks, how to optimize performance, and other usage examples, take a look at the [SDXL Turbo](../../../using-diffusers/sdxl_turbo) guide. +> +> Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints! diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.md b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.md index 89e9f5305e39..67729cd195ca 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.md +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.md @@ -33,13 +33,10 @@ Stable Diffusion 2 is available for tasks like text-to-image, inpainting, super- Here are some examples for how to use Stable Diffusion 2 for each task: - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - -If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! +> +> If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! ## Text-to-image diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md index 211b26889aff..3c49df101c1e 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md @@ -34,11 +34,8 @@ Use the command below to log in: hf auth login ``` - - -The SD3 pipeline uses three text encoders to generate an image. Model offloading is necessary in order for it to run on most commodity hardware. Please use the `torch.float16` data type for additional memory savings. - - +> [!TIP] +> The SD3 pipeline uses three text encoders to generate an image. Model offloading is necessary in order for it to run on most commodity hardware. Please use the `torch.float16` data type for additional memory savings. ```python import torch @@ -124,11 +121,8 @@ image.save("result.jpg") - - -Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work. - - +> [!TIP] +> Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work. ## Memory Optimisations for SD3 @@ -333,11 +327,8 @@ image = pipe( You can send a different prompt to the CLIP Text Encoders and the T5 Text Encoder to prevent the prompt from being truncated by the CLIP Text Encoders and to improve generation. 
- - -The prompt with the CLIP Text Encoders is still truncated to the 77 token limit. - - +> [!TIP] +> The prompt with the CLIP Text Encoders is still truncated to the 77 token limit. ```python prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus, basking in a river of melted butter amidst a breakfast-themed landscape. A river of warm, melted butter, pancake-like foliage in the background, a towering pepper mill standing in for a tree." diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_safe.md b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_safe.md index 173649110783..151b0b8a6507 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_safe.md +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_safe.md @@ -45,11 +45,8 @@ There are 4 configurations (`SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyC >>> out = pipeline(prompt=prompt, **SafetyConfig.MAX) ``` - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! ## StableDiffusionPipelineSafe diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md index 30e43790663d..6863d408b5fd 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md @@ -33,13 +33,10 @@ The abstract from the paper is: - SDXL output images can be improved by making use of a refiner model in an image-to-image setting. - SDXL offers `negative_original_size`, `negative_crops_coords_top_left`, and `negative_target_size` to negatively condition the model on image resolution and cropping parameters. - - -To learn how to use SDXL for various tasks, how to optimize performance, and other usage examples, take a look at the [Stable Diffusion XL](../../../using-diffusers/sdxl) guide. - -Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints! - - +> [!TIP] +> To learn how to use SDXL for various tasks, how to optimize performance, and other usage examples, take a look at the [Stable Diffusion XL](../../../using-diffusers/sdxl) guide. +> +> Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints! ## StableDiffusionXLPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/svd.md b/docs/source/en/api/pipelines/stable_diffusion/svd.md index ab51f9b66398..0c33c0600728 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/svd.md +++ b/docs/source/en/api/pipelines/stable_diffusion/svd.md @@ -18,15 +18,12 @@ The abstract from the paper is: *We present Stable Video Diffusion - a latent video diffusion model for high-resolution, state-of-the-art text-to-video and image-to-video generation. Recently, latent diffusion models trained for 2D image synthesis have been turned into generative video models by inserting temporal layers and finetuning them on small, high-quality video datasets. 
However, training methods in the literature vary widely, and the field has yet to agree on a unified strategy for curating video data. In this paper, we identify and evaluate three different stages for successful training of video LDMs: text-to-image pretraining, video pretraining, and high-quality video finetuning. Furthermore, we demonstrate the necessity of a well-curated pretraining dataset for generating high-quality videos and present a systematic curation process to train a strong base model, including captioning and filtering strategies. We then explore the impact of finetuning our base model on high-quality data and train a text-to-video model that is competitive with closed-source video generation. We also show that our base model provides a powerful motion representation for downstream tasks such as image-to-video generation and adaptability to camera motion-specific LoRA modules. Finally, we demonstrate that our model provides a strong multi-view 3D-prior and can serve as a base to finetune a multi-view diffusion model that jointly generates multiple views of objects in a feedforward fashion, outperforming image-based methods at a fraction of their compute budget. We release code and model weights at this https URL.* - - -To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide. - -
- -Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints! - -
+> [!TIP] +> To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide. +> +>
+> +> Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints! ## Tips diff --git a/docs/source/en/api/pipelines/stable_diffusion/text2img.md b/docs/source/en/api/pipelines/stable_diffusion/text2img.md index c17348c8ffd4..59a0f00d22e6 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/text2img.md +++ b/docs/source/en/api/pipelines/stable_diffusion/text2img.md @@ -22,13 +22,10 @@ The abstract from the paper is: *By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs. Code is available at https://github.com/CompVis/latent-diffusion.* - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - -If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! +> +> If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! ## StableDiffusionPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/upscale.md b/docs/source/en/api/pipelines/stable_diffusion/upscale.md index 411491263c63..14393370bec7 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/upscale.md +++ b/docs/source/en/api/pipelines/stable_diffusion/upscale.md @@ -18,13 +18,10 @@ specific language governing permissions and limitations under the License. 
The Stable Diffusion upscaler diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/). It is used to enhance the resolution of input images by a factor of 4. - - -Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! - -If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! - - +> [!TIP] +> Make sure to check out the Stable Diffusion [Tips](overview#tips) section to learn how to explore the tradeoff between scheduler speed and quality, and how to reuse pipeline components efficiently! +> +> If you're interested in using one of the official checkpoints for a task, explore the [CompVis](https://huggingface.co/CompVis), [Runway](https://huggingface.co/runwayml), and [Stability AI](https://huggingface.co/stabilityai) Hub organizations! ## StableDiffusionUpscalePipeline diff --git a/docs/source/en/api/pipelines/stable_unclip.md b/docs/source/en/api/pipelines/stable_unclip.md index 5abb6028c4cb..09100201bb1e 100644 --- a/docs/source/en/api/pipelines/stable_unclip.md +++ b/docs/source/en/api/pipelines/stable_unclip.md @@ -65,11 +65,8 @@ wave_prompt = "dramatic wave, the Oceans roar, Strong wave spiral across the oce image = pipe(prompt=wave_prompt).images[0] image ``` - - -For text-to-image we use `stabilityai/stable-diffusion-2-1-unclip-small` as it was trained on CLIP ViT-L/14 embedding, the same as the Karlo model prior. [stabilityai/stable-diffusion-2-1-unclip](https://hf.co/stabilityai/stable-diffusion-2-1-unclip) was trained on OpenCLIP ViT-H, so we don't recommend its use. - - +> [!WARNING] +> For text-to-image we use `stabilityai/stable-diffusion-2-1-unclip-small` as it was trained on CLIP ViT-L/14 embedding, the same as the Karlo model prior. [stabilityai/stable-diffusion-2-1-unclip](https://hf.co/stabilityai/stable-diffusion-2-1-unclip) was trained on OpenCLIP ViT-H, so we don't recommend its use. ### Text guided Image-to-Image Variation @@ -99,11 +96,8 @@ image = pipe(init_image, prompt=prompt).images[0] image ``` - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
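A minimal sketch of the 4x upscaling flow (the example image URL and input size are assumptions; any low-resolution PIL image works):

```py
import torch
from diffusers import StableDiffusionUpscalePipeline
from diffusers.utils import load_image

pipe = StableDiffusionUpscalePipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16
).to("cuda")

# A 128x128 input becomes a 512x512 output.
low_res_img = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
).resize((128, 128))

image = pipe(prompt="a white cat", image=low_res_img).images[0]
image.save("upscaled_cat.png")
```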
## StableUnCLIPPipeline diff --git a/docs/source/en/api/pipelines/text_to_video.md b/docs/source/en/api/pipelines/text_to_video.md index 7faf88d1335f..d7c37be6371e 100644 --- a/docs/source/en/api/pipelines/text_to_video.md +++ b/docs/source/en/api/pipelines/text_to_video.md @@ -174,11 +174,8 @@ Video generation is memory-intensive and one way to reduce your memory usage is Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage. - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## TextToVideoSDPipeline [[autodoc]] TextToVideoSDPipeline diff --git a/docs/source/en/api/pipelines/text_to_video_zero.md b/docs/source/en/api/pipelines/text_to_video_zero.md index 5fe3789d8287..50e7620760f3 100644 --- a/docs/source/en/api/pipelines/text_to_video_zero.md +++ b/docs/source/en/api/pipelines/text_to_video_zero.md @@ -289,11 +289,8 @@ can run with custom [DreamBooth](../../training/dreambooth) models, as shown bel You can filter out some available DreamBooth-trained models with [this link](https://huggingface.co/models?search=dreambooth). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## TextToVideoZeroPipeline [[autodoc]] TextToVideoZeroPipeline diff --git a/docs/source/en/api/pipelines/unclip.md b/docs/source/en/api/pipelines/unclip.md index 8011a4b533a1..7c5c2b0d9ab9 100644 --- a/docs/source/en/api/pipelines/unclip.md +++ b/docs/source/en/api/pipelines/unclip.md @@ -20,11 +20,8 @@ The abstract from the paper is following: You can find lucidrains' DALL-E 2 recreation at [lucidrains/DALLE2-pytorch](https://github.com/lucidrains/DALLE2-pytorch). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## UnCLIPPipeline [[autodoc]] UnCLIPPipeline diff --git a/docs/source/en/api/pipelines/unidiffuser.md b/docs/source/en/api/pipelines/unidiffuser.md index 7d767f2db530..2ff700e4b8be 100644 --- a/docs/source/en/api/pipelines/unidiffuser.md +++ b/docs/source/en/api/pipelines/unidiffuser.md @@ -27,11 +27,8 @@ The abstract from the paper is: You can find the original codebase at [thu-ml/unidiffuser](https://github.com/thu-ml/unidiffuser) and additional checkpoints at [thu-ml](https://huggingface.co/thu-ml). - - -There is currently an issue on PyTorch 1.X where the output images are all black or the pixel values become `NaNs`. This issue can be mitigated by switching to PyTorch 2.X. - - +> [!WARNING] +> There is currently an issue on PyTorch 1.X where the output images are all black or the pixel values become `NaNs`. This issue can be mitigated by switching to PyTorch 2.X. This pipeline was contributed by [dg845](https://github.com/dg845). ❤️ @@ -197,11 +194,8 @@ final_prompt = sample.text[0] print(final_prompt) ``` - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## UniDiffuserPipeline [[autodoc]] UniDiffuserPipeline diff --git a/docs/source/en/api/pipelines/value_guided_sampling.md b/docs/source/en/api/pipelines/value_guided_sampling.md index 797847ee479c..d050ea309ca5 100644 --- a/docs/source/en/api/pipelines/value_guided_sampling.md +++ b/docs/source/en/api/pipelines/value_guided_sampling.md @@ -12,11 +12,8 @@ specific language governing permissions and limitations under the License. # Value-guided planning - - -🧪 This is an experimental pipeline for reinforcement learning! - - +> [!WARNING] +> 🧪 This is an experimental pipeline for reinforcement learning! This pipeline is based on the [Planning with Diffusion for Flexible Behavior Synthesis](https://huggingface.co/papers/2205.09991) paper by Michael Janner, Yilun Du, Joshua B. Tenenbaum, Sergey Levine. @@ -28,11 +25,8 @@ You can find additional information about the model on the [project page](https: The script to run the model is available [here](https://github.com/huggingface/diffusers/tree/main/examples/reinforcement_learning). - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - +> [!TIP] +> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. ## ValueGuidedRLPipeline [[autodoc]] diffusers.experimental.ValueGuidedRLPipeline diff --git a/docs/source/en/api/quantization.md b/docs/source/en/api/quantization.md index 31271f1722a8..7fa7c7c9d016 100644 --- a/docs/source/en/api/quantization.md +++ b/docs/source/en/api/quantization.md @@ -15,11 +15,8 @@ specific language governing permissions and limitations under the License. Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. - - -Learn how to quantize models in the [Quantization](../quantization/overview) guide. - - +> [!TIP] +> Learn how to quantize models in the [Quantization](../quantization/overview) guide. ## PipelineQuantizationConfig diff --git a/docs/source/en/api/schedulers/ddim.md b/docs/source/en/api/schedulers/ddim.md index 5d6b4673d2b8..61ef30c786f9 100644 --- a/docs/source/en/api/schedulers/ddim.md +++ b/docs/source/en/api/schedulers/ddim.md @@ -28,11 +28,8 @@ The original codebase of this paper can be found at [ermongroup/ddim](https://gi The paper [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) claims that a mismatch between the training and inference settings leads to suboptimal inference generation results for Stable Diffusion. To fix this, the authors propose: - - -🧪 This is an experimental feature! - - +> [!WARNING] +> 🧪 This is an experimental feature! 1. rescale the noise schedule to enforce zero terminal signal-to-noise ratio (SNR) diff --git a/docs/source/en/api/schedulers/score_sde_vp.md b/docs/source/en/api/schedulers/score_sde_vp.md index 0a1fe5a3be3e..8ce9800ee3e1 100644 --- a/docs/source/en/api/schedulers/score_sde_vp.md +++ b/docs/source/en/api/schedulers/score_sde_vp.md @@ -18,11 +18,8 @@ The abstract from the paper is: *Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. 
In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.* - - -🚧 This scheduler is under construction! - - +> [!WARNING] +> 🚧 This scheduler is under construction! ## ScoreSdeVpScheduler [[autodoc]] schedulers.deprecated.scheduling_sde_vp.ScoreSdeVpScheduler diff --git a/docs/source/en/conceptual/evaluation.md b/docs/source/en/conceptual/evaluation.md index 6546e5bf2454..4af38254bea6 100644 --- a/docs/source/en/conceptual/evaluation.md +++ b/docs/source/en/conceptual/evaluation.md @@ -104,13 +104,10 @@ We can also set `num_images_per_prompt` accordingly to compare different images Once several images are generated from all the prompts using multiple models (under evaluation), these results are presented to human evaluators for scoring. For more details on the DrawBench and PartiPrompts benchmarks, refer to their respective papers. - - -It is useful to look at some inference samples while a model is training to measure the -training progress. In our [training scripts](https://github.com/huggingface/diffusers/tree/main/examples/), we support this utility with additional support for -logging to TensorBoard and Weights & Biases. - - +> [!TIP] +> It is useful to look at some inference samples while a model is training to measure the +> training progress. In our [training scripts](https://github.com/huggingface/diffusers/tree/main/examples/), we support this utility with additional support for +> logging to TensorBoard and Weights & Biases. ## Quantitative Evaluation @@ -205,14 +202,11 @@ print(f"CLIP Score with v-1-5: {sd_clip_score_1_5}") It seems like the [v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint performs better than its predecessor. Note, however, that the number of prompts we used to compute the CLIP scores is quite low. For a more practical evaluation, this number should be way higher, and the prompts should be diverse. - - -By construction, there are some limitations in this score. The captions in the training dataset -were crawled from the web and extracted from `alt` and similar tags associated an image on the internet. -They are not necessarily representative of what a human being would use to describe an image. Hence we -had to "engineer" some prompts here. - - +> [!WARNING] +> By construction, there are some limitations in this score. The captions in the training dataset +> were crawled from the web and extracted from `alt` and similar tags associated an image on the internet. +> They are not necessarily representative of what a human being would use to describe an image. Hence we +> had to "engineer" some prompts here. ### Image-conditioned text-to-image generation @@ -421,11 +415,8 @@ We can extend the idea of this metric to measure how similar the original image We can use these metrics for similar pipelines such as the [`StableDiffusionPix2PixZeroPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pix2pix_zero#diffusers.StableDiffusionPix2PixZeroPipeline). 
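For reference, a self-contained sketch of such a directional score (the CLIP checkpoint and the exact normalization are assumptions): embed the image pair and the caption pair with CLIP, then compare the two edit directions.

```py
import torch
import torch.nn.functional as F
from transformers import CLIPModel, CLIPProcessor

clip_id = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(clip_id)
processor = CLIPProcessor.from_pretrained(clip_id)

def directional_similarity(image_before, image_after, caption_before, caption_after):
    # Embed both images and both captions with CLIP.
    image_inputs = processor(images=[image_before, image_after], return_tensors="pt")
    text_inputs = processor(text=[caption_before, caption_after], padding=True, return_tensors="pt")
    with torch.no_grad():
        image_feats = model.get_image_features(**image_inputs)
        text_feats = model.get_text_features(**text_inputs)
    # Compare the direction of the edit in image space with the direction of the edit in text space.
    image_direction = F.normalize(image_feats[1] - image_feats[0], dim=-1)
    text_direction = F.normalize(text_feats[1] - text_feats[0], dim=-1)
    return float((image_direction * text_direction).sum())
```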
- - -Both CLIP score and CLIP direction similarity rely on the CLIP model, which can make the evaluations biased. - - +> [!TIP] +> Both CLIP score and CLIP direction similarity rely on the CLIP model, which can make the evaluations biased. ***Extending metrics like IS, FID (discussed later), or KID can be difficult*** when the model under evaluation was pre-trained on a large image-captioning dataset (such as the [LAION-5B dataset](https://laion.ai/blog/laion-5b/)). This is because underlying these metrics is an InceptionNet (pre-trained on the ImageNet-1k dataset) used for extracting intermediate image features. The pre-training dataset of Stable Diffusion may have limited overlap with the pre-training dataset of InceptionNet, so it is not a good candidate here for feature extraction. @@ -554,21 +545,18 @@ The lower the FID, the better it is. Several things can influence FID here: For the last two points, it is, therefore, a good practice to run the evaluation across different seeds and inference steps, and then report an average result. - - -FID results tend to be fragile as they depend on a lot of factors: - -* The specific Inception model used during computation. -* The implementation accuracy of the computation. -* The image format (not the same if we start from PNGs vs JPGs). - -Keeping that in mind, FID is often most useful when comparing similar runs, but it is -hard to reproduce paper results unless the authors carefully disclose the FID -measurement code. - -These points apply to other related metrics too, such as KID and IS. - - +> [!WARNING] +> FID results tend to be fragile as they depend on a lot of factors: +> +> * The specific Inception model used during computation. +> * The implementation accuracy of the computation. +> * The image format (not the same if we start from PNGs vs JPGs). +> +> Keeping that in mind, FID is often most useful when comparing similar runs, but it is +> hard to reproduce paper results unless the authors carefully disclose the FID +> measurement code. +> +> These points apply to other related metrics too, such as KID and IS. As a final step, let's visually inspect the `fake_images`. diff --git a/docs/source/en/optimization/coreml.md b/docs/source/en/optimization/coreml.md index cd0e662bb738..71da1e3dc1fe 100644 --- a/docs/source/en/optimization/coreml.md +++ b/docs/source/en/optimization/coreml.md @@ -16,11 +16,8 @@ specific language governing permissions and limitations under the License. Core ML models can leverage all the compute engines available in Apple devices: the CPU, the GPU, and the Apple Neural Engine (or ANE, a tensor-optimized accelerator available in Apple Silicon Macs and modern iPhones/iPads). Depending on the model and the device it's running on, Core ML can mix and match compute engines too, so some portions of the model may run on the CPU while others run on GPU, for example. - - -You can also run the `diffusers` Python codebase on Apple Silicon Macs using the `mps` accelerator built into PyTorch. This approach is explained in depth in [the mps guide](mps), but it is not compatible with native apps. - - +> [!TIP] +> You can also run the `diffusers` Python codebase on Apple Silicon Macs using the `mps` accelerator built into PyTorch. This approach is explained in depth in [the mps guide](mps), but it is not compatible with native apps. 
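A minimal sketch of that `mps` path (the model id is a placeholder; attention slicing is the usual recommendation on Macs with less than 64 GB of unified memory):

```py
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
pipe = pipe.to("mps")

# Recommended when your Mac has less than 64 GB of unified memory.
pipe.enable_attention_slicing()

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
image.save("astronaut.png")
```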
## Stable Diffusion Core ML Checkpoints diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md index 76d749ecf375..941f53604cec 100644 --- a/docs/source/en/optimization/fp16.md +++ b/docs/source/en/optimization/fp16.md @@ -239,11 +239,8 @@ The `step()` function is [called](https://github.com/huggingface/diffusers/blob/ In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency. - - -Refer to the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post for maximizing performance with `torch.compile` for diffusion models. - - +> [!TIP] +> Refer to the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post for maximizing performance with `torch.compile` for diffusion models. ### Benchmarks diff --git a/docs/source/en/optimization/mps.md b/docs/source/en/optimization/mps.md index 7e4c2716accf..b5afa25b2fda 100644 --- a/docs/source/en/optimization/mps.md +++ b/docs/source/en/optimization/mps.md @@ -38,11 +38,8 @@ image = pipe(prompt).images[0] image ``` - - -The PyTorch [mps](https://pytorch.org/docs/stable/notes/mps.html) backend does not support NDArray sizes greater than `2**32`. Please open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) if you encounter this problem so we can investigate. - - +> [!WARNING] +> The PyTorch [mps](https://pytorch.org/docs/stable/notes/mps.html) backend does not support NDArray sizes greater than `2**32`. Please open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) if you encounter this problem so we can investigate. If you're using **PyTorch 1.13**, you need to "prime" the pipeline with an additional one-time pass through it. This is a temporary workaround for an issue where the first inference pass produces slightly different results than subsequent ones. You only need to do this pass once, and after just one inference step you can discard the result. diff --git a/docs/source/en/optimization/neuron.md b/docs/source/en/optimization/neuron.md index fa933317b40f..6a45bd0563bb 100644 --- a/docs/source/en/optimization/neuron.md +++ b/docs/source/en/optimization/neuron.md @@ -20,11 +20,8 @@ Diffusers functionalities are available on [AWS Inf2 instances](https://aws.amaz python -m pip install --upgrade-strategy eager optimum[neuronx] ``` - - -We provide pre-built [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) (DLAMI) and Optimum Neuron containers for Amazon SageMaker. It's recommended to correctly set up your environment. - - +> [!TIP] +> We provide pre-built [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) (DLAMI) and Optimum Neuron containers for Amazon SageMaker. It's recommended to correctly set up your environment. The example below demonstrates how to generate images with the Stable Diffusion XL model on an inf2.8xlarge instance (you can switch to cheaper inf2.xlarge instances once the model is compiled). To generate some images, use the [`~optimum.neuron.NeuronStableDiffusionXLPipeline`] class, which is similar to the [`StableDiffusionXLPipeline`] class in Diffusers. 
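A rough sketch of that example (the compiler arguments and input shapes below are assumptions; check the Optimum Neuron documentation for the exact flags supported by your `optimum-neuron` version):

```py
from optimum.neuron import NeuronStableDiffusionXLPipeline

# Compile/export the model for Neuron once; later loads can reuse the compiled artifacts.
compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"}
input_shapes = {"batch_size": 1, "height": 1024, "width": 1024}

pipe = NeuronStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    export=True,
    **compiler_args,
    **input_shapes,
)

prompt = "a pig with wings and a top hat flying over a happy futuristic scifi city"
image = pipe(prompt=prompt).images[0]
image.save("sdxl_neuron.png")
```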
diff --git a/docs/source/en/optimization/onnx.md b/docs/source/en/optimization/onnx.md index d160dcffe865..620f2af994b3 100644 --- a/docs/source/en/optimization/onnx.md +++ b/docs/source/en/optimization/onnx.md @@ -34,11 +34,8 @@ image = pipeline(prompt).images[0] pipeline.save_pretrained("./onnx-stable-diffusion-v1-5") ``` - - -Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching. - - +> [!WARNING] +> Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching. To export the pipeline in the ONNX format offline and use it later for inference, use the [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) command: diff --git a/docs/source/en/optimization/xformers.md b/docs/source/en/optimization/xformers.md index 3e2792fd5f7a..523e81559547 100644 --- a/docs/source/en/optimization/xformers.md +++ b/docs/source/en/optimization/xformers.md @@ -20,16 +20,10 @@ Install xFormers from `pip`: pip install xformers ``` - - -The xFormers `pip` package requires the latest version of PyTorch. If you need to use a previous version of PyTorch, then we recommend [installing xFormers from the source](https://github.com/facebookresearch/xformers#installing-xformers). - - +> [!TIP] +> The xFormers `pip` package requires the latest version of PyTorch. If you need to use a previous version of PyTorch, then we recommend [installing xFormers from the source](https://github.com/facebookresearch/xformers#installing-xformers). After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption as shown in this [section](memory#memory-efficient-attention). - - -According to this [issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or DreamBooth) in some GPUs. If you observe this problem, please install a development version as indicated in the issue comments. - - +> [!WARNING] +> According to this [issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or DreamBooth) in some GPUs. If you observe this problem, please install a development version as indicated in the issue comments. diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md index f97119d5f4cd..072947274463 100644 --- a/docs/source/en/quantization/bitsandbytes.md +++ b/docs/source/en/quantization/bitsandbytes.md @@ -206,11 +206,8 @@ Once a model is quantized, you can push the model to the Hub with the [`~ModelMi - - -Training with 8-bit and 4-bit weights are only supported for training *extra* parameters. - - +> [!WARNING] +> Training with 8-bit and 4-bit weights are only supported for training *extra* parameters. Check your memory footprint with the `get_memory_footprint` method: @@ -234,11 +231,8 @@ model_4bit = AutoModel.from_pretrained( ## 8-bit (LLM.int8() algorithm) - - -Learn more about the details of 8-bit quantization in this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration)! - - +> [!TIP] +> Learn more about the details of 8-bit quantization in this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration)! 
This section explores some of the specific features of 8-bit models, such as outlier thresholds and skipping module conversion. @@ -283,11 +277,8 @@ model_8bit = SD3Transformer2DModel.from_pretrained( ## 4-bit (QLoRA algorithm) - - -Learn more about its details in this [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). - - +> [!TIP] +> Learn more about its details in this [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). This section explores some of the specific features of 4-bit models, such as changing the compute data type, using the Normal Float 4 (NF4) data type, and using nested quantization. diff --git a/docs/source/en/training/controlnet.md b/docs/source/en/training/controlnet.md index 17da819db84b..840130d2b43c 100644 --- a/docs/source/en/training/controlnet.md +++ b/docs/source/en/training/controlnet.md @@ -33,11 +33,8 @@ cd examples/controlnet pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -61,11 +58,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py) and let us know if you have any questions or concerns. ## Script parameters @@ -100,11 +94,8 @@ As with the script parameters, a general walkthrough of the training script is p The training script has a [`make_train_dataset`](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/controlnet/train_controlnet.py#L582) function for preprocessing the dataset with image transforms and caption tokenization. You'll see that in addition to the usual caption tokenization and image transforms, the script also includes transforms for the conditioning image. - - -If you're streaming a dataset on a TPU, performance may be bottlenecked by the 🤗 Datasets library which is not optimized for images. 
To ensure maximum throughput, you're encouraged to explore other dataset formats like [WebDataset](https://webdataset.github.io/webdataset/), [TorchData](https://github.com/pytorch/data), and [TensorFlow Datasets](https://www.tensorflow.org/datasets/tfless_tfds). - - +> [!TIP] +> If you're streaming a dataset on a TPU, performance may be bottlenecked by the 🤗 Datasets library which is not optimized for images. To ensure maximum throughput, you're encouraged to explore other dataset formats like [WebDataset](https://webdataset.github.io/webdataset/), [TorchData](https://github.com/pytorch/data), and [TensorFlow Datasets](https://www.tensorflow.org/datasets/tfless_tfds). ```py conditioning_image_transforms = transforms.Compose( diff --git a/docs/source/en/training/create_dataset.md b/docs/source/en/training/create_dataset.md index 8e0d6f92005c..725f143bba40 100644 --- a/docs/source/en/training/create_dataset.md +++ b/docs/source/en/training/create_dataset.md @@ -7,11 +7,8 @@ This guide will show you two ways to create a dataset to finetune on: - provide a folder of images to the `--train_data_dir` argument - upload a dataset to the Hub and pass the dataset repository id to the `--dataset_name` argument - - -💡 Learn more about how to create an image dataset for training in the [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset) guide. - - +> [!TIP] +> 💡 Learn more about how to create an image dataset for training in the [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset) guide. ## Provide a dataset as a folder @@ -33,11 +30,8 @@ accelerate launch train_unconditional.py \ ## Upload your data to the Hub - - -💡 For more details and context about creating and uploading a dataset to the Hub, take a look at the [Image search with 🤗 Datasets](https://huggingface.co/blog/image-search-datasets) post. - - +> [!TIP] +> 💡 For more details and context about creating and uploading a dataset to the Hub, take a look at the [Image search with 🤗 Datasets](https://huggingface.co/blog/image-search-datasets) post. Start by creating a dataset with the [`ImageFolder`](https://huggingface.co/docs/datasets/image_load#imagefolder) feature, which creates an `image` column containing the PIL-encoded images. diff --git a/docs/source/en/training/custom_diffusion.md b/docs/source/en/training/custom_diffusion.md index e803448b5f82..bfa4fe6f9e66 100644 --- a/docs/source/en/training/custom_diffusion.md +++ b/docs/source/en/training/custom_diffusion.md @@ -34,11 +34,8 @@ pip install -r requirements.txt pip install clip-retrieval ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -62,11 +59,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. 
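As a small sketch of the `ImageFolder` flow described in the create-a-dataset guide above — the local path and repository id are placeholders:

```py
from datasets import load_dataset

# "imagefolder" builds an `image` column of PIL-decoded images from a local directory.
dataset = load_dataset("imagefolder", data_dir="path/to/your/images", split="train")

# Optionally push it to the Hub so it can be passed to --dataset_name later.
dataset.push_to_hub("your-username/your-dataset")
```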
- - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion/train_custom_diffusion.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion/train_custom_diffusion.py) and let us know if you have any questions or concerns. ## Script parameters @@ -117,11 +111,8 @@ accelerate launch train_custom_diffusion.py \ ## Training script - - -A lot of the code in the Custom Diffusion training script is similar to the [DreamBooth](dreambooth#training-script) script. This guide instead focuses on the code that is relevant to Custom Diffusion. - - +> [!TIP] +> A lot of the code in the Custom Diffusion training script is similar to the [DreamBooth](dreambooth#training-script) script. This guide instead focuses on the code that is relevant to Custom Diffusion. The Custom Diffusion training script has two dataset classes: @@ -224,16 +215,13 @@ Set the environment variable `MODEL_NAME` to a model id on the Hub or a path to To monitor training progress with Weights and Biases, add the `--report_to=wandb` parameter to the training command and specify a validation prompt with `--validation_prompt`. This is useful for debugging and saving intermediate results. - - -If you're training on human faces, the Custom Diffusion team has found the following parameters to work well: - -- `--learning_rate=5e-6` -- `--max_train_steps` can be anywhere between 1000 and 2000 -- `--freeze_model=crossattn` -- use at least 15-20 images to train with - - +> [!TIP] +> If you're training on human faces, the Custom Diffusion team has found the following parameters to work well: +> +> - `--learning_rate=5e-6` +> - `--max_train_steps` can be anywhere between 1000 and 2000 +> - `--freeze_model=crossattn` +> - use at least 15-20 images to train with diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 586f765709b7..f9756e1a67aa 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -56,6 +56,9 @@ Call `accelerate launch` to run the script and use the `--num_processes` argumen accelerate launch run_distributed.py --num_processes=2 ``` +> [!TIP] +> Refer to this minimal example [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for running inference across multiple GPUs. To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide. + ## PyTorch Distributed PyTorch [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) enables [data parallelism](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=data_parallelism), which replicates the same model on each device, to process different batches of data in parallel. 
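The PyTorch Distributed paragraph above can be made concrete with a small sketch: one process and one pipeline replica per GPU, each generating from its own prompt. Two GPUs are assumed; the prompts, port, and file names are placeholders.

```py
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from diffusers import DiffusionPipeline


def run_inference(rank, world_size):
    # One process per GPU; each process loads its own copy of the pipeline.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    pipeline = DiffusionPipeline.from_pretrained(
        "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to(rank)

    prompt = ["a dog", "a cat"][rank]  # a different prompt for each process
    image = pipeline(prompt).images[0]
    image.save(f"image_{rank}.png")

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(run_inference, args=(world_size,), nprocs=world_size, join=True)
```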
diff --git a/docs/source/en/training/dreambooth.md b/docs/source/en/training/dreambooth.md index 3a5ba5aa39c3..81ed09c9d002 100644 --- a/docs/source/en/training/dreambooth.md +++ b/docs/source/en/training/dreambooth.md @@ -33,11 +33,8 @@ cd examples/dreambooth pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -61,19 +58,13 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) and let us know if you have any questions or concerns. ## Script parameters - - -DreamBooth is very sensitive to training hyperparameters, and it is easy to overfit. Read the [Training Stable Diffusion with Dreambooth using 🧨 Diffusers](https://huggingface.co/blog/dreambooth) blog post for recommended settings for different subjects to help you choose the appropriate hyperparameters. - - +> [!WARNING] +> DreamBooth is very sensitive to training hyperparameters, and it is easy to overfit. Read the [Training Stable Diffusion with Dreambooth using 🧨 Diffusers](https://huggingface.co/blog/dreambooth) blog post for recommended settings for different subjects to help you choose the appropriate hyperparameters. The training script offers many parameters for customizing your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/072e00897a7cf4302c347a63ec917b4b8add16d4/examples/dreambooth/train_dreambooth.py#L228) function. The parameters are set with default values that should work pretty well out-of-the-box, but you can also set your own values in the training command if you'd like. @@ -319,29 +310,26 @@ accelerate launch train_dreambooth.py \ Once training is complete, you can use your newly trained model for inference! - - -Can't wait to try your model for inference before training is complete? 🤭 Make sure you have the latest version of 🤗 Accelerate installed. 
- -```py -from diffusers import DiffusionPipeline, UNet2DConditionModel -from transformers import CLIPTextModel -import torch - -unet = UNet2DConditionModel.from_pretrained("path/to/model/checkpoint-100/unet") - -# if you have trained with `--args.train_text_encoder` make sure to also load the text encoder -text_encoder = CLIPTextModel.from_pretrained("path/to/model/checkpoint-100/checkpoint-100/text_encoder") - -pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet, text_encoder=text_encoder, dtype=torch.float16, -).to("cuda") - -image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0] -image.save("dog-bucket.png") -``` - - +> [!TIP] +> Can't wait to try your model for inference before training is complete? 🤭 Make sure you have the latest version of 🤗 Accelerate installed. +> +> ```py +> from diffusers import DiffusionPipeline, UNet2DConditionModel +> from transformers import CLIPTextModel +> import torch +> +> unet = UNet2DConditionModel.from_pretrained("path/to/model/checkpoint-100/unet") +> +> # if you have trained with `--args.train_text_encoder` make sure to also load the text encoder +> text_encoder = CLIPTextModel.from_pretrained("path/to/model/checkpoint-100/checkpoint-100/text_encoder") +> +> pipeline = DiffusionPipeline.from_pretrained( +> "stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet, text_encoder=text_encoder, dtype=torch.float16, +> ).to("cuda") +> +> image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0] +> image.save("dog-bucket.png") +> ``` ```py from diffusers import DiffusionPipeline diff --git a/docs/source/en/training/instructpix2pix.md b/docs/source/en/training/instructpix2pix.md index c1ba5d870ac7..a1c94bb33ffe 100644 --- a/docs/source/en/training/instructpix2pix.md +++ b/docs/source/en/training/instructpix2pix.md @@ -31,11 +31,8 @@ cd examples/instruct_pix2pix pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -59,11 +56,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. 
If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py) and let us know if you have any questions or concerns. ## Script parameters @@ -174,15 +168,12 @@ This guide uses the [fusing/instructpix2pix-1000-samples](https://huggingface.co Set the `MODEL_NAME` environment variable to the name of the model (can be a model id on the Hub or a path to a local model), and the `DATASET_ID` to the name of the dataset on the Hub. The script creates and saves all the components (feature extractor, scheduler, text encoder, UNet, etc.) to a subfolder in your repository. - - -For better results, try longer training runs with a larger dataset. We've only tested this training script on a smaller-scale dataset. - -
- -To monitor training progress with Weights and Biases, add the `--report_to=wandb` parameter to the training command and specify a validation image with `--val_image_url` and a validation prompt with `--validation_prompt`. This can be really useful for debugging the model. - -
+> [!TIP] +> For better results, try longer training runs with a larger dataset. We've only tested this training script on a smaller-scale dataset. +> +>
+> +> To monitor training progress with Weights and Biases, add the `--report_to=wandb` parameter to the training command and specify a validation image with `--val_image_url` and a validation prompt with `--validation_prompt`. This can be really useful for debugging the model. If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. diff --git a/docs/source/en/training/kandinsky.md b/docs/source/en/training/kandinsky.md index 561bc1c351b7..6cfd9f8d60a2 100644 --- a/docs/source/en/training/kandinsky.md +++ b/docs/source/en/training/kandinsky.md @@ -12,11 +12,8 @@ specific language governing permissions and limitations under the License. # Kandinsky 2.2 - - -This script is experimental, and it's easy to overfit and run into issues like catastrophic forgetting. Try exploring different hyperparameters to get the best results on your dataset. - - +> [!WARNING] +> This script is experimental, and it's easy to overfit and run into issues like catastrophic forgetting. Try exploring different hyperparameters to get the best results on your dataset. Kandinsky 2.2 is a multilingual text-to-image model capable of producing more photorealistic images. The model includes an image prior model for creating image embeddings from text prompts, and a decoder model that generates images based on the prior model's embeddings. That's why you'll find two separate scripts in Diffusers for Kandinsky 2.2, one for training the prior model and one for training the decoder model. You can train both models separately, but to get the best results, you should train both the prior and decoder models. @@ -39,11 +36,8 @@ cd examples/kandinsky2_2/text_to_image pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -67,11 +61,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training scripts that are important for understanding how to modify it, but it doesn't cover every aspect of the scripts in detail. If you're interested in learning more, feel free to read through the scripts and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training scripts that are important for understanding how to modify it, but it doesn't cover every aspect of the scripts in detail. If you're interested in learning more, feel free to read through the scripts and let us know if you have any questions or concerns. ## Script parameters @@ -209,11 +200,8 @@ You'll train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambd If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. 
- - -To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You’ll also need to add the `--validation_prompt` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. - - +> [!TIP] +> To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You’ll also need to add the `--validation_prompt` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. @@ -283,11 +271,8 @@ prompt="A robot naruto, 4k photo" image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0] ``` - - -Feel free to replace `kandinsky-community/kandinsky-2-2-decoder` with your own trained decoder checkpoint! - - +> [!TIP] +> Feel free to replace `kandinsky-community/kandinsky-2-2-decoder` with your own trained decoder checkpoint! diff --git a/docs/source/en/training/lcm_distill.md b/docs/source/en/training/lcm_distill.md index 280b6469f6fd..232f2eceed5d 100644 --- a/docs/source/en/training/lcm_distill.md +++ b/docs/source/en/training/lcm_distill.md @@ -33,11 +33,8 @@ cd examples/consistency_distillation pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment (try enabling `torch.compile` to significantly speedup training): @@ -63,11 +60,8 @@ Lastly, if you want to train a model on your own dataset, take a look at the [Cr ## Script parameters - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sd_wds.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sd_wds.py) and let us know if you have any questions or concerns. The training script provides many parameters to help you customize your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L419) function. This function provides default values for each parameter, such as the training batch size and learning rate, but you can also set your own values in the training command if you'd like. 
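Since the distillation parameters above ultimately produce a consistency-distilled checkpoint, here is a hedged sketch of what inference with such a checkpoint could look like; the repository id is a placeholder, and pairing it with `LCMScheduler` and very few steps is the usual pattern.

```py
import torch
from diffusers import DiffusionPipeline, LCMScheduler

# Placeholder id: a checkpoint produced by the distillation script above.
pipeline = DiffusionPipeline.from_pretrained(
    "your-username/your-lcm-distilled-model", torch_dtype=torch.float16
).to("cuda")
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)

# Consistency-distilled models are meant for very few steps and little to no guidance.
image = pipeline("a photo of a cat", num_inference_steps=4, guidance_scale=1.0).images[0]
```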
diff --git a/docs/source/en/training/lora.md b/docs/source/en/training/lora.md index e97d8acdac46..45a234b76a61 100644 --- a/docs/source/en/training/lora.md +++ b/docs/source/en/training/lora.md @@ -12,19 +12,13 @@ specific language governing permissions and limitations under the License. # LoRA - - -This is experimental and the API may change in the future. - - +> [!WARNING] +> This is experimental and the API may change in the future. [LoRA (Low-Rank Adaptation of Large Language Models)](https://hf.co/papers/2106.09685) is a popular and lightweight training technique that significantly reduces the number of trainable parameters. It works by inserting a smaller number of new weights into the model and only these are trained. This makes training with LoRA much faster, memory-efficient, and produces smaller model weights (a few hundred MBs), which are easier to store and share. LoRA can also be combined with other training techniques like DreamBooth to speedup training. - - -LoRA is very versatile and supported for [DreamBooth](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py), [Kandinsky 2.2](https://github.com/huggingface/diffusers/blob/main/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py), [Stable Diffusion XL](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora_sdxl.py), [text-to-image](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py), and [Wuerstchen](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py). - - +> [!TIP] +> LoRA is very versatile and supported for [DreamBooth](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py), [Kandinsky 2.2](https://github.com/huggingface/diffusers/blob/main/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py), [Stable Diffusion XL](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora_sdxl.py), [text-to-image](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py), and [Wuerstchen](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py). This guide will explore the [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) script to help you become more familiar with it, and how you can adapt it for your own use-case. @@ -43,11 +37,8 @@ cd examples/text_to_image pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. 
Initialize an 🤗 Accelerate environment: @@ -71,11 +62,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) and let us know if you have any questions or concerns. ## Script parameters @@ -163,11 +151,8 @@ Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambda If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. - - -A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM. - - +> [!WARNING] +> A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM. ```bash export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5" diff --git a/docs/source/en/training/sdxl.md b/docs/source/en/training/sdxl.md index 12051b7c2d11..266bbc7d6166 100644 --- a/docs/source/en/training/sdxl.md +++ b/docs/source/en/training/sdxl.md @@ -12,11 +12,8 @@ specific language governing permissions and limitations under the License. # Stable Diffusion XL - - -This script is experimental, and it's easy to overfit and run into issues like catastrophic forgetting. Try exploring different hyperparameters to get the best results on your dataset. - - +> [!WARNING] +> This script is experimental, and it's easy to overfit and run into issues like catastrophic forgetting. Try exploring different hyperparameters to get the best results on your dataset. [Stable Diffusion XL (SDXL)](https://hf.co/papers/2307.01952) is a larger and more powerful iteration of the Stable Diffusion model, capable of producing higher resolution images. @@ -39,11 +36,8 @@ cd examples/text_to_image pip install -r requirements_sdxl.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -69,11 +63,8 @@ Lastly, if you want to train a model on your own dataset, take a look at the [Cr ## Script parameters - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. 
If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_sdxl.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_sdxl.py) and let us know if you have any questions or concerns. The training script provides many parameters to help you customize your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/aab6de22c33cc01fb7bc81c0807d6109e2c998c9/examples/text_to_image/train_text_to_image_sdxl.py#L129) function. This function provides default values for each parameter, such as the training batch size and learning rate, but you can also set your own values in the training command if you'd like. @@ -178,11 +169,8 @@ Once you’ve made all your changes or you’re okay with the default configurat Let’s train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities. - - -To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You’ll also need to add the `--validation_prompt` and `--validation_epochs` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. - - +> [!TIP] +> To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You’ll also need to add the `--validation_prompt` and `--validation_epochs` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. ```bash export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" diff --git a/docs/source/en/training/t2i_adapters.md b/docs/source/en/training/t2i_adapters.md index 243c591bea6b..6d760040731d 100644 --- a/docs/source/en/training/t2i_adapters.md +++ b/docs/source/en/training/t2i_adapters.md @@ -33,11 +33,8 @@ cd examples/t2i_adapter pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. 
Initialize an 🤗 Accelerate environment: @@ -61,11 +58,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/t2i_adapter/train_t2i_adapter_sdxl.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/t2i_adapter/train_t2i_adapter_sdxl.py) and let us know if you have any questions or concerns. ## Script parameters @@ -166,11 +160,8 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png ``` - - -To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You'll also need to add the `--validation_image`, `--validation_prompt`, and `--validation_steps` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. - - +> [!TIP] +> To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You'll also need to add the `--validation_image`, `--validation_prompt`, and `--validation_steps` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. ```bash export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0" diff --git a/docs/source/en/training/text2image.md b/docs/source/en/training/text2image.md index 5212fe8393bc..a9327457c783 100644 --- a/docs/source/en/training/text2image.md +++ b/docs/source/en/training/text2image.md @@ -12,11 +12,8 @@ specific language governing permissions and limitations under the License. # Text-to-image - - -The text-to-image script is experimental, and it's easy to overfit and run into issues like catastrophic forgetting. Try exploring different hyperparameters to get the best results on your dataset. - - +> [!WARNING] +> The text-to-image script is experimental, and it's easy to overfit and run into issues like catastrophic forgetting. Try exploring different hyperparameters to get the best results on your dataset. Text-to-image models like Stable Diffusion are conditioned to generate images given a text prompt. @@ -39,11 +36,8 @@ cd examples/text_to_image pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. 
It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -69,11 +63,8 @@ Lastly, if you want to train a model on your own dataset, take a look at the [Cr ## Script parameters - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) and let us know if you have any questions or concerns. The training script provides many parameters to help you customize your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/8959c5b9dec1c94d6ba482c94a58d2215c5fd026/examples/text_to_image/train_text_to_image.py#L193) function. This function provides default values for each parameter, such as the training batch size and learning rate, but you can also set your own values in the training command if you'd like. @@ -147,11 +138,8 @@ Once you've made all your changes or you're okay with the default configuration, Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. - - -To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment variables to the path of the dataset and where to save the model to. - - +> [!TIP] +> To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment variables to the path of the dataset and where to save the model to. ```bash export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5" diff --git a/docs/source/en/training/text_inversion.md b/docs/source/en/training/text_inversion.md index 91af2f6afb81..0b540107e9b2 100644 --- a/docs/source/en/training/text_inversion.md +++ b/docs/source/en/training/text_inversion.md @@ -32,11 +32,8 @@ Navigate to the example folder with the training script and install the required cd examples/textual_inversion pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. 
Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -60,11 +57,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py) and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py) and let us know if you have any questions or concerns. ## Script parameters @@ -160,11 +154,8 @@ Set the environment variable `MODEL_NAME` to a model id on the Hub or a path to - `token_identifier.txt`: the special placeholder token - `type_of_concept.txt`: the type of concept you're training on (either "object" or "style") - - -A full training run takes ~1 hour on a single V100 GPU. - - +> [!WARNING] +> A full training run takes ~1 hour on a single V100 GPU. One more thing before you launch the script. If you're interested in following along with the training process, you can periodically save generated images as training progresses. Add the following parameters to the training command: diff --git a/docs/source/en/training/unconditional_training.md b/docs/source/en/training/unconditional_training.md index d2facc7852ec..ab3bdd6416f3 100644 --- a/docs/source/en/training/unconditional_training.md +++ b/docs/source/en/training/unconditional_training.md @@ -31,11 +31,8 @@ cd examples/unconditional_image_generation pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -61,11 +58,8 @@ Lastly, if you want to train a model on your own dataset, take a look at the [Cr ## Script parameters - - -The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) and let us know if you have any questions or concerns. 
- - +> [!TIP] +> The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) and let us know if you have any questions or concerns. The training script provides many parameters to help you customize your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/096f84b05f9514fae9f185cbec0a4d38fbad9919/examples/unconditional_image_generation/train_unconditional.py#L55) function. It provides default values for each parameter, such as the training batch size and learning rate, but you can also set your own values in the training command if you'd like. @@ -163,11 +157,8 @@ Finally, the [training loop](https://github.com/huggingface/diffusers/blob/096f8 Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀 - - -A full training run takes 2 hours on 4xV100 GPUs. - - +> [!WARNING] +> A full training run takes 2 hours on 4xV100 GPUs. diff --git a/docs/source/en/training/wuerstchen.md b/docs/source/en/training/wuerstchen.md index 38a1387dd31c..1c362879a6f4 100644 --- a/docs/source/en/training/wuerstchen.md +++ b/docs/source/en/training/wuerstchen.md @@ -33,11 +33,8 @@ cd examples/wuerstchen/text_to_image pip install -r requirements.txt ``` - - -🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. - - +> [!TIP] +> 🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more. Initialize an 🤗 Accelerate environment: @@ -61,11 +58,8 @@ write_basic_config() Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script. - - -The following sections highlight parts of the training scripts that are important for understanding how to modify it, but it doesn't cover every aspect of the [script](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_prior.py) in detail. If you're interested in learning more, feel free to read through the scripts and let us know if you have any questions or concerns. - - +> [!TIP] +> The following sections highlight parts of the training scripts that are important for understanding how to modify it, but it doesn't cover every aspect of the [script](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_prior.py) in detail. If you're interested in learning more, feel free to read through the scripts and let us know if you have any questions or concerns. 
## Script parameters @@ -133,11 +127,8 @@ Once you’ve made all your changes or you’re okay with the default configurat Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide). - - -To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You’ll also need to add the `--validation_prompt` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. - - +> [!TIP] +> To monitor training progress with Weights & Biases, add the `--report_to=wandb` parameter to the training command. You’ll also need to add the `--validation_prompt` to the training command to keep track of results. This can be really useful for debugging the model and viewing intermediate results. ```bash export DATASET_NAME="lambdalabs/naruto-blip-captions" diff --git a/docs/source/en/tutorials/basic_training.md b/docs/source/en/tutorials/basic_training.md index 9a35b3438f3f..3aa2ae429ba8 100644 --- a/docs/source/en/tutorials/basic_training.md +++ b/docs/source/en/tutorials/basic_training.md @@ -18,11 +18,8 @@ Unconditional image generation is a popular application of diffusion models that This tutorial will teach you how to train a [`UNet2DModel`] from scratch on a subset of the [Smithsonian Butterflies](https://huggingface.co/datasets/huggan/smithsonian_butterflies_subset) dataset to generate your own 🦋 butterflies 🦋. - - -💡 This training tutorial is based on the [Training with 🧨 Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook. For additional details and context about diffusion models like how they work, check out the notebook! - - +> [!TIP] +> 💡 This training tutorial is based on the [Training with 🧨 Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook. For additional details and context about diffusion models like how they work, check out the notebook! Before you begin, make sure you have 🤗 Datasets installed to load and preprocess image datasets, and 🤗 Accelerate, to simplify training on any number of GPUs. The following command will also install [TensorBoard](https://www.tensorflow.org/tensorboard) to visualize training metrics (you can also use [Weights & Biases](https://docs.wandb.ai/) to track your training). @@ -94,11 +91,8 @@ You can easily load the [Smithsonian Butterflies](https://huggingface.co/dataset >>> dataset = load_dataset(config.dataset_name, split="train") ``` - - -💡 You can find additional datasets from the [HugGan Community Event](https://huggingface.co/huggan) or you can use your own dataset by creating a local [`ImageFolder`](https://huggingface.co/docs/datasets/image_dataset#imagefolder). Set `config.dataset_name` to the repository id of the dataset if it is from the HugGan Community Event, or `imagefolder` if you're using your own images. - - +> [!TIP] +> 💡 You can find additional datasets from the [HugGan Community Event](https://huggingface.co/huggan) or you can use your own dataset by creating a local [`ImageFolder`](https://huggingface.co/docs/datasets/image_dataset#imagefolder). 
Set `config.dataset_name` to the repository id of the dataset if it is from the HugGan Community Event, or `imagefolder` if you're using your own images. 🤗 Datasets uses the [`~datasets.Image`] feature to automatically decode the image data and load it as a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html) which we can visualize: @@ -274,11 +268,8 @@ Then, you'll need a way to evaluate the model. For evaluation, you can use the [ Now you can wrap all these components together in a training loop with 🤗 Accelerate for easy TensorBoard logging, gradient accumulation, and mixed precision training. To upload the model to the Hub, write a function to get your repository name and information and then push it to the Hub. - - -💡 The training loop below may look intimidating and long, but it'll be worth it later when you launch your training in just one line of code! If you can't wait and want to start generating images, feel free to copy and run the code below. You can always come back and examine the training loop more closely later, like when you're waiting for your model to finish training. 🤗 - - +> [!TIP] +> 💡 The training loop below may look intimidating and long, but it'll be worth it later when you launch your training in just one line of code! If you can't wait and want to start generating images, feel free to copy and run the code below. You can always come back and examine the training loop more closely later, like when you're waiting for your model to finish training. 🤗 ```py >>> from accelerate import Accelerator diff --git a/docs/source/en/using-diffusers/conditional_image_generation.md b/docs/source/en/using-diffusers/conditional_image_generation.md index 7efc0c653ed6..eb75b6b8a8b1 100644 --- a/docs/source/en/using-diffusers/conditional_image_generation.md +++ b/docs/source/en/using-diffusers/conditional_image_generation.md @@ -18,11 +18,8 @@ When you think of diffusion models, text-to-image is usually one of the first th From a very high level, a diffusion model takes a prompt and some random initial noise, and iteratively removes the noise to construct an image. The *denoising* process is guided by the prompt, and once the denoising process ends after a predetermined number of time steps, the image representation is decoded into an image. - - -Read the [How does Stable Diffusion work?](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work) blog post to learn more about how a latent diffusion model works. - - +> [!TIP] +> Read the [How does Stable Diffusion work?](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work) blog post to learn more about how a latent diffusion model works. You can generate images from a prompt in 🤗 Diffusers in two steps: @@ -176,11 +173,8 @@ image - - -Other models may have different default image sizes depending on the image sizes in the training dataset. For example, SDXL's default image size is 1024x1024 and using lower `height` and `width` values may result in lower quality images. Make sure you check the model's API reference first! - - +> [!WARNING] +> Other models may have different default image sizes depending on the image sizes in the training dataset. For example, SDXL's default image size is 1024x1024 and using lower `height` and `width` values may result in lower quality images. Make sure you check the model's API reference first! 
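A minimal sketch of the `height`/`width` behavior warned about above, using the SDXL checkpoint at its native resolution (the prompt is a placeholder):

```py
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# SDXL was trained around 1024x1024; much smaller values tend to degrade quality.
image = pipeline(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
    height=1024,
    width=1024,
).images[0]
```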
### Guidance scale @@ -272,11 +266,8 @@ There are several ways to exert more control over how an image is generated outs Prompt weighting is a technique for increasing or decreasing the importance of concepts in a prompt to emphasize or minimize certain features in an image. We recommend using the [Compel](https://github.com/damian0815/compel) library to help you generate the weighted prompt embeddings. - - -Learn how to create the prompt embeddings in the [Prompt weighting](weighted_prompts) guide. This example focuses on how to use the prompt embeddings in the pipeline. - - +> [!TIP] +> Learn how to create the prompt embeddings in the [Prompt weighting](weighted_prompts) guide. This example focuses on how to use the prompt embeddings in the pipeline. Once you've created the embeddings, you can pass them to the `prompt_embeds` (and `negative_prompt_embeds` if you're using a negative prompt) parameter in the pipeline. diff --git a/docs/source/en/using-diffusers/controlling_generation.md b/docs/source/en/using-diffusers/controlling_generation.md index 8fd57a7cb8d6..aed3a8b729ac 100644 --- a/docs/source/en/using-diffusers/controlling_generation.md +++ b/docs/source/en/using-diffusers/controlling_generation.md @@ -84,23 +84,17 @@ Pix2Pix Zero can be used both to edit synthetic images as well as real images. Next, we generate image captions for the concept that shall be edited and for the new target concept. We can use a model like [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) for this purpose. Then, "mean" prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image. - To edit a real image, one first generates an image caption using a model like [BLIP](https://huggingface.co/docs/transformers/model_doc/blip). Then one applies DDIM inversion on the prompt and image to generate "inverse" latents. Similar to before, "mean" prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the "inverse" latents is used to edit the image. - - -Pix2Pix Zero is the first model that allows "zero-shot" image editing. This means that the model -can edit an image in less than a minute on a consumer GPU as shown [here](../api/pipelines/pix2pix_zero#usage-example). - - +> [!TIP] +> Pix2Pix Zero is the first model that allows "zero-shot" image editing. This means that the model +> can edit an image in less than a minute on a consumer GPU as shown [here](../api/pipelines/pix2pix_zero#usage-example). As mentioned above, Pix2Pix Zero includes optimizing the latents (and not any of the UNet, VAE, or the text encoder) to steer the generation toward a specific concept. This means that the overall pipeline might require more memory than a standard [StableDiffusionPipeline](../api/pipelines/stable_diffusion/text2img). - - -An important distinction between methods like InstructPix2Pix and Pix2Pix Zero is that the former -involves fine-tuning the pre-trained weights while the latter does not. This means that you can -apply Pix2Pix Zero to any of the available Stable Diffusion models. - - +> [!TIP] +> An important distinction between methods like InstructPix2Pix and Pix2Pix Zero is that the former +> involves fine-tuning the pre-trained weights while the latter does not. This means that you can +> apply Pix2Pix Zero to any of the available Stable Diffusion models. 
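Looping back to the prompt-weighting note in the text-to-image guide above: a sketch of how the `prompt_embeds` input might be produced with Compel. The `++` up-weighting syntax and the model id are illustrative.

```py
import torch
from compel import Compel
from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
compel = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder)

# "++" increases the weight of the preceding token in Compel's syntax.
prompt_embeds = compel("a red cat playing with a ball++")
image = pipeline(prompt_embeds=prompt_embeds).images[0]
```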
## Attend and Excite diff --git a/docs/source/en/using-diffusers/diffedit.md b/docs/source/en/using-diffusers/diffedit.md index bb1c234dd62d..adea210263d6 100644 --- a/docs/source/en/using-diffusers/diffedit.md +++ b/docs/source/en/using-diffusers/diffedit.md @@ -156,11 +156,8 @@ print(source_prompts) print(target_prompts) ``` - - -Check out the [generation strategy](https://huggingface.co/docs/transformers/main/en/generation_strategies) guide if you're interested in learning more about strategies for generating different quality text. - - +> [!TIP] +> Check out the [generation strategy](https://huggingface.co/docs/transformers/main/en/generation_strategies) guide if you're interested in learning more about strategies for generating different quality text. Load the text encoder model used by the [`StableDiffusionDiffEditPipeline`] to encode the text. You'll use the text encoder to compute the text embeddings: diff --git a/docs/source/en/using-diffusers/img2img.md b/docs/source/en/using-diffusers/img2img.md index 3f42c9396d0d..ef00bf7f9b2b 100644 --- a/docs/source/en/using-diffusers/img2img.md +++ b/docs/source/en/using-diffusers/img2img.md @@ -33,11 +33,8 @@ pipeline.enable_model_cpu_offload() pipeline.enable_xformers_memory_efficient_attention() ``` - - -You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/fp16#scaled-dot-product-attention). - - +> [!TIP] +> You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/fp16#scaled-dot-product-attention). 2. Load an image to pass to the pipeline: @@ -386,11 +383,8 @@ prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" image = pipeline(prompt, image=init_image, output_type="latent").images[0] ``` - - -It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in latent space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. - - +> [!TIP] +> It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in latent space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. Pass the latent output from this pipeline to the next pipeline to generate an image in a [comic book art style](https://huggingface.co/ogkalu/Comic-Diffusion): @@ -449,11 +443,8 @@ prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" image_1 = pipeline(prompt, image=init_image, output_type="latent").images[0] ``` - - -It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in *latent* space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. 
- - +> [!TIP] +> It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in *latent* space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. Chain it to an upscaler pipeline to increase the image resolution: diff --git a/docs/source/en/using-diffusers/inference_with_tcd_lora.md b/docs/source/en/using-diffusers/inference_with_tcd_lora.md index 88dd4733b5c3..a4de12b5e722 100644 --- a/docs/source/en/using-diffusers/inference_with_tcd_lora.md +++ b/docs/source/en/using-diffusers/inference_with_tcd_lora.md @@ -335,9 +335,8 @@ grid_image = make_image_grid([canny_image, image], rows=1, cols=2) ``` ![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_canny_tcd.png) - -The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. - +> [!TIP] +> The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 695ec040883b..28da3a68a59f 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -33,11 +33,8 @@ pipeline.enable_model_cpu_offload() pipeline.enable_xformers_memory_efficient_attention() ``` - - -You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, it's not necessary to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/fp16#scaled-dot-product-attention). - - +> [!TIP] +> You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, it's not necessary to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](../optimization/fp16#scaled-dot-product-attention). 2. Load the base and mask images: @@ -639,11 +636,8 @@ pipeline.enable_xformers_memory_efficient_attention() image = pipeline(prompt=prompt, image=image_inpainting, mask_image=mask_image, output_type="latent").images[0] ``` - - -It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in latent space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. For example, in the [Text-to-image-to-inpaint](#text-to-image-to-inpaint) section, Kandinsky 2.2 uses a different VAE class than the Stable Diffusion model so it won't work. But if you use Stable Diffusion v1.5 for both pipelines, then you can keep everything in latent space because they both use [`AutoencoderKL`]. 
- - +> [!TIP] +> It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in latent space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE. For example, in the [Text-to-image-to-inpaint](#text-to-image-to-inpaint) section, Kandinsky 2.2 uses a different VAE class than the Stable Diffusion model so it won't work. But if you use Stable Diffusion v1.5 for both pipelines, then you can keep everything in latent space because they both use [`AutoencoderKL`]. Finally, you can pass this image to an image-to-image pipeline to put the finishing touches on it. It is more efficient to use the [`~AutoPipelineForImage2Image.from_pipe`] method to reuse the existing pipeline components, and avoid unnecessarily loading all the pipeline components into memory again. diff --git a/docs/source/en/using-diffusers/kandinsky.md b/docs/source/en/using-diffusers/kandinsky.md index a482380524e7..2671c108b37b 100644 --- a/docs/source/en/using-diffusers/kandinsky.md +++ b/docs/source/en/using-diffusers/kandinsky.md @@ -31,15 +31,12 @@ Before you begin, make sure you have the following libraries installed: #!pip install -q diffusers transformers accelerate ``` - - -Kandinsky 2.1 and 2.2 usage is very similar! The only difference is Kandinsky 2.2 doesn't accept `prompt` as an input when decoding the latents. Instead, Kandinsky 2.2 only accepts `image_embeds` during decoding. - -
- -Kandinsky 3 has a more concise architecture and it doesn't require a prior model. This means it's usage is identical to other diffusion models like [Stable Diffusion XL](sdxl). - -
+> [!WARNING] +> Kandinsky 2.1 and 2.2 usage is very similar! The only difference is Kandinsky 2.2 doesn't accept `prompt` as an input when decoding the latents. Instead, Kandinsky 2.2 only accepts `image_embeds` during decoding. +> +>
+> +> Kandinsky 3 has a more concise architecture and it doesn't require a prior model. This means it's usage is identical to other diffusion models like [Stable Diffusion XL](sdxl). ## Text-to-image @@ -321,20 +318,17 @@ make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], r ## Inpainting - - -⚠️ The Kandinsky models use ⬜️ **white pixels** to represent the masked area now instead of black pixels. If you are using [`KandinskyInpaintPipeline`] in production, you need to change the mask to use white pixels: - -```py -# For PIL input -import PIL.ImageOps -mask = PIL.ImageOps.invert(mask) - -# For PyTorch and NumPy input -mask = 1 - mask -``` - - +> [!WARNING] +> ⚠️ The Kandinsky models use ⬜️ **white pixels** to represent the masked area now instead of black pixels. If you are using [`KandinskyInpaintPipeline`] in production, you need to change the mask to use white pixels: +> +> ```py +> # For PIL input +> import PIL.ImageOps +> mask = PIL.ImageOps.invert(mask) +> +> # For PyTorch and NumPy input +> mask = 1 - mask +> ``` For inpainting, you'll need the original image, a mask of the area to replace in the original image, and a text prompt of what to inpaint. Load the prior pipeline: @@ -565,11 +559,8 @@ image ## ControlNet - - -⚠️ ControlNet is only supported for Kandinsky 2.2! - - +> [!WARNING] +> ⚠️ ControlNet is only supported for Kandinsky 2.2! ControlNet enables conditioning large pretrained diffusion models with additional inputs such as a depth map or edge detection. For example, you can condition Kandinsky 2.2 with a depth map so the model understands and preserves the structure of the depth image. diff --git a/docs/source/en/using-diffusers/pag.md b/docs/source/en/using-diffusers/pag.md index 46d716bcf8cc..c11a5dc379c8 100644 --- a/docs/source/en/using-diffusers/pag.md +++ b/docs/source/en/using-diffusers/pag.md @@ -219,11 +219,8 @@ pipeline = AutoPipelineForText2Image.from_pretrained( pipeline.enable_model_cpu_offload() ``` - - -If you already have a controlnet pipeline and want to enable PAG, you can use the `from_pipe` API: `AutoPipelineForText2Image.from_pipe(pipeline_controlnet, enable_pag=True)` - - +> [!TIP] +> If you already have a controlnet pipeline and want to enable PAG, you can use the `from_pipe` API: `AutoPipelineForText2Image.from_pipe(pipeline_controlnet, enable_pag=True)` You can use the pipeline in the same way you normally use ControlNet pipelines, with the added option to specify a `pag_scale` parameter. Note that PAG works well for unconditional generation. In this example, we will generate an image without a prompt. diff --git a/docs/source/en/using-diffusers/sdxl.md b/docs/source/en/using-diffusers/sdxl.md index 106005c33807..79625e0c4a81 100644 --- a/docs/source/en/using-diffusers/sdxl.md +++ b/docs/source/en/using-diffusers/sdxl.md @@ -29,15 +29,12 @@ Before you begin, make sure you have the following libraries installed: #!pip install -q diffusers transformers accelerate invisible-watermark>=0.2.0 ``` - - -We recommend installing the [invisible-watermark](https://pypi.org/project/invisible-watermark/) library to help identify images that are generated. If the invisible-watermark library is installed, it is used by default. To disable the watermarker: - -```py -pipeline = StableDiffusionXLPipeline.from_pretrained(..., add_watermarker=False) -``` - - +> [!WARNING] +> We recommend installing the [invisible-watermark](https://pypi.org/project/invisible-watermark/) library to help identify images that are generated. 
If the invisible-watermark library is installed, it is used by default. To disable the watermarker: +> +> ```py +> pipeline = StableDiffusionXLPipeline.from_pretrained(..., add_watermarker=False) +> ``` ## Load model checkpoints @@ -174,11 +171,8 @@ refiner = DiffusionPipeline.from_pretrained( To use this approach, you need to define the number of timesteps for each model to run through their respective stages. For the base model, this is controlled by the [`denoising_end`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline.__call__.denoising_end) parameter and for the refiner model, it is controlled by the [`denoising_start`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline.__call__.denoising_start) parameter. - - -The `denoising_end` and `denoising_start` parameters should be a float between 0 and 1. These parameters are represented as a proportion of discrete timesteps as defined by the scheduler. If you're also using the `strength` parameter, it'll be ignored because the number of denoising steps is determined by the discrete timesteps the model is trained on and the declared fractional cutoff. - - +> [!TIP] +> The `denoising_end` and `denoising_start` parameters should be a float between 0 and 1. These parameters are represented as a proportion of discrete timesteps as defined by the scheduler. If you're also using the `strength` parameter, it'll be ignored because the number of denoising steps is determined by the discrete timesteps the model is trained on and the declared fractional cutoff. Let's set `denoising_end=0.8` so the base model performs the first 80% of denoising the **high-noise** timesteps and set `denoising_start=0.8` so the refiner model performs the last 20% of denoising the **low-noise** timesteps. The base model output should be in **latent** space instead of a PIL image. @@ -285,11 +279,8 @@ refiner = DiffusionPipeline.from_pretrained( ).to("cuda") ``` - - -You can use SDXL refiner with a different base model. For example, you can use the [Hunyuan-DiT](../../api/pipelines/hunyuandit) or [PixArt-Sigma](../../api/pipelines/pixart_sigma) pipelines to generate images with better prompt adherence. Once you have generated an image, you can pass it to the SDXL refiner model to enhance final generation quality. - - +> [!WARNING] +> You can use SDXL refiner with a different base model. For example, you can use the [Hunyuan-DiT](../../api/pipelines/hunyuandit) or [PixArt-Sigma](../../api/pipelines/pixart_sigma) pipelines to generate images with better prompt adherence. Once you have generated an image, you can pass it to the SDXL refiner model to enhance final generation quality. Generate an image from the base model, and set the model output to **latent** space: @@ -322,11 +313,8 @@ For inpainting, load the base and the refiner model in the [`StableDiffusionXLIn SDXL training involves several additional conditioning techniques, which are referred to as *micro-conditioning*. These include original image size, target image size, and cropping parameters. The micro-conditionings can be used at inference time to create high-quality, centered images. - - -You can use both micro-conditioning and negative micro-conditioning parameters thanks to classifier-free guidance. 
They are available in the [`StableDiffusionXLPipeline`], [`StableDiffusionXLImg2ImgPipeline`], [`StableDiffusionXLInpaintPipeline`], and [`StableDiffusionXLControlNetPipeline`]. - - +> [!TIP] +> You can use both micro-conditioning and negative micro-conditioning parameters thanks to classifier-free guidance. They are available in the [`StableDiffusionXLPipeline`], [`StableDiffusionXLImg2ImgPipeline`], [`StableDiffusionXLInpaintPipeline`], and [`StableDiffusionXLControlNetPipeline`]. ### Size conditioning diff --git a/docs/source/en/using-diffusers/shap-e.md b/docs/source/en/using-diffusers/shap-e.md index 51f0f53b0221..8cd62b3ffdb7 100644 --- a/docs/source/en/using-diffusers/shap-e.md +++ b/docs/source/en/using-diffusers/shap-e.md @@ -151,11 +151,8 @@ images = pipe(prompt, guidance_scale=guidance_scale, num_inference_steps=64, fra Use the [`~utils.export_to_ply`] function to save the mesh output as a `ply` file: - - -You can optionally save the mesh output as an `obj` file with the [`~utils.export_to_obj`] function. The ability to save the mesh output in a variety of formats makes it more flexible for downstream usage! - - +> [!TIP] +> You can optionally save the mesh output as an `obj` file with the [`~utils.export_to_obj`] function. The ability to save the mesh output in a variety of formats makes it more flexible for downstream usage! ```py from diffusers.utils import export_to_ply diff --git a/docs/source/en/using-diffusers/unconditional_image_generation.md b/docs/source/en/using-diffusers/unconditional_image_generation.md index 0208d715d437..0add5bab6707 100644 --- a/docs/source/en/using-diffusers/unconditional_image_generation.md +++ b/docs/source/en/using-diffusers/unconditional_image_generation.md @@ -26,11 +26,8 @@ image = generator().images[0] image ``` - - -Want to generate images of something else? Take a look at the training [guide](../training/unconditional_training) to learn how to train a model to generate your own images. - - +> [!TIP] +> Want to generate images of something else? Take a look at the training [guide](../training/unconditional_training) to learn how to train a model to generate your own images. The output image is a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object that can be saved: diff --git a/docs/source/en/using-diffusers/weighted_prompts.md b/docs/source/en/using-diffusers/weighted_prompts.md index 2ebf92d0eb9b..b45568ac4de0 100644 --- a/docs/source/en/using-diffusers/weighted_prompts.md +++ b/docs/source/en/using-diffusers/weighted_prompts.md @@ -217,11 +217,8 @@ Prompt weighting provides a way to emphasize or de-emphasize certain parts of a Prompt weighting works by increasing or decreasing the scale of the text embedding vector that corresponds to its concept in the prompt because you may not necessarily want the model to focus on all concepts equally. The easiest way to prepare the prompt embeddings is to use [Stable Diffusion Long Prompt Weighted Embedding](https://github.com/xhinker/sd_embed) (sd_embed). 
Once you have the prompt-weighted embeddings, you can pass them to any pipeline that has a [prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.prompt_embeds) (and optionally [negative_prompt_embeds](https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline.__call__.negative_prompt_embeds)) parameter, such as [`StableDiffusionPipeline`], [`StableDiffusionControlNetPipeline`], and [`StableDiffusionXLPipeline`]. - - -If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open an [issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can add it! - - +> [!TIP] +> If your favorite pipeline doesn't have a `prompt_embeds` parameter, please open an [issue](https://github.com/huggingface/diffusers/issues/new/choose) so we can add it! This guide will show you how to weight your prompts with sd_embed. diff --git a/docs/source/en/using-diffusers/write_own_pipeline.md b/docs/source/en/using-diffusers/write_own_pipeline.md index 15a7e8dc7c35..930b0fe21fc4 100644 --- a/docs/source/en/using-diffusers/write_own_pipeline.md +++ b/docs/source/en/using-diffusers/write_own_pipeline.md @@ -110,11 +110,8 @@ Stable Diffusion is a text-to-image *latent diffusion* model. It is called a lat As you can see, this is already more complex than the DDPM pipeline which only contains a UNet model. The Stable Diffusion model has three separate pretrained models. - - -💡 Read the [How does Stable Diffusion work?](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work) blog for more details about how the VAE, UNet, and text encoder models work. - - +> [!TIP] +> 💡 Read the [How does Stable Diffusion work?](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work) blog for more details about how the VAE, UNet, and text encoder models work. Now that you know what you need for the Stable Diffusion pipeline, load all these components with the [`~ModelMixin.from_pretrained`] method. You can find them in the pretrained [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint, and each component is stored in a separate subfolder: @@ -155,11 +152,8 @@ To speed up inference, move the models to a GPU since, unlike the scheduler, the The next step is to tokenize the text to generate embeddings. The text is used to condition the UNet model and steer the diffusion process towards something that resembles the input prompt. - - -💡 The `guidance_scale` parameter determines how much weight should be given to the prompt when generating an image. - - +> [!TIP] +> 💡 The `guidance_scale` parameter determines how much weight should be given to the prompt when generating an image. Feel free to choose any prompt you like if you want to generate something else! @@ -202,15 +196,12 @@ Let's concatenate the conditional and unconditional embeddings into a batch to a Next, generate some initial random noise as a starting point for the diffusion process. This is the latent representation of the image, and it'll be gradually denoised. At this point, the `latent` image is smaller than the final image size but that's okay though because the model will transform it into the final 512x512 image dimensions later. - - -💡 The height and width are divided by 8 because the `vae` model has 3 down-sampling layers. 
You can check by running the following: - -```py -2 ** (len(vae.config.block_out_channels) - 1) == 8 -``` - - +> [!TIP] +> 💡 The height and width are divided by 8 because the `vae` model has 3 down-sampling layers. You can check by running the following: +> +> ```py +> 2 ** (len(vae.config.block_out_channels) - 1) == 8 +> ``` ```py >>> latents = torch.randn( diff --git a/docs/source/ja/installation.md b/docs/source/ja/installation.md index 97d60528c4fd..fd6f4eda0fca 100644 --- a/docs/source/ja/installation.md +++ b/docs/source/ja/installation.md @@ -108,11 +108,8 @@ pip install -e ".[flax]" Python は通常のライブラリパスに加えて、クローンしたフォルダの中を探すようになります。 例えば、Python パッケージが通常 `~/anaconda3/envs/main/lib/python3.10/site-packages/` にインストールされている場合、Python はクローンした `~/diffusers/` フォルダも同様に参照します。 - - -ライブラリを使い続けたい場合は、`diffusers`フォルダを残しておく必要があります。 - - +> [!WARNING] +> ライブラリを使い続けたい場合は、`diffusers`フォルダを残しておく必要があります。 これで、以下のコマンドで簡単にクローンを最新版の🤗 Diffusersにアップデートできます: diff --git a/docs/source/ja/quicktour.md b/docs/source/ja/quicktour.md index 03b340b35228..ce88aaf7b56d 100644 --- a/docs/source/ja/quicktour.md +++ b/docs/source/ja/quicktour.md @@ -24,11 +24,8 @@ specific language governing permissions and limitations under the License. この案内では、[`DiffusionPipeline`]を生成に使用する方法を紹介し、モデルとスケジューラを組み合わせて[`DiffusionPipeline`]の内部で起こっていることを再現する方法を説明します。 - - -この案内は🧨 Diffusers [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)を簡略化したもので、すぐに使い始めることができます。Diffusers 🧨のゴール、設計哲学、コアAPIの詳細についてもっと知りたい方は、ノートブックをご覧ください! - - +> [!TIP] +> この案内は🧨 Diffusers [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)を簡略化したもので、すぐに使い始めることができます。Diffusers 🧨のゴール、設計哲学、コアAPIの詳細についてもっと知りたい方は、ノートブックをご覧ください! 始める前に必要なライブラリーがすべてインストールされていることを確認してください: @@ -56,11 +53,8 @@ specific language governing permissions and limitations under the License. 
この[`DiffusionPipeline`]はHugging Face Hubに保存されている任意の[チェックポイント](https://huggingface.co/models?library=diffusers&sort=downloads)を使用することができます。 この案内では、[`stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5)チェックポイントでテキストから画像へ生成します。 - - -[Stable Diffusion]モデルについては、モデルを実行する前にまず[ライセンス](https://huggingface.co/spaces/CompVis/stable-diffusion-license)を注意深くお読みください。🧨 Diffusers は、攻撃的または有害なコンテンツを防ぐために [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) を実装していますが、モデルの改良された画像生成機能により、潜在的に有害なコンテンツが生成される可能性があります。 - - +> [!WARNING] +> [Stable Diffusion]モデルについては、モデルを実行する前にまず[ライセンス](https://huggingface.co/spaces/CompVis/stable-diffusion-license)を注意深くお読みください。🧨 Diffusers は、攻撃的または有害なコンテンツを防ぐために [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) を実装していますが、モデルの改良された画像生成機能により、潜在的に有害なコンテンツが生成される可能性があります。 モデルを[`~DiffusionPipeline.from_pretrained`]メソッドでロードします: @@ -204,11 +198,8 @@ torch.Size([1, 3, 256, 256]) スケジューラは、モデルの出力(この場合は `noisy_residual` )が与えられたときに、ノイズの多いサンプルからノイズの少ないサンプルへの移行を管理します。 - - -🧨 Diffusersは拡散システムを構築するためのツールボックスです。[`DiffusionPipeline`]は事前に構築された拡散システムを使い始めるのに便利な方法ですが、独自のモデルとスケジューラコンポーネントを個別に選択してカスタム拡散システムを構築することもできます。 - - +> [!TIP] +> 🧨 Diffusersは拡散システムを構築するためのツールボックスです。[`DiffusionPipeline`]は事前に構築された拡散システムを使い始めるのに便利な方法ですが、独自のモデルとスケジューラコンポーネントを個別に選択してカスタム拡散システムを構築することもできます。 この案内では、[`DDPMScheduler`]を[`~diffusers.ConfigMixin.from_config`]メソッドでインスタンス化します: @@ -232,11 +223,8 @@ DDPMScheduler { } ``` - - -💡 スケジューラがどのようにコンフィギュレーションからインスタンス化されるかに注目してください。モデルとは異なり、スケジューラは学習可能な重みを持たず、パラメーターを持ちません! - - +> [!TIP] +> 💡 スケジューラがどのようにコンフィギュレーションからインスタンス化されるかに注目してください。モデルとは異なり、スケジューラは学習可能な重みを持たず、パラメーターを持ちません! 最も重要なパラメータは以下の通りです: diff --git a/docs/source/ja/stable_diffusion.md b/docs/source/ja/stable_diffusion.md index 85f2b38a7d80..79abfa005d62 100644 --- a/docs/source/ja/stable_diffusion.md +++ b/docs/source/ja/stable_diffusion.md @@ -37,11 +37,8 @@ prompt = "portrait photo of a old warrior chief" ## Speed - - -💡 GPUを利用できない場合は、[Colab](https://colab.research.google.com/)のようなGPUプロバイダーから無料で利用できます! - - +> [!TIP] +> 💡 GPUを利用できない場合は、[Colab](https://colab.research.google.com/)のようなGPUプロバイダーから無料で利用できます! 画像生成を高速化する最も簡単な方法の1つは、PyTorchモジュールと同じようにGPU上にパイプラインを配置することです: @@ -88,11 +85,8 @@ image 今回、画像生成にかかった時間はわずか11秒で、以前より3倍近く速くなりました! 
- - -💡 パイプラインは常に `float16` で実行することを強くお勧めします。 - - +> [!TIP] +> 💡 パイプラインは常に `float16` で実行することを強くお勧めします。 生成ステップ数を減らすという方法もあります。より効率的なスケジューラを選択することで、出力品質を犠牲にすることなくステップ数を減らすことができます。`compatibles`メソッドを呼び出すことで、[`DiffusionPipeline`]の現在のモデルと互換性のあるスケジューラを見つけることができます: diff --git a/docs/source/ja/tutorials/autopipeline.md b/docs/source/ja/tutorials/autopipeline.md index a9a780186ad1..7dc678da90be 100644 --- a/docs/source/ja/tutorials/autopipeline.md +++ b/docs/source/ja/tutorials/autopipeline.md @@ -16,11 +16,8 @@ Diffusersは様々なタスクをこなすことができ、テキストから `AutoPipeline` クラスは、🤗 Diffusers の様々なパイプラインをよりシンプルするために設計されています。この汎用的でタスク重視のパイプラインによってタスクそのものに集中することができます。`AutoPipeline` は、使用するべき正しいパイプラインクラスを自動的に検出するため、特定のパイプラインクラス名を知らなくても、タスクのチェックポイントを簡単にロードできます。 - - -どのタスクがサポートされているかは、[AutoPipeline](../api/pipelines/auto_pipeline) のリファレンスをご覧ください。現在、text-to-image、image-to-image、inpaintingをサポートしています。 - - +> [!TIP] +> どのタスクがサポートされているかは、[AutoPipeline](../api/pipelines/auto_pipeline) のリファレンスをご覧ください。現在、text-to-image、image-to-image、inpaintingをサポートしています。 このチュートリアルでは、`AutoPipeline` を使用して、事前に学習された重みが与えられたときに、特定のタスクを読み込むためのパイプラインクラスを自動的に推測する方法を示します。 diff --git a/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md b/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md index 34a00d63fed1..ba85b4a855d3 100644 --- a/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md +++ b/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md @@ -207,11 +207,8 @@ image = refiner( 동일한 40 단계에서 base 모델을 실행한다면, 이미지의 디테일(예: 사자의 눈과 코)이 떨어졌을 것입니다: - - -앙상블 방식은 사용 가능한 모든 스케줄러에서 잘 작동합니다! - - +> [!TIP] +> 앙상블 방식은 사용 가능한 모든 스케줄러에서 잘 작동합니다! #### 2.) 노이즈가 완전히 제거된 기본 이미지에서 이미지 출력을 정제하기 @@ -248,11 +245,8 @@ image = refiner(prompt=prompt, image=image[None, :]).images[0] |---|---| | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/init_image.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_image.png) | - - -refiner는 또한 인페인팅 설정에 잘 사용될 수 있습니다. 아래에 보여지듯이 [`StableDiffusionXLInpaintPipeline`] 클래스를 사용해서 만들어보세요. - - +> [!TIP] +> refiner는 또한 인페인팅 설정에 잘 사용될 수 있습니다. 아래에 보여지듯이 [`StableDiffusionXLInpaintPipeline`] 클래스를 사용해서 만들어보세요. Denoiser 앙상블 설정에서 인페인팅에 refiner를 사용하려면 다음을 수행하면 됩니다: diff --git a/docs/source/ko/conceptual/evaluation.md b/docs/source/ko/conceptual/evaluation.md index 2d296420bcfb..731b511485c3 100644 --- a/docs/source/ko/conceptual/evaluation.md +++ b/docs/source/ko/conceptual/evaluation.md @@ -95,11 +95,8 @@ images = sd_pipeline(sample_prompts, num_images_per_prompt=1, generator=generato 다양한 모델을 사용하여 모든 프롬프트에서 생성된 여러 이미지들이 생성되면 (평가 과정에서) 이러한 결과물들은 사람 평가자들에게 점수를 매기기 위해 제시됩니다. DrawBench와 PartiPrompts 벤치마크에 대한 자세한 내용은 각각의 논문을 참조하십시오. - - -모델이 훈련 중일 때 추론 샘플을 살펴보는 것은 훈련 진행 상황을 측정하는 데 유용합니다. [훈련 스크립트](https://github.com/huggingface/diffusers/tree/main/examples/)에서는 TensorBoard와 Weights & Biases에 대한 추가 지원과 함께 이 유틸리티를 지원합니다. - - +> [!TIP] +> 모델이 훈련 중일 때 추론 샘플을 살펴보는 것은 훈련 진행 상황을 측정하는 데 유용합니다. [훈련 스크립트](https://github.com/huggingface/diffusers/tree/main/examples/)에서는 TensorBoard와 Weights & Biases에 대한 추가 지원과 함께 이 유틸리티를 지원합니다. ## 정량적 평가[[quantitative-evaluation]] @@ -193,11 +190,8 @@ print(f"CLIP Score with v-1-5: {sd_clip_score_1_5}") [v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) 체크포인트가 이전 버전보다 더 나은 성능을 보이는 것 같습니다. 그러나 CLIP 점수를 계산하기 위해 사용한 프롬프트의 수가 상당히 적습니다. 보다 실용적인 평가를 위해서는 이 수를 훨씬 높게 설정하고, 프롬프트를 다양하게 사용해야 합니다. - - -이 점수에는 몇 가지 제한 사항이 있습니다. 훈련 데이터셋의 캡션은 웹에서 크롤링되어 이미지와 관련된 `alt` 및 유사한 태그에서 추출되었습니다. 
이들은 인간이 이미지를 설명하는 데 사용할 수 있는 것과 일치하지 않을 수 있습니다. 따라서 여기서는 몇 가지 프롬프트를 "엔지니어링"해야 했습니다. - - +> [!WARNING] +> 이 점수에는 몇 가지 제한 사항이 있습니다. 훈련 데이터셋의 캡션은 웹에서 크롤링되어 이미지와 관련된 `alt` 및 유사한 태그에서 추출되었습니다. 이들은 인간이 이미지를 설명하는 데 사용할 수 있는 것과 일치하지 않을 수 있습니다. 따라서 여기서는 몇 가지 프롬프트를 "엔지니어링"해야 했습니다. ### 이미지 조건화된 텍스트-이미지 생성[[image-conditioned-text-to-image-generation]] @@ -405,11 +399,8 @@ CLIP 점수와 마찬가지로, CLIP 방향 유사성이 높을수록 좋습니 [`StableDiffusionPix2PixZeroPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pix2pix_zero#diffusers.StableDiffusionPix2PixZeroPipeline)와 같은 유사한 파이프라인에도 이러한 메트릭을 사용할 수 있습니다. - - -CLIP 점수와 CLIP 방향 유사성 모두 CLIP 모델에 의존하기 때문에 평가가 편향될 수 있습니다 - - +> [!TIP] +> CLIP 점수와 CLIP 방향 유사성 모두 CLIP 모델에 의존하기 때문에 평가가 편향될 수 있습니다 ***IS, FID (나중에 설명할 예정), 또는 KID와 같은 메트릭을 확장하는 것은 어려울 수 있습니다***. 평가 중인 모델이 대규모 이미지 캡셔닝 데이터셋 (예: [LAION-5B 데이터셋](https://laion.ai/blog/laion-5b/))에서 사전 훈련되었을 때 이는 문제가 될 수 있습니다. 왜냐하면 이러한 메트릭의 기반에는 중간 이미지 특징을 추출하기 위해 ImageNet-1k 데이터셋에서 사전 훈련된 InceptionNet이 사용되기 때문입니다. Stable Diffusion의 사전 훈련 데이터셋은 InceptionNet의 사전 훈련 데이터셋과 겹치는 부분이 제한적일 수 있으므로 따라서 여기에는 좋은 후보가 아닙니다. @@ -532,19 +523,16 @@ FID는 낮을수록 좋습니다. 여러 가지 요소가 FID에 영향을 줄 마지막 두 가지 요소에 대해서는, 다른 시드와 추론 단계에서 평가를 실행하고 평균 결과를 보고하는 것은 좋은 실천 방법입니다 - - -FID 결과는 많은 요소에 의존하기 때문에 취약할 수 있습니다: - -* 계산 중 사용되는 특정 Inception 모델. -* 계산의 구현 정확도. -* 이미지 형식 (PNG 또는 JPG에서 시작하는 경우가 다릅니다). - -이러한 사항을 염두에 두면, FID는 유사한 실행을 비교할 때 가장 유용하지만, 저자가 FID 측정 코드를 주의 깊게 공개하지 않는 한 논문 결과를 재현하기는 어렵습니다. - -이러한 사항은 KID 및 IS와 같은 다른 관련 메트릭에도 적용됩니다. - - +> [!WARNING] +> FID 결과는 많은 요소에 의존하기 때문에 취약할 수 있습니다: +> +> * 계산 중 사용되는 특정 Inception 모델. +> * 계산의 구현 정확도. +> * 이미지 형식 (PNG 또는 JPG에서 시작하는 경우가 다릅니다). +> +> 이러한 사항을 염두에 두면, FID는 유사한 실행을 비교할 때 가장 유용하지만, 저자가 FID 측정 코드를 주의 깊게 공개하지 않는 한 논문 결과를 재현하기는 어렵습니다. +> +> 이러한 사항은 KID 및 IS와 같은 다른 관련 메트릭에도 적용됩니다. 마지막 단계로, `fake_images`를 시각적으로 검사해 봅시다. diff --git a/docs/source/ko/installation.md b/docs/source/ko/installation.md index c03b4642903a..198ca4b7c760 100644 --- a/docs/source/ko/installation.md +++ b/docs/source/ko/installation.md @@ -107,11 +107,8 @@ pip install -e ".[flax]" Python은 이제 일반 라이브러리 경로에 더하여 복제한 폴더 내부를 살펴봅니다. 예를들어 Python 패키지가 `~/anaconda3/envs/main/lib/python3.10/site-packages/`에 설치되어 있는 경우 Python은 복제한 폴더인 `~/diffusers/`도 검색합니다. - - -라이브러리를 계속 사용하려면 `diffusers` 폴더를 유지해야 합니다. - - +> [!WARNING] +> 라이브러리를 계속 사용하려면 `diffusers` 폴더를 유지해야 합니다. 이제 다음 명령어를 사용하여 최신 버전의 🤗 Diffusers로 쉽게 업데이트할 수 있습니다: diff --git a/docs/source/ko/optimization/coreml.md b/docs/source/ko/optimization/coreml.md index 60f19fd2c3dd..73ca851177f5 100644 --- a/docs/source/ko/optimization/coreml.md +++ b/docs/source/ko/optimization/coreml.md @@ -16,11 +16,8 @@ specific language governing permissions and limitations under the License. Core ML 모델은 Apple 기기에서 사용할 수 있는 모든 컴퓨팅 엔진들, 즉 CPU, GPU, Apple Neural Engine(또는 Apple Silicon Mac 및 최신 iPhone/iPad에서 사용할 수 있는 텐서 최적화 가속기인 ANE)을 활용할 수 있습니다. 모델과 실행 중인 기기에 따라 Core ML은 컴퓨팅 엔진도 혼합하여 사용할 수 있으므로, 예를 들어 모델의 일부가 CPU에서 실행되는 반면 다른 부분은 GPU에서 실행될 수 있습니다. - - -PyTorch에 내장된 `mps` 가속기를 사용하여 Apple Silicon Macs에서 `diffusers` Python 코드베이스를 실행할 수도 있습니다. 이 방법은 [mps 가이드]에 자세히 설명되어 있지만 네이티브 앱과 호환되지 않습니다. - - +> [!TIP] +> PyTorch에 내장된 `mps` 가속기를 사용하여 Apple Silicon Macs에서 `diffusers` Python 코드베이스를 실행할 수도 있습니다. 이 방법은 [mps 가이드]에 자세히 설명되어 있지만 네이티브 앱과 호환되지 않습니다. 
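+
+다음은 `mps` 백엔드로 파이프라인을 실행하는 최소한의 예시 스케치입니다(모델 ID와 프롬프트는 설명을 위해 가정한 값입니다). 자세한 내용은 mps 가이드를 참고하세요:
+
+```py
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
+pipe = pipe.to("mps")  # Apple Silicon의 mps 가속기로 파이프라인 이동
+pipe.enable_attention_slicing()  # 메모리 사용량을 줄이기 위한 선택적 설정
+image = pipe("a photo of an astronaut riding a horse on mars").images[0]
+```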
## Stable Diffusion Core ML 체크포인트 diff --git a/docs/source/ko/optimization/fp16.md b/docs/source/ko/optimization/fp16.md index db0370875ec6..56f1330c404e 100644 --- a/docs/source/ko/optimization/fp16.md +++ b/docs/source/ko/optimization/fp16.md @@ -74,18 +74,16 @@ prompt = "a photo of an astronaut riding a horse on mars" image = pipe(prompt).images[0] ``` - - 어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. - +> [!WARNING] +> 어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. ## 추가 메모리 절약을 위한 슬라이스 어텐션 추가 메모리 절약을 위해, 한 번에 모두 계산하는 대신 단계적으로 계산을 수행하는 슬라이스 버전의 어텐션(attention)을 사용할 수 있습니다. - - Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다. - 하나 이상의 어텐션 헤드가 있는 경우 *QK^T* 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다. - +> [!TIP] +> Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다. +> 하나 이상의 어텐션 헤드가 있는 경우 *QK^T* 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다. 각 헤드에 대해 순차적으로 어텐션 계산을 수행하려면, 다음과 같이 추론 전에 파이프라인에서 [`~StableDiffusionPipeline.enable_attention_slicing`]를 호출하면 됩니다: @@ -161,9 +159,8 @@ image = pipe(prompt).images[0] 참고로 이 방법은 전체 모델이 아닌 서브모듈 수준에서 작동합니다. 이는 메모리 소비를 최소화하는 가장 좋은 방법이지만 프로세스의 반복적 특성으로 인해 추론 속도가 훨씬 느립니다. 파이프라인의 UNet 구성 요소는 여러 번 실행됩니다('num_inference_steps' 만큼). 매번 UNet의 서로 다른 서브모듈이 순차적으로 온로드된 다음 필요에 따라 오프로드되므로 메모리 이동 횟수가 많습니다. - -또 다른 최적화 방법인 모델 오프로딩을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다. - +> [!TIP] +> 또 다른 최적화 방법인 모델 오프로딩을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다. 또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다. @@ -231,9 +228,8 @@ pipe.enable_attention_slicing(1) image = pipe(prompt).images[0] ``` - -이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다. - +> [!TIP] +> 이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다. ## Channels Last 메모리 형식 사용하기 diff --git a/docs/source/ko/optimization/mps.md b/docs/source/ko/optimization/mps.md index 4daeaf5dbacf..004374c4af03 100644 --- a/docs/source/ko/optimization/mps.md +++ b/docs/source/ko/optimization/mps.md @@ -27,11 +27,8 @@ Diffusers는 Stable Diffusion 추론을 위해 PyTorch `mps`를 사용해 Apple 아래 코도는 익숙한 `to()` 인터페이스를 사용하여 `mps` 백엔드로 Stable Diffusion 파이프라인을 M1 또는 M2 장치로 이동하는 방법을 보여줍니다. - - -**PyTorch 1.13을 사용 중일 때 ** 추가 일회성 전달을 사용하여 파이프라인을 "프라이밍"하는 것을 추천합니다. 이것은 발견한 이상한 문제에 대한 임시 해결 방법입니다. 첫 번째 추론 전달은 후속 전달와 약간 다른 결과를 생성합니다. 이 전달은 한 번만 수행하면 되며 추론 단계를 한 번만 사용하고 결과를 폐기해도 됩니다. - - +> [!WARNING] +> **PyTorch 1.13을 사용 중일 때 ** 추가 일회성 전달을 사용하여 파이프라인을 "프라이밍"하는 것을 추천합니다. 이것은 발견한 이상한 문제에 대한 임시 해결 방법입니다. 첫 번째 추론 전달은 후속 전달와 약간 다른 결과를 생성합니다. 이 전달은 한 번만 수행하면 되며 추론 단계를 한 번만 사용하고 결과를 폐기해도 됩니다. 이전 팁에서 설명한 것들을 포함한 여러 문제를 해결하므로 PyTorch 2 이상을 사용하는 것이 좋습니다. diff --git a/docs/source/ko/optimization/xformers.md b/docs/source/ko/optimization/xformers.md index 3e4d107c0a8c..96fab34acfb3 100644 --- a/docs/source/ko/optimization/xformers.md +++ b/docs/source/ko/optimization/xformers.md @@ -21,16 +21,10 @@ specific language governing permissions and limitations under the License. pip install xformers ``` - - -xFormers PIP 패키지에는 최신 버전의 PyTorch(xFormers 0.0.16에 1.13.1)가 필요합니다. 이전 버전의 PyTorch를 사용해야 하는 경우 [프로젝트 지침](https://github.com/facebookresearch/xformers#installing-xformers)의 소스를 사용해 xFormers를 설치하는 것이 좋습니다. - - +> [!TIP] +> xFormers PIP 패키지에는 최신 버전의 PyTorch(xFormers 0.0.16에 1.13.1)가 필요합니다. 
이전 버전의 PyTorch를 사용해야 하는 경우 [프로젝트 지침](https://github.com/facebookresearch/xformers#installing-xformers)의 소스를 사용해 xFormers를 설치하는 것이 좋습니다. xFormers를 설치하면, [여기](fp16#memory-efficient-attention)서 설명한 것처럼 'enable_xformers_memory_efficient_attention()'을 사용하여 추론 속도를 높이고 메모리 소비를 줄일 수 있습니다. - - -[이 이슈](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212)에 따르면 xFormers `v0.0.16`에서 GPU를 사용한 학습(파인 튜닝 또는 Dreambooth)을 할 수 없습니다. 해당 문제가 발견되면. 해당 코멘트를 참고해 development 버전을 설치하세요. - - +> [!WARNING] +> [이 이슈](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212)에 따르면 xFormers `v0.0.16`에서 GPU를 사용한 학습(파인 튜닝 또는 Dreambooth)을 할 수 없습니다. 해당 문제가 발견되면. 해당 코멘트를 참고해 development 버전을 설치하세요. diff --git a/docs/source/ko/quicktour.md b/docs/source/ko/quicktour.md index 58ebb8960f07..0a3cd0f7c4b2 100644 --- a/docs/source/ko/quicktour.md +++ b/docs/source/ko/quicktour.md @@ -23,11 +23,8 @@ Diffusion 모델은 이미지나 오디오와 같은 관심 샘플들을 생성 훑어보기에서는 추론을 위해 [`DiffusionPipeline`]을 사용하는 방법을 보여준 다음, 모델과 스케줄러를 결합하여 [`DiffusionPipeline`] 내부에서 일어나는 일을 복제하는 방법을 안내합니다. - - -훑어보기는 간결한 버전의 🧨 Diffusers 소개로서 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) 빠르게 시작할 수 있도록 도와드립니다. 디퓨저의 목표, 디자인 철학, 핵심 API에 대한 추가 세부 정보를 자세히 알아보려면 노트북을 확인하세요! - - +> [!TIP] +> 훑어보기는 간결한 버전의 🧨 Diffusers 소개로서 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) 빠르게 시작할 수 있도록 도와드립니다. 디퓨저의 목표, 디자인 철학, 핵심 API에 대한 추가 세부 정보를 자세히 알아보려면 노트북을 확인하세요! 시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요: @@ -55,11 +52,8 @@ Diffusion 모델은 이미지나 오디오와 같은 관심 샘플들을 생성 허깅페이스 허브에 저장된 모든 [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads)에 대해 [`DiffusionPipeline`]을 사용할 수 있습니다. 이 훑어보기에서는 text-to-image 생성을 위한 [`stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) 체크포인트를 로드합니다. - - -[Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) 모델의 경우, 모델을 실행하기 전에 [라이선스](https://huggingface.co/spaces/CompVis/stable-diffusion-license)를 먼저 주의 깊게 읽어주세요. 🧨 Diffusers는 불쾌하거나 유해한 콘텐츠를 방지하기 위해 [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py)를 구현하고 있지만, 모델의 향상된 이미지 생성 기능으로 인해 여전히 잠재적으로 유해한 콘텐츠가 생성될 수 있습니다. - - +> [!WARNING] +> [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) 모델의 경우, 모델을 실행하기 전에 [라이선스](https://huggingface.co/spaces/CompVis/stable-diffusion-license)를 먼저 주의 깊게 읽어주세요. 🧨 Diffusers는 불쾌하거나 유해한 콘텐츠를 방지하기 위해 [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py)를 구현하고 있지만, 모델의 향상된 이미지 생성 기능으로 인해 여전히 잠재적으로 유해한 콘텐츠가 생성될 수 있습니다. [`~DiffusionPipeline.from_pretrained`] 방법으로 모델 로드하기: @@ -203,11 +197,8 @@ torch.Size([1, 3, 256, 256]) 스케줄러는 모델 출력이 주어졌을 때 노이즈가 많은 샘플에서 노이즈가 적은 샘플로 전환하는 것을 관리합니다 - 이 경우 'noisy_residual'. - - -🧨 Diffusers는 Diffusion 시스템을 구축하기 위한 툴박스입니다. [`DiffusionPipeline`]을 사용하면 미리 만들어진 Diffusion 시스템을 편리하게 시작할 수 있지만, 모델과 스케줄러 구성 요소를 개별적으로 선택하여 사용자 지정 Diffusion 시스템을 구축할 수도 있습니다. - - +> [!TIP] +> 🧨 Diffusers는 Diffusion 시스템을 구축하기 위한 툴박스입니다. [`DiffusionPipeline`]을 사용하면 미리 만들어진 Diffusion 시스템을 편리하게 시작할 수 있지만, 모델과 스케줄러 구성 요소를 개별적으로 선택하여 사용자 지정 Diffusion 시스템을 구축할 수도 있습니다. 훑어보기의 경우, [`~diffusers.ConfigMixin.from_config`] 메서드를 사용하여 [`DDPMScheduler`]를 인스턴스화합니다: @@ -231,11 +222,8 @@ DDPMScheduler { } ``` - - -💡 스케줄러가 구성에서 어떻게 인스턴스화되는지 주목하세요. 모델과 달리 스케줄러에는 학습 가능한 가중치가 없으며 매개변수도 없습니다! 
- - +> [!TIP] +> 💡 스케줄러가 구성에서 어떻게 인스턴스화되는지 주목하세요. 모델과 달리 스케줄러에는 학습 가능한 가중치가 없으며 매개변수도 없습니다! 가장 중요한 매개변수는 다음과 같습니다: diff --git a/docs/source/ko/stable_diffusion.md b/docs/source/ko/stable_diffusion.md index 794bdf9c669b..0f61e16d2a9c 100644 --- a/docs/source/ko/stable_diffusion.md +++ b/docs/source/ko/stable_diffusion.md @@ -37,11 +37,8 @@ prompt = "portrait photo of a old warrior chief" ## 속도 - - -💡 GPU에 액세스할 수 없는 경우 다음과 같은 GPU 제공업체에서 무료로 사용할 수 있습니다!. [Colab](https://colab.research.google.com/) - - +> [!TIP] +> 💡 GPU에 액세스할 수 없는 경우 다음과 같은 GPU 제공업체에서 무료로 사용할 수 있습니다!. [Colab](https://colab.research.google.com/) 추론 속도를 높이는 가장 간단한 방법 중 하나는 Pytorch 모듈을 사용할 때와 같은 방식으로 GPU에 파이프라인을 배치하는 것입니다: @@ -89,11 +86,8 @@ image 이번에는 이미지를 생성하는 데 약 11초밖에 걸리지 않아 이전보다 3배 가까이 빨라졌습니다! - - -💡 파이프라인은 항상 `float16`에서 실행할 것을 강력히 권장하며, 지금까지 출력 품질이 저하되는 경우는 거의 없었습니다. - - +> [!TIP] +> 💡 파이프라인은 항상 `float16`에서 실행할 것을 강력히 권장하며, 지금까지 출력 품질이 저하되는 경우는 거의 없었습니다. 또 다른 옵션은 추론 단계의 수를 줄이는 것입니다. 보다 효율적인 스케줄러를 선택하면 출력 품질 저하 없이 단계 수를 줄이는 데 도움이 될 수 있습니다. 현재 모델과 호환되는 스케줄러는 `compatibles` 메서드를 호출하여 [`DiffusionPipeline`]에서 찾을 수 있습니다: diff --git a/docs/source/ko/training/controlnet.md b/docs/source/ko/training/controlnet.md index 434ca959bd99..e868b57c5546 100644 --- a/docs/source/ko/training/controlnet.md +++ b/docs/source/ko/training/controlnet.md @@ -20,11 +20,8 @@ specific language governing permissions and limitations under the License. 아래의 스크립트를 실행하기 전에, 라이브러리의 학습 의존성을 설치해야 합니다. - - -가장 최신 버전의 예시 스크립트를 성공적으로 실행하기 위해서는, 소스에서 설치하고 최신 버전의 설치를 유지하는 것을 강력하게 추천합니다. 우리는 예시 스크립트들을 자주 업데이트하고 예시에 맞춘 특정한 요구사항을 설치합니다. - - +> [!WARNING] +> 가장 최신 버전의 예시 스크립트를 성공적으로 실행하기 위해서는, 소스에서 설치하고 최신 버전의 설치를 유지하는 것을 강력하게 추천합니다. 우리는 예시 스크립트들을 자주 업데이트하고 예시에 맞춘 특정한 요구사항을 설치합니다. 위 사항을 만족시키기 위해서, 새로운 가상환경에서 다음 일련의 스텝을 실행하세요: diff --git a/docs/source/ko/training/create_dataset.md b/docs/source/ko/training/create_dataset.md index a869cd09f05d..c459a9d6a15d 100644 --- a/docs/source/ko/training/create_dataset.md +++ b/docs/source/ko/training/create_dataset.md @@ -11,11 +11,8 @@ - 이미지 폴더를 `--train_data_dir` 인수에 제공합니다. - 데이터셋을 Hub에 업로드하고 데이터셋 리포지토리 id를 `--dataset_name` 인수에 전달합니다. - - -💡 학습에 사용할 이미지 데이터셋을 만드는 방법에 대한 자세한 내용은 [이미지 데이터셋 만들기](https://huggingface.co/docs/datasets/image_dataset) 가이드를 참고하세요. - - +> [!TIP] +> 💡 학습에 사용할 이미지 데이터셋을 만드는 방법에 대한 자세한 내용은 [이미지 데이터셋 만들기](https://huggingface.co/docs/datasets/image_dataset) 가이드를 참고하세요. ## 폴더 형태로 데이터셋 구축하기 @@ -40,11 +37,8 @@ accelerate launch train_unconditional.py \ ## Hub에 데이터 올리기 - - -💡 데이터셋을 만들고 Hub에 업로드하는 것에 대한 자세한 내용은 [🤗 Datasets을 사용한 이미지 검색](https://huggingface.co/blog/image-search-datasets) 게시물을 참고하세요. - - +> [!TIP] +> 💡 데이터셋을 만들고 Hub에 업로드하는 것에 대한 자세한 내용은 [🤗 Datasets을 사용한 이미지 검색](https://huggingface.co/blog/image-search-datasets) 게시물을 참고하세요. PIL 인코딩된 이미지가 포함된 `이미지` 열을 생성하는 [이미지 폴더](https://huggingface.co/docs/datasets/image_load#imagefolder) 기능을 사용하여 데이터셋 생성을 시작합니다. diff --git a/docs/source/ko/training/distributed_inference.md b/docs/source/ko/training/distributed_inference.md index c4d6400d9795..e63764f5eb8c 100644 --- a/docs/source/ko/training/distributed_inference.md +++ b/docs/source/ko/training/distributed_inference.md @@ -32,9 +32,8 @@ Use the `--num_processes` argument to specify the number of GPUs to use, and cal accelerate launch run_distributed.py --num_processes=2 ``` -자세한 내용은 [🤗 Accelerate를 사용한 분산 추론](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) 가이드를 참조하세요. 
- - +> [!TIP] +> 자세한 내용은 [🤗 Accelerate를 사용한 분산 추론](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) 가이드를 참조하세요. ## Pytoerch 분산 diff --git a/docs/source/ko/training/dreambooth.md b/docs/source/ko/training/dreambooth.md index 8e62f8edab95..3e5a17d5f67c 100644 --- a/docs/source/ko/training/dreambooth.md +++ b/docs/source/ko/training/dreambooth.md @@ -51,11 +51,8 @@ write_basic_config() ## 파인튜닝 - - -DreamBooth 파인튜닝은 하이퍼파라미터에 매우 민감하고 과적합되기 쉽습니다. 적절한 하이퍼파라미터를 선택하는 데 도움이 되도록 다양한 권장 설정이 포함된 [심층 분석](https://huggingface.co/blog/dreambooth)을 살펴보는 것이 좋습니다. - - +> [!WARNING] +> DreamBooth 파인튜닝은 하이퍼파라미터에 매우 민감하고 과적합되기 쉽습니다. 적절한 하이퍼파라미터를 선택하는 데 도움이 되도록 다양한 권장 설정이 포함된 [심층 분석](https://huggingface.co/blog/dreambooth)을 살펴보는 것이 좋습니다. @@ -176,11 +173,8 @@ python train_dreambooth_flax.py \ 해당 스크립트를 사용하면 `unet`과 함께 `text_encoder`를 파인튜닝할 수 있습니다. 실험에서(자세한 내용은 [🧨 Diffusers를 사용해 DreamBooth로 Stable Diffusion 학습하기](https://huggingface.co/blog/dreambooth) 게시물을 확인하세요), 특히 얼굴 이미지를 생성할 때 훨씬 더 나은 결과를 얻을 수 있습니다. - - -텍스트 인코더를 학습시키려면 추가 메모리가 필요해 16GB GPU로는 동작하지 않습니다. 이 옵션을 사용하려면 최소 24GB VRAM이 필요합니다. - - +> [!WARNING] +> 텍스트 인코더를 학습시키려면 추가 메모리가 필요해 16GB GPU로는 동작하지 않습니다. 이 옵션을 사용하려면 최소 24GB VRAM이 필요합니다. `--train_text_encoder` 인수를 학습 스크립트에 전달하여 `text_encoder` 및 `unet`을 파인튜닝할 수 있습니다: diff --git a/docs/source/ko/training/lora.md b/docs/source/ko/training/lora.md index 5bcef271438d..515e3fd65e89 100644 --- a/docs/source/ko/training/lora.md +++ b/docs/source/ko/training/lora.md @@ -14,11 +14,8 @@ specific language governing permissions and limitations under the License. [[open-in-colab]] - - -현재 LoRA는 [`UNet2DConditionalModel`]의 어텐션 레이어에서만 지원됩니다. - - +> [!WARNING] +> 현재 LoRA는 [`UNet2DConditionalModel`]의 어텐션 레이어에서만 지원됩니다. [LoRA(Low-Rank Adaptation of Large Language Models)](https://huggingface.co/papers/2106.09685)는 메모리를 적게 사용하면서 대규모 모델의 학습을 가속화하는 학습 방법입니다. 이는 rank-decomposition weight 행렬 쌍(**업데이트 행렬**이라고 함)을 추가하고 새로 추가된 가중치**만** 학습합니다. 여기에는 몇 가지 장점이 있습니다. @@ -28,11 +25,8 @@ specific language governing permissions and limitations under the License. - 메모리 효율성이 향상되어 Tesla T4, RTX 3080 또는 RTX 2080 Ti와 같은 소비자용 GPU에서 파인튜닝을 실행할 수 있습니다! T4와 같은 GPU는 무료이며 Kaggle 또는 Google Colab 노트북에서 쉽게 액세스할 수 있습니다. - - -💡 LoRA는 어텐션 레이어에만 한정되지는 않습니다. 저자는 언어 모델의 어텐션 레이어를 수정하는 것이 매우 효율적으로 죻은 성능을 얻기에 충분하다는 것을 발견했습니다. 이것이 LoRA 가중치를 모델의 어텐션 레이어에 추가하는 것이 일반적인 이유입니다. LoRA 작동 방식에 대한 자세한 내용은 [Using LoRA for effective Stable Diffusion fine-tuning](https://huggingface.co/blog/lora) 블로그를 확인하세요! - - +> [!TIP] +> 💡 LoRA는 어텐션 레이어에만 한정되지는 않습니다. 저자는 언어 모델의 어텐션 레이어를 수정하는 것이 매우 효율적으로 죻은 성능을 얻기에 충분하다는 것을 발견했습니다. 이것이 LoRA 가중치를 모델의 어텐션 레이어에 추가하는 것이 일반적인 이유입니다. LoRA 작동 방식에 대한 자세한 내용은 [Using LoRA for effective Stable Diffusion fine-tuning](https://huggingface.co/blog/lora) 블로그를 확인하세요! [cloneofsimo](https://github.com/cloneofsimo)는 인기 있는 [lora](https://github.com/cloneofsimo/lora) GitHub 리포지토리에서 Stable Diffusion을 위한 LoRA 학습을 최초로 시도했습니다. 🧨 Diffusers는 [text-to-image 생성](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) 및 [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora)을 지원합니다. 이 가이드는 두 가지를 모두 수행하는 방법을 보여줍니다. @@ -104,11 +98,8 @@ accelerate launch train_dreambooth_lora.py \ *기본 모델의 가중치 위에* 파인튜닝된 DreamBooth 모델에서 LoRA 가중치를 불러온 다음, 더 빠른 추론을 위해 파이프라인을 GPU로 이동합니다. 
LoRA 가중치를 프리징된 사전 훈련된 모델 가중치와 병합할 때, 선택적으로 'scale' 매개변수로 어느 정도의 가중치를 병합할 지 조절할 수 있습니다: - - -💡 `0`의 `scale` 값은 LoRA 가중치를 사용하지 않아 원래 모델의 가중치만 사용한 것과 같고, `1`의 `scale` 값은 파인튜닝된 LoRA 가중치만 사용함을 의미합니다. 0과 1 사이의 값들은 두 결과들 사이로 보간됩니다. - - +> [!TIP] +> 💡 `0`의 `scale` 값은 LoRA 가중치를 사용하지 않아 원래 모델의 가중치만 사용한 것과 같고, `1`의 `scale` 값은 파인튜닝된 LoRA 가중치만 사용함을 의미합니다. 0과 1 사이의 값들은 두 결과들 사이로 보간됩니다. ```py >>> pipe.unet.load_attn_procs(model_path) diff --git a/docs/source/ko/training/text2image.md b/docs/source/ko/training/text2image.md index 4283f73ed9bc..b26603bf1b34 100644 --- a/docs/source/ko/training/text2image.md +++ b/docs/source/ko/training/text2image.md @@ -13,11 +13,8 @@ specific language governing permissions and limitations under the License. # Text-to-image - - -text-to-image 파인튜닝 스크립트는 experimental 상태입니다. 과적합하기 쉽고 치명적인 망각과 같은 문제에 부딪히기 쉽습니다. 자체 데이터셋에서 최상의 결과를 얻으려면 다양한 하이퍼파라미터를 탐색하는 것이 좋습니다. - - +> [!WARNING] +> text-to-image 파인튜닝 스크립트는 experimental 상태입니다. 과적합하기 쉽고 치명적인 망각과 같은 문제에 부딪히기 쉽습니다. 자체 데이터셋에서 최상의 결과를 얻으려면 다양한 하이퍼파라미터를 탐색하는 것이 좋습니다. Stable Diffusion과 같은 text-to-image 모델은 텍스트 프롬프트에서 이미지를 생성합니다. 이 가이드는 PyTorch 및 Flax를 사용하여 자체 데이터셋에서 [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) 모델로 파인튜닝하는 방법을 보여줍니다. 이 가이드에 사용된 text-to-image 파인튜닝을 위한 모든 학습 스크립트에 관심이 있는 경우 이 [리포지토리](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image)에서 자세히 찾을 수 있습니다. diff --git a/docs/source/ko/training/text_inversion.md b/docs/source/ko/training/text_inversion.md index b27bed7d14e3..d8b44930e3fd 100644 --- a/docs/source/ko/training/text_inversion.md +++ b/docs/source/ko/training/text_inversion.md @@ -23,11 +23,8 @@ specific language governing permissions and limitations under the License. 이 가이드에서는 textual-inversion으로 [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) 모델을 학습하는 방법을 설명합니다. 이 가이드에서 사용된 모든 textual-inversion 학습 스크립트는 [여기](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion)에서 확인할 수 있습니다. 내부적으로 어떻게 작동하는지 자세히 살펴보고 싶으시다면 해당 링크를 참조해주시기 바랍니다. - - -[Stable Diffusion Textual Inversion Concepts Library](https://huggingface.co/sd-concepts-library)에는 커뮤니티에서 제작한 학습된 textual-inversion 모델들이 있습니다. 시간이 지남에 따라 더 많은 콘셉트들이 추가되어 유용한 리소스로 성장할 것입니다! - - +> [!TIP] +> [Stable Diffusion Textual Inversion Concepts Library](https://huggingface.co/sd-concepts-library)에는 커뮤니티에서 제작한 학습된 textual-inversion 모델들이 있습니다. 시간이 지남에 따라 더 많은 콘셉트들이 추가되어 유용한 리소스로 성장할 것입니다! 시작하기 전에 학습을 위한 의존성 라이브러리들을 설치해야 합니다: @@ -100,11 +97,8 @@ snapshot_download( - `token_identifier.txt` - `type_of_concept.txt`. - - -💡V100 GPU 1개를 기준으로 전체 학습에는 최대 1시간이 걸립니다. 학습이 완료되기를 기다리는 동안 궁금한 점이 있으면 아래 섹션에서 [textual-inversion이 어떻게 작동하는지](https://huggingface.co/docs/diffusers/training/text_inversion#how-it-works) 자유롭게 확인하세요 ! - - +> [!TIP] +> 💡V100 GPU 1개를 기준으로 전체 학습에는 최대 1시간이 걸립니다. 학습이 완료되기를 기다리는 동안 궁금한 점이 있으면 아래 섹션에서 [textual-inversion이 어떻게 작동하는지](https://huggingface.co/docs/diffusers/training/text_inversion#how-it-works) 자유롭게 확인하세요 ! @@ -128,15 +122,12 @@ accelerate launch textual_inversion.py \ --push_to_hub ``` - - -💡학습 성능을 올리기 위해, 플레이스홀더 토큰(``)을 (단일한 임베딩 벡터가 아닌) 복수의 임베딩 벡터로 표현하는 것 역시 고려할 있습니다. 이러한 트릭이 모델이 보다 복잡한 이미지의 스타일(앞서 말한 콘셉트)을 더 잘 캡처하는 데 도움이 될 수 있습니다. 복수의 임베딩 벡터 학습을 활성화하려면 다음 옵션을 전달하십시오. - -```bash ---num_vectors=5 -``` - - +> [!TIP] +> 💡학습 성능을 올리기 위해, 플레이스홀더 토큰(``)을 (단일한 임베딩 벡터가 아닌) 복수의 임베딩 벡터로 표현하는 것 역시 고려할 있습니다. 이러한 트릭이 모델이 보다 복잡한 이미지의 스타일(앞서 말한 콘셉트)을 더 잘 캡처하는 데 도움이 될 수 있습니다. 복수의 임베딩 벡터 학습을 활성화하려면 다음 옵션을 전달하십시오. 
+> +> ```bash +> --num_vectors=5 +> ``` @@ -193,11 +184,8 @@ textual-inversion 스크립트는 기본적으로 textual-inversion을 통해 - - -💡 커뮤니티는 [sd-concepts-library](https://huggingface.co/sd-concepts-library) 라는 대규모의 textual-inversion 임베딩 벡터 라이브러리를 만들었습니다. textual-inversion 임베딩을 밑바닥부터 학습하는 대신, 해당 라이브러리에 본인이 찾는 textual-inversion 임베딩이 이미 추가되어 있지 않은지를 확인하는 것도 좋은 방법이 될 것 같습니다. - - +> [!TIP] +> 💡 커뮤니티는 [sd-concepts-library](https://huggingface.co/sd-concepts-library) 라는 대규모의 textual-inversion 임베딩 벡터 라이브러리를 만들었습니다. textual-inversion 임베딩을 밑바닥부터 학습하는 대신, 해당 라이브러리에 본인이 찾는 textual-inversion 임베딩이 이미 추가되어 있지 않은지를 확인하는 것도 좋은 방법이 될 것 같습니다. textual-inversion 임베딩 벡터을 불러오기 위해서는, 먼저 해당 임베딩 벡터를 학습할 때 사용한 모델을 불러와야 합니다. 여기서는 [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/docs/diffusers/training/stable-diffusion-v1-5/stable-diffusion-v1-5) 모델이 사용되었다고 가정하고 불러오겠습니다. diff --git a/docs/source/ko/training/unconditional_training.md b/docs/source/ko/training/unconditional_training.md index c8c463da6b8d..04a9a6c7ea3b 100644 --- a/docs/source/ko/training/unconditional_training.md +++ b/docs/source/ko/training/unconditional_training.md @@ -78,11 +78,8 @@ write_basic_config() 학습 스크립트는 `diffusion_pytorch_model.bin` 파일을 생성하고, 그것을 당신의 리포지토리에 저장합니다. - - -💡 전체 학습은 V100 GPU 4개를 사용할 경우, 2시간이 소요됩니다. - - +> [!TIP] +> 💡 전체 학습은 V100 GPU 4개를 사용할 경우, 2시간이 소요됩니다. 예를 들어, [Oxford Flowers](https://huggingface.co/datasets/huggan/flowers-102-categories) 데이터셋을 사용해 파인튜닝할 경우: diff --git a/docs/source/ko/tutorials/basic_training.md b/docs/source/ko/tutorials/basic_training.md index 2c4c89edd11d..05ce1037b537 100644 --- a/docs/source/ko/tutorials/basic_training.md +++ b/docs/source/ko/tutorials/basic_training.md @@ -19,11 +19,8 @@ Unconditional 이미지 생성은 학습에 사용된 데이터셋과 유사한 이 튜토리얼은 나만의 🦋 나비 🦋를 생성하기 위해 [Smithsonian Butterflies](https://huggingface.co/datasets/huggan/smithsonian_butterflies_subset) 데이터셋의 하위 집합에서 [`UNet2DModel`] 모델을 학습하는 방법을 가르쳐줄 것입니다. - - -💡 이 학습 튜토리얼은 [Training with 🧨 Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) 노트북 기반으로 합니다. Diffusion 모델의 작동 방식 및 자세한 내용은 노트북을 확인하세요! - - +> [!TIP] +> 💡 이 학습 튜토리얼은 [Training with 🧨 Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) 노트북 기반으로 합니다. Diffusion 모델의 작동 방식 및 자세한 내용은 노트북을 확인하세요! 시작 전에, 🤗 Datasets을 불러오고 전처리하기 위해 데이터셋이 설치되어 있는지 다수 GPU에서 학습을 간소화하기 위해 🤗 Accelerate 가 설치되어 있는지 확인하세요. 그 후 학습 메트릭을 시각화하기 위해 [TensorBoard](https://www.tensorflow.org/tensorboard)를 또한 설치하세요. (또한 학습 추적을 위해 [Weights & Biases](https://docs.wandb.ai/)를 사용할 수 있습니다.) diff --git a/docs/source/ko/using-diffusers/controlling_generation.md b/docs/source/ko/using-diffusers/controlling_generation.md index 1b9a8b5df5de..db22fe042dc1 100644 --- a/docs/source/ko/using-diffusers/controlling_generation.md +++ b/docs/source/ko/using-diffusers/controlling_generation.md @@ -85,12 +85,9 @@ Pix2Pix Zero는 합성 이미지와 실제 이미지를 편집하는 데 모두 다음으로 편집할 컨셉과 새로운 타겟 컨셉에 대한 이미지 캡션을 생성합니다. 이를 위해 [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)와 같은 모델을 사용할 수 있습니다. 그런 다음 텍스트 인코더를 통해 소스 개념과 대상 개념 모두에 대한 "평균" 프롬프트 임베딩을 생성합니다. 마지막으로, 합성 이미지를 편집하기 위해 pix2pix-zero 알고리즘을 사용합니다. - 실제 이미지를 편집하려면 먼저 [BLIP](https://huggingface.co/docs/transformers/model_doc/blip)과 같은 모델을 사용하여 이미지 캡션을 생성합니다. 그런 다음 프롬프트와 이미지에 ddim 반전을 적용하여 "역(inverse)" latents을 생성합니다. 이전과 마찬가지로 소스 및 대상 개념 모두에 대한 "평균(mean)" 프롬프트 임베딩이 생성되고 마지막으로 "역(inverse)" latents와 결합된 pix2pix-zero 알고리즘이 이미지를 편집하는 데 사용됩니다. 
- - -Pix2Pix Zero는 '제로 샷(zero-shot)' 이미지 편집이 가능한 최초의 모델입니다. -즉, 이 모델은 다음과 같이 일반 소비자용 GPU에서 1분 이내에 이미지를 편집할 수 있습니다(../api/pipelines/stable_diffusion/pix2pix_zero#usage-example). - - +> [!TIP] +> Pix2Pix Zero는 '제로 샷(zero-shot)' 이미지 편집이 가능한 최초의 모델입니다. +> 즉, 이 모델은 다음과 같이 일반 소비자용 GPU에서 1분 이내에 이미지를 편집할 수 있습니다(../api/pipelines/stable_diffusion/pix2pix_zero#usage-example). 위에서 언급했듯이 Pix2Pix Zero에는 특정 개념으로 세대를 유도하기 위해 (UNet, VAE 또는 텍스트 인코더가 아닌) latents을 최적화하는 기능이 포함되어 있습니다.즉, 전체 파이프라인에 표준 [StableDiffusionPipeline](../api/pipelines/stable_diffusion/text2img)보다 더 많은 메모리가 필요할 수 있습니다. @@ -140,13 +137,10 @@ SAG는 고빈도 세부 정보를 기반으로 하지 않은 예측에서 완전 사용 방법에 대한 자세한 내용은 [여기](../api/pipelines/stable_diffusion_2#depthtoimage)를 참조하세요. - - -InstructPix2Pix와 Pix2Pix Zero와 같은 방법의 중요한 차이점은 전자의 경우 -는 사전 학습된 가중치를 미세 조정하는 반면, 후자는 그렇지 않다는 것입니다. 즉, 다음을 수행할 수 있습니다. -사용 가능한 모든 안정적 확산 모델에 Pix2Pix Zero를 적용할 수 있습니다. - - +> [!TIP] +> InstructPix2Pix와 Pix2Pix Zero와 같은 방법의 중요한 차이점은 전자의 경우 +> 는 사전 학습된 가중치를 미세 조정하는 반면, 후자는 그렇지 않다는 것입니다. 즉, 다음을 수행할 수 있습니다. +> 사용 가능한 모든 안정적 확산 모델에 Pix2Pix Zero를 적용할 수 있습니다. ## MultiDiffusion Panorama diff --git a/docs/source/ko/using-diffusers/custom_pipeline_overview.md b/docs/source/ko/using-diffusers/custom_pipeline_overview.md index b143bf8ab0d0..caeeca8cefec 100644 --- a/docs/source/ko/using-diffusers/custom_pipeline_overview.md +++ b/docs/source/ko/using-diffusers/custom_pipeline_overview.md @@ -20,11 +20,8 @@ specific language governing permissions and limitations under the License. 허브에서 커뮤니티 파이프라인을 로드하려면, 커뮤니티 파이프라인의 리포지토리 ID와 (파이프라인 가중치 및 구성 요소를 로드하려는) 모델의 리포지토리 ID를 인자로 전달해야 합니다. 예를 들어, 아래 예시에서는 `hf-internal-testing/diffusers-dummy-pipeline`에서 더미 파이프라인을 불러오고, `google/ddpm-cifar10-32`에서 파이프라인의 가중치와 컴포넌트들을 로드합니다. - - -🔒 허깅 페이스 허브에서 커뮤니티 파이프라인을 불러오는 것은 곧 해당 코드가 안전하다고 신뢰하는 것입니다. 코드를 자동으로 불러오고 실행하기 앞서 반드시 온라인으로 해당 코드의 신뢰성을 검사하세요! - - +> [!WARNING] +> 🔒 허깅 페이스 허브에서 커뮤니티 파이프라인을 불러오는 것은 곧 해당 코드가 안전하다고 신뢰하는 것입니다. 코드를 자동으로 불러오고 실행하기 앞서 반드시 온라인으로 해당 코드의 신뢰성을 검사하세요! ```py from diffusers import DiffusionPipeline diff --git a/docs/source/ko/using-diffusers/diffedit.md b/docs/source/ko/using-diffusers/diffedit.md index 74b9e9783155..edf23f0214ab 100644 --- a/docs/source/ko/using-diffusers/diffedit.md +++ b/docs/source/ko/using-diffusers/diffedit.md @@ -156,11 +156,8 @@ print(source_prompts) print(target_prompts) ``` - - -다양한 품질의 텍스트를 생성하는 전략에 대해 자세히 알아보려면 [생성 전략](https://huggingface.co/docs/transformers/main/en/generation_strategies) 가이드를 참조하세요. - - +> [!TIP] +> 다양한 품질의 텍스트를 생성하는 전략에 대해 자세히 알아보려면 [생성 전략](https://huggingface.co/docs/transformers/main/en/generation_strategies) 가이드를 참조하세요. 텍스트 인코딩을 위해 [`StableDiffusionDiffEditPipeline`]에서 사용하는 텍스트 인코더 모델을 불러옵니다. 텍스트 인코더를 사용하여 텍스트 임베딩을 계산합니다: diff --git a/docs/source/ko/using-diffusers/img2img.md b/docs/source/ko/using-diffusers/img2img.md index 8da840f74814..3901fb755f8d 100644 --- a/docs/source/ko/using-diffusers/img2img.md +++ b/docs/source/ko/using-diffusers/img2img.md @@ -53,11 +53,8 @@ init_image - - -💡 `strength`는 입력 이미지에 추가되는 노이즈의 양을 제어하는 0.0에서 1.0 사이의 값입니다. 1.0에 가까운 값은 다양한 변형을 허용하지만 입력 이미지와 의미적으로 일치하지 않는 이미지를 생성합니다. - - +> [!TIP] +> 💡 `strength`는 입력 이미지에 추가되는 노이즈의 양을 제어하는 0.0에서 1.0 사이의 값입니다. 1.0에 가까운 값은 다양한 변형을 허용하지만 입력 이미지와 의미적으로 일치하지 않는 이미지를 생성합니다. 
프롬프트를 정의하고(지브리 스타일(Ghibli-style)에 맞게 조정된 이 체크포인트의 경우 프롬프트 앞에 `ghibli style` 토큰을 붙여야 합니다) 파이프라인을 실행합니다: diff --git a/docs/source/ko/using-diffusers/inpaint.md b/docs/source/ko/using-diffusers/inpaint.md index adf1251176a6..cefb89218621 100644 --- a/docs/source/ko/using-diffusers/inpaint.md +++ b/docs/source/ko/using-diffusers/inpaint.md @@ -59,11 +59,8 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] :-------------------------:|:-------------------------:|:-------------------------:|-------------------------:| drawing | drawing | ***Face of a yellow cat, high resolution, sitting on a park bench*** | drawing | - - -이전의 실험적인 인페인팅 구현에서는 품질이 낮은 다른 프로세스를 사용했습니다. 이전 버전과의 호환성을 보장하기 위해 새 모델이 포함되지 않은 사전학습된 파이프라인을 불러오면 이전 인페인팅 방법이 계속 적용됩니다. - - +> [!WARNING] +> 이전의 실험적인 인페인팅 구현에서는 품질이 낮은 다른 프로세스를 사용했습니다. 이전 버전과의 호환성을 보장하기 위해 새 모델이 포함되지 않은 사전학습된 파이프라인을 불러오면 이전 인페인팅 방법이 계속 적용됩니다. 아래 Space에서 이미지 인페인팅을 직접 해보세요! diff --git a/docs/source/ko/using-diffusers/kandinsky.md b/docs/source/ko/using-diffusers/kandinsky.md index cc554c67f989..8eff8f5629a5 100644 --- a/docs/source/ko/using-diffusers/kandinsky.md +++ b/docs/source/ko/using-diffusers/kandinsky.md @@ -31,15 +31,12 @@ Kandinsky 모델은 일련의 다국어 text-to-image 생성 모델입니다. Ka #!pip install -q diffusers transformers accelerate ``` - - -Kandinsky 2.1과 2.2의 사용법은 매우 유사합니다! 유일한 차이점은 Kandinsky 2.2는 latents를 디코딩할 때 `프롬프트`를 입력으로 받지 않는다는 것입니다. 대신, Kandinsky 2.2는 디코딩 중에는 `image_embeds`만 받아들입니다. - -
- -Kandinsky 3는 더 간결한 아키텍처를 가지고 있으며 prior 모델이 필요하지 않습니다. 즉, [Stable Diffusion XL](sdxl)과 같은 다른 diffusion 모델과 사용법이 동일합니다. - -
+> [!WARNING] +> Kandinsky 2.1과 2.2의 사용법은 매우 유사합니다! 유일한 차이점은 Kandinsky 2.2는 latents를 디코딩할 때 `프롬프트`를 입력으로 받지 않는다는 것입니다. 대신, Kandinsky 2.2는 디코딩 중에는 `image_embeds`만 받아들입니다. +> +>
+> +> Kandinsky 3는 더 간결한 아키텍처를 가지고 있으며 prior 모델이 필요하지 않습니다. 즉, [Stable Diffusion XL](sdxl)과 같은 다른 diffusion 모델과 사용법이 동일합니다. ## Text-to-image @@ -321,20 +318,17 @@ make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], r ## Inpainting - - -⚠️ Kandinsky 모델은 이제 검은색 픽셀 대신 ⬜️ **흰색 픽셀**을 사용하여 마스크 영역을 표현합니다. 프로덕션에서 [`KandinskyInpaintPipeline`]을 사용하는 경우 흰색 픽셀을 사용하도록 마스크를 변경해야 합니다: - -```py -# PIL 입력에 대해 -import PIL.ImageOps -mask = PIL.ImageOps.invert(mask) - -# PyTorch와 NumPy 입력에 대해 -mask = 1 - mask -``` - - +> [!WARNING] +> ⚠️ Kandinsky 모델은 이제 검은색 픽셀 대신 ⬜️ **흰색 픽셀**을 사용하여 마스크 영역을 표현합니다. 프로덕션에서 [`KandinskyInpaintPipeline`]을 사용하는 경우 흰색 픽셀을 사용하도록 마스크를 변경해야 합니다: +> +> ```py +> # PIL 입력에 대해 +> import PIL.ImageOps +> mask = PIL.ImageOps.invert(mask) +> +> # PyTorch와 NumPy 입력에 대해 +> mask = 1 - mask +> ``` 인페인팅에서는 원본 이미지, 원본 이미지에서 대체할 영역의 마스크, 인페인팅할 내용에 대한 텍스트 프롬프트가 필요합니다. Prior 파이프라인을 불러옵니다: @@ -565,11 +559,8 @@ image ## ControlNet - - -⚠️ ControlNet은 Kandinsky 2.2에서만 지원됩니다! - - +> [!WARNING] +> ⚠️ ControlNet은 Kandinsky 2.2에서만 지원됩니다! ControlNet을 사용하면 depth map이나 edge detection와 같은 추가 입력을 통해 사전학습된 large diffusion 모델을 conditioning할 수 있습니다. 예를 들어, 모델이 depth map의 구조를 이해하고 보존할 수 있도록 깊이 맵으로 Kandinsky 2.2를 conditioning할 수 있습니다. diff --git a/docs/source/ko/using-diffusers/loading.md b/docs/source/ko/using-diffusers/loading.md index 3d6b7634b49a..2160acacc2e0 100644 --- a/docs/source/ko/using-diffusers/loading.md +++ b/docs/source/ko/using-diffusers/loading.md @@ -30,11 +30,8 @@ diffusion 모델의 훈련과 추론에 필요한 모든 것은 [`DiffusionPipel ## Diffusion 파이프라인 - - -💡 [`DiffusionPipeline`] 클래스가 동작하는 방식에 보다 자세한 내용이 궁금하다면, [DiffusionPipeline explained](#diffusionpipeline에-대해-알아보기) 섹션을 확인해보세요. - - +> [!TIP] +> 💡 [`DiffusionPipeline`] 클래스가 동작하는 방식에 보다 자세한 내용이 궁금하다면, [DiffusionPipeline explained](#diffusionpipeline에-대해-알아보기) 섹션을 확인해보세요. [`DiffusionPipeline`] 클래스는 diffusion 모델을 [허브](https://huggingface.co/models?library=diffusers)로부터 불러오는 가장 심플하면서 보편적인 방식입니다. [`DiffusionPipeline.from_pretrained`] 메서드는 적합한 파이프라인 클래스를 자동으로 탐지하고, 필요한 구성요소(configuration)와 가중치(weight) 파일들을 다운로드하고 캐싱한 다음, 해당 파이프라인 인스턴스를 반환합니다. @@ -175,11 +172,8 @@ Variant란 일반적으로 다음과 같은 체크포인트들을 의미합니 - `torch.float16`과 같이 정밀도는 더 낮지만, 용량 역시 더 작은 부동소수점 타입의 가중치를 사용하는 체크포인트. *(다만 이와 같은 variant의 경우, 추가적인 훈련과 CPU환경에서의 구동이 불가능합니다.)* - Non-EMA 가중치를 사용하는 체크포인트. *(Non-EMA 가중치의 경우, 파인 튜닝 단계에서 사용하는 것이 권장되는데, 추론 단계에선 사용하지 않는 것이 권장됩니다.)* - - -💡 모델 구조는 동일하지만 서로 다른 학습 환경에서 서로 다른 데이터셋으로 학습된 체크포인트들이 있을 경우, 해당 체크포인트들은 variant 단계가 아닌 리포지토리 단계에서 분리되어 관리되어야 합니다. (즉, 해당 체크포인트들은 서로 다른 리포지토리에서 따로 관리되어야 합니다. 예시: [`stable-diffusion-v1-4`], [`stable-diffusion-v1-5`]). - - +> [!TIP] +> 💡 모델 구조는 동일하지만 서로 다른 학습 환경에서 서로 다른 데이터셋으로 학습된 체크포인트들이 있을 경우, 해당 체크포인트들은 variant 단계가 아닌 리포지토리 단계에서 분리되어 관리되어야 합니다. (즉, 해당 체크포인트들은 서로 다른 리포지토리에서 따로 관리되어야 합니다. 예시: [`stable-diffusion-v1-4`], [`stable-diffusion-v1-5`]). | **checkpoint type** | **weight name** | **argument for loading weights** | | ------------------- | ----------------------------------- | -------------------------------- | diff --git a/docs/source/ko/using-diffusers/loading_adapters.md b/docs/source/ko/using-diffusers/loading_adapters.md index f0d085bc6a2e..e7ae116575ae 100644 --- a/docs/source/ko/using-diffusers/loading_adapters.md +++ b/docs/source/ko/using-diffusers/loading_adapters.md @@ -18,11 +18,8 @@ specific language governing permissions and limitations under the License. 이 가이드에서는 DreamBooth, textual inversion 및 LoRA 가중치를 불러오는 방법을 설명합니다. 
- - -사용할 체크포인트와 임베딩은 [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery)에서 찾아보시기 바랍니다. - - +> [!TIP] +> 사용할 체크포인트와 임베딩은 [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery)에서 찾아보시기 바랍니다. ## DreamBooth @@ -101,11 +98,8 @@ image [Low-Rank Adaptation (LoRA)](https://huggingface.co/papers/2106.09685)은 속도가 빠르고 파일 크기가 (수백 MB로) 작기 때문에 널리 사용되는 학습 기법입니다. 이 가이드의 다른 방법과 마찬가지로, LoRA는 몇 장의 이미지만으로 새로운 스타일을 학습하도록 모델을 학습시킬 수 있습니다. 이는 diffusion 모델에 새로운 가중치를 삽입한 다음 전체 모델 대신 새로운 가중치만 학습시키는 방식으로 작동합니다. 따라서 LoRA를 더 빠르게 학습시키고 더 쉽게 저장할 수 있습니다. - - -LoRA는 다른 학습 방법과 함께 사용할 수 있는 매우 일반적인 학습 기법입니다. 예를 들어, DreamBooth와 LoRA로 모델을 학습하는 것이 일반적입니다. 또한 새롭고 고유한 이미지를 생성하기 위해 여러 개의 LoRA를 불러오고 병합하는 것이 점점 더 일반화되고 있습니다. 병합은 이 불러오기 가이드의 범위를 벗어나므로 자세한 내용은 심층적인 [LoRA 병합](merge_loras) 가이드에서 확인할 수 있습니다. - - +> [!TIP] +> LoRA는 다른 학습 방법과 함께 사용할 수 있는 매우 일반적인 학습 기법입니다. 예를 들어, DreamBooth와 LoRA로 모델을 학습하는 것이 일반적입니다. 또한 새롭고 고유한 이미지를 생성하기 위해 여러 개의 LoRA를 불러오고 병합하는 것이 점점 더 일반화되고 있습니다. 병합은 이 불러오기 가이드의 범위를 벗어나므로 자세한 내용은 심층적인 [LoRA 병합](merge_loras) 가이드에서 확인할 수 있습니다. LoRA는 다른 모델과 함께 사용해야 합니다: @@ -184,11 +178,8 @@ pipe.set_adapters("my_adapter", scales) 이는 여러 어댑터에서도 작동합니다. 방법은 [이 가이드](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength)를 참조하세요. - - -현재 [`~loaders.LoraLoaderMixin.set_adapters`]는 어텐션 가중치의 스케일링만 지원합니다. LoRA에 다른 부분(예: resnets or down-/upsamplers)이 있는 경우 1.0의 스케일을 유지합니다. - - +> [!WARNING] +> 현재 [`~loaders.LoraLoaderMixin.set_adapters`]는 어텐션 가중치의 스케일링만 지원합니다. LoRA에 다른 부분(예: resnets or down-/upsamplers)이 있는 경우 1.0의 스케일을 유지합니다. ### Kohya와 TheLastBen @@ -222,14 +213,11 @@ image = pipeline(prompt).images[0] image ``` - - -Kohya LoRA를 🤗 Diffusers와 함께 사용할 때 몇 가지 제한 사항이 있습니다: - -- [여기](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736)에 설명된 여러 가지 이유로 인해 이미지가 ComfyUI와 같은 UI에서 생성된 이미지와 다르게 보일 수 있습니다. -- [LyCORIS 체크포인트](https://github.com/KohakuBlueleaf/LyCORIS)가 완전히 지원되지 않습니다. [`~loaders.LoraLoaderMixin.load_lora_weights`] 메서드는 LoRA 및 LoCon 모듈로 LyCORIS 체크포인트를 불러올 수 있지만, Hada 및 LoKR은 지원되지 않습니다. - - +> [!WARNING] +> Kohya LoRA를 🤗 Diffusers와 함께 사용할 때 몇 가지 제한 사항이 있습니다: +> +> - [여기](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736)에 설명된 여러 가지 이유로 인해 이미지가 ComfyUI와 같은 UI에서 생성된 이미지와 다르게 보일 수 있습니다. +> - [LyCORIS 체크포인트](https://github.com/KohakuBlueleaf/LyCORIS)가 완전히 지원되지 않습니다. [`~loaders.LoraLoaderMixin.load_lora_weights`] 메서드는 LoRA 및 LoCon 모듈로 LyCORIS 체크포인트를 불러올 수 있지만, Hada 및 LoKR은 지원되지 않습니다.
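Within those limits, loading a Kohya-format file still follows the usual [`~loaders.LoraLoaderMixin.load_lora_weights`] flow. A rough sketch, assuming an SDXL base model and a locally downloaded file named `my_kohya_lora.safetensors` (both placeholders):

```py
import torch
from diffusers import AutoPipelineForText2Image

# Use whichever base checkpoint the LoRA was trained against (SDXL assumed here)
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# "my_kohya_lora.safetensors" is a placeholder for a Kohya-format file in the current directory
pipeline.load_lora_weights(".", weight_name="my_kohya_lora.safetensors")

image = pipeline("a photo in the style the LoRA was trained on").images[0]
```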
@@ -326,9 +314,8 @@ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name= IP-Adapter FaceID 모델은 CLIP 이미지 임베딩 대신 `insightface`에서 생성한 이미지 임베딩을 사용하는 실험적인 IP Adapter입니다. 이러한 모델 중 일부는 LoRA를 사용하여 ID 일관성을 개선하기도 합니다. 이러한 모델을 사용하려면 `insightface`와 해당 요구 사항을 모두 설치해야 합니다. - -InsightFace 사전학습된 모델은 비상업적 연구 목적으로만 사용할 수 있으므로, IP-Adapter-FaceID 모델은 연구 목적으로만 릴리즈되었으며 상업적 용도로는 사용할 수 없습니다. - +> [!WARNING] +> InsightFace 사전학습된 모델은 비상업적 연구 목적으로만 사용할 수 있으므로, IP-Adapter-FaceID 모델은 연구 목적으로만 릴리즈되었으며 상업적 용도로는 사용할 수 없습니다. ```py pipeline = AutoPipelineForText2Image.from_pretrained( diff --git a/docs/source/ko/using-diffusers/other-formats.md b/docs/source/ko/using-diffusers/other-formats.md index 3034551f4858..f5a71f56ebef 100644 --- a/docs/source/ko/using-diffusers/other-formats.md +++ b/docs/source/ko/using-diffusers/other-formats.md @@ -14,11 +14,8 @@ specific language governing permissions and limitations under the License. Stable Diffusion 모델들은 학습 및 저장된 프레임워크와 다운로드 위치에 따라 다양한 형식으로 제공됩니다. 이러한 형식을 🤗 Diffusers에서 사용할 수 있도록 변환하면 추론을 위한 [다양한 스케줄러 사용](schedulers), 사용자 지정 파이프라인 구축, 추론 속도 최적화를 위한 다양한 기법과 방법 등 라이브러리에서 지원하는 모든 기능을 사용할 수 있습니다. - - -우리는 `.safetensors` 형식을 추천합니다. 왜냐하면 기존의 pickled 파일은 취약하고 머신에서 코드를 실행할 때 악용될 수 있는 것에 비해 훨씬 더 안전합니다. (safetensors 불러오기 가이드에서 자세히 알아보세요.) - - +> [!TIP] +> 우리는 `.safetensors` 형식을 추천합니다. 왜냐하면 기존의 pickled 파일은 취약하고 머신에서 코드를 실행할 때 악용될 수 있는 것에 비해 훨씬 더 안전합니다. (safetensors 불러오기 가이드에서 자세히 알아보세요.) 이 가이드에서는 다른 Stable Diffusion 형식을 🤗 Diffusers와 호환되도록 변환하는 방법을 설명합니다. diff --git a/docs/source/ko/using-diffusers/schedulers.md b/docs/source/ko/using-diffusers/schedulers.md index 55424c9982db..b12c08b8c869 100644 --- a/docs/source/ko/using-diffusers/schedulers.md +++ b/docs/source/ko/using-diffusers/schedulers.md @@ -318,12 +318,9 @@ images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True). images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) ``` - - -다음 Flax 스케줄러는 *아직* Flax Stable Diffusion 파이프라인과 호환되지 않습니다. - -- `FlaxLMSDiscreteScheduler` -- `FlaxDDPMScheduler` - - +> [!WARNING] +> 다음 Flax 스케줄러는 *아직* Flax Stable Diffusion 파이프라인과 호환되지 않습니다. +> +> - `FlaxLMSDiscreteScheduler` +> - `FlaxDDPMScheduler` diff --git a/docs/source/ko/using-diffusers/shap-e.md b/docs/source/ko/using-diffusers/shap-e.md index abf5a182b3a6..4c9d7fb7d1aa 100644 --- a/docs/source/ko/using-diffusers/shap-e.md +++ b/docs/source/ko/using-diffusers/shap-e.md @@ -151,11 +151,8 @@ images = pipe(prompt, guidance_scale=guidance_scale, num_inference_steps=64, fra 메시 출력을 `ply` 파일로 저장하려면 [`~utils.export_to_ply`] 함수를 사용합니다: - - -선택적으로 [`~utils.export_to_obj`] 함수를 사용하여 메시 출력을 `obj` 파일로 저장할 수 있습니다. 다양한 형식으로 메시 출력을 저장할 수 있어 다운스트림에서 더욱 유연하게 사용할 수 있습니다! - - +> [!TIP] +> 선택적으로 [`~utils.export_to_obj`] 함수를 사용하여 메시 출력을 `obj` 파일로 저장할 수 있습니다. 다양한 형식으로 메시 출력을 저장할 수 있어 다운스트림에서 더욱 유연하게 사용할 수 있습니다! ```py from diffusers.utils import export_to_ply diff --git a/docs/source/ko/using-diffusers/unconditional_image_generation.md b/docs/source/ko/using-diffusers/unconditional_image_generation.md index c3eaac4b032f..b8fe800578fe 100644 --- a/docs/source/ko/using-diffusers/unconditional_image_generation.md +++ b/docs/source/ko/using-diffusers/unconditional_image_generation.md @@ -20,11 +20,8 @@ Unconditional 이미지 생성은 비교적 간단한 작업입니다. 모델이 먼저 ['DiffusionPipeline']의 인스턴스를 생성하고 다운로드할 파이프라인의 [체크포인트](https://huggingface.co/models?library=diffusers&sort=downloads)를 지정합니다. 허브의 🧨 diffusion 체크포인트 중 하나를 사용할 수 있습니다(사용할 체크포인트는 나비 이미지를 생성합니다). - - -💡 나만의 unconditional 이미지 생성 모델을 학습시키고 싶으신가요? 
학습 가이드를 살펴보고 나만의 이미지를 생성하는 방법을 알아보세요. - - +> [!TIP] +> 💡 나만의 unconditional 이미지 생성 모델을 학습시키고 싶으신가요? 학습 가이드를 살펴보고 나만의 이미지를 생성하는 방법을 알아보세요. 이 가이드에서는 unconditional 이미지 생성에 ['DiffusionPipeline']과 [DDPM](https://huggingface.co/papers/2006.11239)을 사용합니다: diff --git a/docs/source/ko/using-diffusers/write_own_pipeline.md b/docs/source/ko/using-diffusers/write_own_pipeline.md index 45678763cce5..ae6ce238ac1b 100644 --- a/docs/source/ko/using-diffusers/write_own_pipeline.md +++ b/docs/source/ko/using-diffusers/write_own_pipeline.md @@ -110,11 +110,8 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di 보시다시피, 이것은 UNet 모델만 포함된 DDPM 파이프라인보다 더 복잡합니다. Stable Diffusion 모델에는 세 개의 개별 사전학습된 모델이 있습니다. - - -💡 VAE, UNet 및 텍스트 인코더 모델의 작동방식에 대한 자세한 내용은 [How does Stable Diffusion work?](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work) 블로그를 참조하세요. - - +> [!TIP] +> 💡 VAE, UNet 및 텍스트 인코더 모델의 작동방식에 대한 자세한 내용은 [How does Stable Diffusion work?](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work) 블로그를 참조하세요. 이제 Stable Diffusion 파이프라인에 필요한 구성요소들이 무엇인지 알았으니, [`~ModelMixin.from_pretrained`] 메서드를 사용해 모든 구성요소를 불러옵니다. 사전학습된 체크포인트 [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5)에서 찾을 수 있으며, 각 구성요소들은 별도의 하위 폴더에 저장되어 있습니다: @@ -151,11 +148,8 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di 다음 단계는 임베딩을 생성하기 위해 텍스트를 토큰화하는 것입니다. 이 텍스트는 UNet 모델에서 condition으로 사용되고 입력 프롬프트와 유사한 방향으로 diffusion 프로세스를 조정하는 데 사용됩니다. - - -💡 `guidance_scale` 매개변수는 이미지를 생성할 때 프롬프트에 얼마나 많은 가중치를 부여할지 결정합니다. - - +> [!TIP] +> 💡 `guidance_scale` 매개변수는 이미지를 생성할 때 프롬프트에 얼마나 많은 가중치를 부여할지 결정합니다. 다른 프롬프트를 생성하고 싶다면 원하는 프롬프트를 자유롭게 선택하세요! @@ -198,15 +192,12 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di 그다음 diffusion 프로세스의 시작점으로 초기 랜덤 노이즈를 생성합니다. 이것이 이미지의 잠재적 표현이며 점차적으로 노이즈가 제거됩니다. 이 시점에서 `latent` 이미지는 최종 이미지 크기보다 작지만 나중에 모델이 이를 512x512 이미지 크기로 변환하므로 괜찮습니다. - - -💡 `vae` 모델에는 3개의 다운 샘플링 레이어가 있기 때문에 높이와 너비가 8로 나뉩니다. 다음을 실행하여 확인할 수 있습니다: - -```py -2 ** (len(vae.config.block_out_channels) - 1) == 8 -``` - - +> [!TIP] +> 💡 `vae` 모델에는 3개의 다운 샘플링 레이어가 있기 때문에 높이와 너비가 8로 나뉩니다. 다음을 실행하여 확인할 수 있습니다: +> +> ```py +> 2 ** (len(vae.config.block_out_channels) - 1) == 8 +> ``` ```py >>> latents = torch.randn( diff --git a/docs/source/pt/installation.md b/docs/source/pt/installation.md index 1e83e36ca157..acc767110cb9 100644 --- a/docs/source/pt/installation.md +++ b/docs/source/pt/installation.md @@ -104,11 +104,8 @@ Esses comandos irá linkar a pasta que você clonou o repositório e os caminhos Python então irá procurar dentro da pasta que você clonou além dos caminhos normais das bibliotecas. Por exemplo, se o pacote python for tipicamente instalado no `~/anaconda3/envs/main/lib/python3.10/site-packages/`, o Python também irá procurar na pasta `~/diffusers/` que você clonou. - - -Você deve deixar a pasta `diffusers` se você quiser continuar usando a biblioteca. - - +> [!WARNING] +> Você deve deixar a pasta `diffusers` se você quiser continuar usando a biblioteca. 
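One quick way to see why the cloned folder must stay in place: with an editable install, the package resolves directly out of the clone rather than from `site-packages`. A small check, where the `~/diffusers` location is only an assumed example path:

```py
import diffusers

# With `pip install -e .`, this points inside the clone (e.g. ~/diffusers/src/diffusers/__init__.py),
# not into site-packages, which is why deleting the clone breaks the import.
print(diffusers.__file__)
print(diffusers.__version__)
```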
Agora você pode facilmente atualizar seu clone para a última versão do 🤗 Diffusers com o seguinte comando: diff --git a/docs/source/pt/quicktour.md b/docs/source/pt/quicktour.md index 109f7e271295..5996b65a9cb4 100644 --- a/docs/source/pt/quicktour.md +++ b/docs/source/pt/quicktour.md @@ -24,11 +24,8 @@ Seja você um desenvolvedor ou um usuário, esse tour rápido irá introduzir vo Esse tour rápido mostrará como usar o [`DiffusionPipeline`] para inferência, e então mostrará como combinar um modelo e um agendador para replicar o que está acontecendo dentro do [`DiffusionPipeline`]. - - -Esse tour rápido é uma versão simplificada da introdução 🧨 Diffusers [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) para ajudar você a começar rápido. Se você quer aprender mais sobre o objetivo do 🧨 Diffusers, filosofia de design, e detalhes adicionais sobre a API principal, veja o notebook! - - +> [!TIP] +> Esse tour rápido é uma versão simplificada da introdução 🧨 Diffusers [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) para ajudar você a começar rápido. Se você quer aprender mais sobre o objetivo do 🧨 Diffusers, filosofia de design, e detalhes adicionais sobre a API principal, veja o notebook! Antes de começar, certifique-se de ter todas as bibliotecas necessárias instaladas: @@ -56,11 +53,8 @@ Comece criando uma instância do [`DiffusionPipeline`] e especifique qual checkp Você pode usar o [`DiffusionPipeline`] para qualquer [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads) armazenado no Hugging Face Hub. Nesse quicktour, você carregará o checkpoint [`stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) para geração de texto para imagem. - - -Para os modelos de [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion), por favor leia cuidadosamente a [licença](https://huggingface.co/spaces/CompVis/stable-diffusion-license) primeiro antes de rodar o modelo. 🧨 Diffusers implementa uma verificação de segurança: [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) para prevenir conteúdo ofensivo ou nocivo, mas as capacidades de geração de imagem aprimorada do modelo podem ainda produzir conteúdo potencialmente nocivo. - - +> [!WARNING] +> Para os modelos de [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion), por favor leia cuidadosamente a [licença](https://huggingface.co/spaces/CompVis/stable-diffusion-license) primeiro antes de rodar o modelo. 🧨 Diffusers implementa uma verificação de segurança: [`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) para prevenir conteúdo ofensivo ou nocivo, mas as capacidades de geração de imagem aprimorada do modelo podem ainda produzir conteúdo potencialmente nocivo. Para carregar o modelo com o método [`~DiffusionPipeline.from_pretrained`]: @@ -204,11 +198,8 @@ Para geração de exemplos reais, você precisará de um agendador para guiar o Agendadores gerenciam a retirada do ruído de uma amostra ruidosa para uma amostra menos ruidosa dado a saída do modelo - nesse caso, é o `noisy_residual`. - - -🧨 Diffusers é uma caixa de ferramentas para construir sistemas de difusão. 
Enquanto o [`DiffusionPipeline`] é uma forma conveniente de começar com um sistema de difusão pré-construído, você também pode escolher seus próprios modelos e agendadores separadamente para construir um sistema de difusão personalizado. - - +> [!TIP] +> 🧨 Diffusers é uma caixa de ferramentas para construir sistemas de difusão. Enquanto o [`DiffusionPipeline`] é uma forma conveniente de começar com um sistema de difusão pré-construído, você também pode escolher seus próprios modelos e agendadores separadamente para construir um sistema de difusão personalizado. Para o tour rápido, você irá instanciar o [`DDPMScheduler`] com o método [`~diffusers.ConfigMixin.from_config`]: @@ -232,11 +223,8 @@ DDPMScheduler { } ``` - - -💡 Perceba como o agendador é instanciado de uma configuração. Diferentemente de um modelo, um agendador não tem pesos treináveis e é livre de parâmetros! - - +> [!TIP] +> 💡 Perceba como o agendador é instanciado de uma configuração. Diferentemente de um modelo, um agendador não tem pesos treináveis e é livre de parâmetros! Um dos parâmetros mais importante são: diff --git a/docs/source/zh/conceptual/evaluation.md b/docs/source/zh/conceptual/evaluation.md index e809c8730d34..770d197be041 100644 --- a/docs/source/zh/conceptual/evaluation.md +++ b/docs/source/zh/conceptual/evaluation.md @@ -92,11 +92,8 @@ images = sd_pipeline(sample_prompts, num_images_per_prompt=1, generator=generato 当使用多个待评估模型为所有提示词生成若干图像后,这些结果将提交给人类评估员进行打分。有关DrawBench和PartiPrompts基准测试的更多细节,请参阅各自的论文。 - - -在模型训练过程中查看推理样本有助于评估训练进度。我们的[训练脚本](https://github.com/huggingface/diffusers/tree/main/examples/)支持此功能,并额外提供TensorBoard和Weights & Biases日志记录功能。 - - +> [!TIP] +> 在模型训练过程中查看推理样本有助于评估训练进度。我们的[训练脚本](https://github.com/huggingface/diffusers/tree/main/examples/)支持此功能,并额外提供TensorBoard和Weights & Biases日志记录功能。 ## 定量评估 @@ -189,11 +186,8 @@ print(f"v-1-5版本的CLIP分数: {sd_clip_score_1_5}") 结果表明[v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5)检查点性能优于前代。但需注意,我们用于计算CLIP分数的提示词数量较少。实际评估时应使用更多样化且数量更大的提示词集。 - - -该分数存在固有局限性:训练数据中的标题是从网络爬取,并提取自图片关联的`alt`等标签。这些描述未必符合人类描述图像的方式,因此我们需要人工"设计"部分提示词。 - - +> [!WARNING] +> 该分数存在固有局限性:训练数据中的标题是从网络爬取,并提取自图片关联的`alt`等标签。这些描述未必符合人类描述图像的方式,因此我们需要人工"设计"部分提示词。 ### 图像条件式文本生成图像 @@ -402,11 +396,8 @@ print(f"CLIP方向相似度: {np.mean(scores)}") 该度量方法同样适用于类似流程,例如[`StableDiffusionPix2PixZeroPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pix2pix_zero#diffusers.StableDiffusionPix2PixZeroPipeline)。 - - -CLIP分数和CLIP方向相似度都依赖CLIP模型,可能导致评估结果存在偏差。 - - +> [!TIP] +> CLIP分数和CLIP方向相似度都依赖CLIP模型,可能导致评估结果存在偏差。 ***扩展IS、FID(后文讨论)或KID等指标存在困难***,当被评估模型是在大型图文数据集(如[LAION-5B数据集](https://laion.ai/blog/laion-5b/))上预训练时。因为这些指标的底层都使用了在ImageNet-1k数据集上预训练的InceptionNet来提取图像特征。Stable Diffusion的预训练数据集与InceptionNet的预训练数据集可能重叠有限,因此不适合作为特征提取器。 @@ -536,19 +527,16 @@ FID分数越低越好。以下因素会影响FID结果: 对于最后两点,最佳实践是使用不同的随机种子和推理步数进行多次评估,然后报告平均结果。 - - -FID结果往往具有脆弱性,因为它依赖于许多因素: - -* 计算过程中使用的特定Inception模型 -* 计算实现的准确性 -* 图像格式(PNG和JPG的起点不同) - -需要注意的是,FID通常在比较相似实验时最有用,但除非作者仔细公开FID测量代码,否则很难复现论文结果。 - -这些注意事项同样适用于其他相关指标,如KID和IS。 - - +> [!WARNING] +> FID结果往往具有脆弱性,因为它依赖于许多因素: +> +> * 计算过程中使用的特定Inception模型 +> * 计算实现的准确性 +> * 图像格式(PNG和JPG的起点不同) +> +> 需要注意的是,FID通常在比较相似实验时最有用,但除非作者仔细公开FID测量代码,否则很难复现论文结果。 +> +> 这些注意事项同样适用于其他相关指标,如KID和IS。 最后,让我们可视化检查这些`fake_images`。 diff --git a/docs/source/zh/installation.md b/docs/source/zh/installation.md index fc77ea8c48c3..9941ed24aea4 100644 --- a/docs/source/zh/installation.md +++ b/docs/source/zh/installation.md @@ -109,11 +109,8 @@ pip install -e ".[flax]" 现在,不只是在通常的库路径,Python 
还会在你克隆的文件夹内寻找包。 例如,如果你的 Python 包通常安装在 `~/anaconda3/envs/main/lib/python3.10/Site-packages/`,Python 也会搜索你克隆到的文件夹。`~/diffusers/`。 - - -如果你想继续使用这个库,你必须保留 `diffusers` 文件夹。 - - +> [!WARNING] +> 如果你想继续使用这个库,你必须保留 `diffusers` 文件夹。 现在你可以用下面的命令轻松地将你克隆的 🤗 Diffusers 库更新到最新版本。 diff --git a/docs/source/zh/optimization/coreml.md b/docs/source/zh/optimization/coreml.md index 1d788667203e..3926a5ddb029 100644 --- a/docs/source/zh/optimization/coreml.md +++ b/docs/source/zh/optimization/coreml.md @@ -13,11 +13,8 @@ http://www.apache.org/licenses/LICENSE-2.0 Core ML 模型可以利用 Apple 设备中所有可用的计算引擎:CPU、GPU 和 Apple Neural Engine(或 ANE,一种在 Apple Silicon Mac 和现代 iPhone/iPad 中可用的张量优化加速器)。根据模型及其运行的设备,Core ML 还可以混合和匹配计算引擎,例如,模型的某些部分可能在 CPU 上运行,而其他部分在 GPU 上运行。 - - -您还可以使用 PyTorch 内置的 `mps` 加速器在 Apple Silicon Mac 上运行 `diffusers` Python 代码库。这种方法在 [mps 指南](mps) 中有详细解释,但它与原生应用不兼容。 - - +> [!TIP] +> 您还可以使用 PyTorch 内置的 `mps` 加速器在 Apple Silicon Mac 上运行 `diffusers` Python 代码库。这种方法在 [mps 指南](mps) 中有详细解释,但它与原生应用不兼容。 ## Stable Diffusion Core ML 检查点 diff --git a/docs/source/zh/optimization/fp16.md b/docs/source/zh/optimization/fp16.md index 1088482d2432..e1c4c7e57ae7 100644 --- a/docs/source/zh/optimization/fp16.md +++ b/docs/source/zh/optimization/fp16.md @@ -238,11 +238,8 @@ pipeline.unet = compile_regions(pipeline.unet, mode="reduce-overhead", fullgraph 一般来说,`sigmas`应该[保持在CPU上](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240),以避免通信同步和延迟。 - - -参阅[torch.compile和Diffusers:峰值性能实践指南](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/)博客文章,了解如何为扩散模型最大化`torch.compile`的性能。 - - +> [!TIP] +> 参阅[torch.compile和Diffusers:峰值性能实践指南](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/)博客文章,了解如何为扩散模型最大化`torch.compile`的性能。 ### 基准测试 diff --git a/docs/source/zh/optimization/mps.md b/docs/source/zh/optimization/mps.md index c76a47533666..48b08c5a12df 100644 --- a/docs/source/zh/optimization/mps.md +++ b/docs/source/zh/optimization/mps.md @@ -35,11 +35,8 @@ image = pipe(prompt).images[0] image ``` - - -PyTorch [mps](https://pytorch.org/docs/stable/notes/mps.html) 后端不支持大小超过 `2**32` 的 NDArray。如果您遇到此问题,请提交 [Issue](https://github.com/huggingface/diffusers/issues/new/choose) 以便我们调查。 - - +> [!WARNING] +> PyTorch [mps](https://pytorch.org/docs/stable/notes/mps.html) 后端不支持大小超过 `2**32` 的 NDArray。如果您遇到此问题,请提交 [Issue](https://github.com/huggingface/diffusers/issues/new/choose) 以便我们调查。 如果您使用 **PyTorch 1.13**,您需要通过管道进行一次额外的"预热"传递。这是一个临时解决方法,用于解决首次推理传递产生的结果与后续传递略有不同的问题。您只需要执行此传递一次,并且在仅进行一次推理步骤后可以丢弃结果。 diff --git a/docs/source/zh/optimization/neuron.md b/docs/source/zh/optimization/neuron.md index 709404d56b51..99d807a88c0d 100644 --- a/docs/source/zh/optimization/neuron.md +++ b/docs/source/zh/optimization/neuron.md @@ -17,11 +17,8 @@ Diffusers 功能可在 [AWS Inf2 实例](https://aws.amazon.com/ec2/instance-typ python -m pip install --upgrade-strategy eager optimum[neuronx] ``` - - -我们提供预构建的 [Hugging Face Neuron 深度学习 AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2)(DLAMI)和用于 Amazon SageMaker 的 Optimum Neuron 容器。建议正确设置您的环境。 - - +> [!TIP] +> 我们提供预构建的 [Hugging Face Neuron 深度学习 AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2)(DLAMI)和用于 Amazon SageMaker 的 Optimum Neuron 容器。建议正确设置您的环境。 下面的示例演示了如何在 inf2.8xlarge 实例上使用 Stable Diffusion XL 模型生成图像(一旦模型编译完成,您可以切换到更便宜的 inf2.xlarge 实例)。要生成一些图像,请使用 [`~optimum.neuron.NeuronStableDiffusionXLPipeline`] 类,该类类似于 Diffusers 中的 
[`StableDiffusionXLPipeline`] 类。 diff --git a/docs/source/zh/optimization/onnx.md b/docs/source/zh/optimization/onnx.md index 4b3804d01500..b70510d51b75 100644 --- a/docs/source/zh/optimization/onnx.md +++ b/docs/source/zh/optimization/onnx.md @@ -31,11 +31,8 @@ image = pipeline(prompt).images[0] pipeline.save_pretrained("./onnx-stable-diffusion-v1-5") ``` - - -当前批量生成多个提示可能会占用过高内存。在问题修复前,建议采用迭代方式而非批量处理。 - - +> [!WARNING] +> 当前批量生成多个提示可能会占用过高内存。在问题修复前,建议采用迭代方式而非批量处理。 如需离线导出 ONNX 格式流水线供后续推理使用,请使用 [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 命令: diff --git a/docs/source/zh/optimization/xformers.md b/docs/source/zh/optimization/xformers.md index 9902feeee662..2a3a3d8341e0 100644 --- a/docs/source/zh/optimization/xformers.md +++ b/docs/source/zh/optimization/xformers.md @@ -17,16 +17,10 @@ http://www.apache.org/licenses/LICENSE-2.0 pip install xformers ``` - - -xFormers的`pip`安装包需要最新版本的PyTorch。如需使用旧版PyTorch,建议[从源码安装xFormers](https://github.com/facebookresearch/xformers#installing-xformers)。 - - +> [!TIP] +> xFormers的`pip`安装包需要最新版本的PyTorch。如需使用旧版PyTorch,建议[从源码安装xFormers](https://github.com/facebookresearch/xformers#installing-xformers)。 安装完成后,您可调用`enable_xformers_memory_efficient_attention()`来实现更快的推理速度和更低的内存占用,具体用法参见[此章节](memory#memory-efficient-attention)。 - - -根据[此问题](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212)反馈,xFormers `v0.0.16`版本在某些GPU上无法用于训练(微调或DreamBooth)。如遇此问题,请按照该issue评论区指引安装开发版本。 - - \ No newline at end of file +> [!WARNING] +> 根据[此问题](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212)反馈,xFormers `v0.0.16`版本在某些GPU上无法用于训练(微调或DreamBooth)。如遇此问题,请按照该issue评论区指引安装开发版本。 \ No newline at end of file diff --git a/docs/source/zh/quicktour.md b/docs/source/zh/quicktour.md index 08efaa87d29e..2b8803384f25 100644 --- a/docs/source/zh/quicktour.md +++ b/docs/source/zh/quicktour.md @@ -31,11 +31,8 @@ specific language governing permissions and limitations under the License. 
快速入门将告诉你如何使用[`DiffusionPipeline`]进行推理,然后指导你如何结合模型和调度器以复现[`DiffusionPipeline`]内部发生的事情。 - - -快速入门是🧨[Diffusers入门](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)的简化版,可以帮助你快速上手。如果你想了解更多关于🧨 Diffusers的目标、设计理念以及关于它的核心API的更多细节,可以点击🧨[Diffusers入门](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)查看。 - - +> [!TIP] +> 快速入门是🧨[Diffusers入门](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)的简化版,可以帮助你快速上手。如果你想了解更多关于🧨 Diffusers的目标、设计理念以及关于它的核心API的更多细节,可以点击🧨[Diffusers入门](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)查看。 在开始之前,确认一下你已经安装好了所需要的库: @@ -66,11 +63,10 @@ pip install --upgrade diffusers accelerate transformers 您可以在Hugging Face Hub上使用[DiffusionPipeline]的任何检查点。 在本快速入门中,您将加载stable-diffusion-v1-5检查点,用于文本到图像生成。 -。 - -对于[Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion)模型,在运行该模型之前,请先仔细阅读[许可证](https://huggingface.co/spaces/CompVis/stable-diffusion-license)。🧨 Diffusers实现了一个[`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py),以防止有攻击性的或有害的内容,但Stable Diffusion模型改进图像的生成能力仍有可能产生潜在的有害内容。 - - +> [!WARNING] +> 。 +> +> 对于[Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion)模型,在运行该模型之前,请先仔细阅读[许可证](https://huggingface.co/spaces/CompVis/stable-diffusion-license)。🧨 Diffusers实现了一个[`safety_checker`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py),以防止有攻击性的或有害的内容,但Stable Diffusion模型改进图像的生成能力仍有可能产生潜在的有害内容。 用[`~DiffusionPipeline.from_pretrained`]方法加载模型。 @@ -221,11 +217,8 @@ torch.Size([1, 3, 256, 256]) - - -🧨 Diffusers是一个用于构建扩散系统的工具箱。预定义好的扩散系统[`DiffusionPipeline`]能方便你快速试用,你也可以单独选择自己的模型和调度器组件来建立一个自定义的扩散系统。 - - +> [!TIP] +> 🧨 Diffusers是一个用于构建扩散系统的工具箱。预定义好的扩散系统[`DiffusionPipeline`]能方便你快速试用,你也可以单独选择自己的模型和调度器组件来建立一个自定义的扩散系统。 在快速入门教程中,你将用它的[`~diffusers.ConfigMixin.from_config`]方法实例化[`DDPMScheduler`]: @@ -249,12 +242,8 @@ DDPMScheduler { } ``` - - - -💡 注意调度器是如何从配置中实例化的。与模型不同,调度器没有可训练的权重,而且是无参数的。 - - +> [!TIP] +> 💡 注意调度器是如何从配置中实例化的。与模型不同,调度器没有可训练的权重,而且是无参数的。 * `num_train_timesteps`:去噪过程的长度,或者换句话说,将随机高斯噪声处理成数据样本所需的时间步数。 * `beta_schedule`:用于推理和训练的噪声表。 diff --git a/docs/source/zh/stable_diffusion.md b/docs/source/zh/stable_diffusion.md index bf9288c5b7f7..d337fb41a0ad 100644 --- a/docs/source/zh/stable_diffusion.md +++ b/docs/source/zh/stable_diffusion.md @@ -1,264 +1,258 @@ - - -# 有效且高效的扩散 - -[[open-in-colab]] - -让 [`DiffusionPipeline`] 生成特定风格或包含你所想要的内容的图像可能会有些棘手。 通常情况下,你需要多次运行 [`DiffusionPipeline`] 才能得到满意的图像。但是从无到有生成图像是一个计算密集的过程,特别是如果你要一遍又一遍地进行推理运算。 - -这就是为什么从pipeline中获得最高的 *computational* (speed) 和 *memory* (GPU RAM) 非常重要 ,以减少推理周期之间的时间,从而使迭代速度更快。 - - -本教程将指导您如何通过 [`DiffusionPipeline`] 更快、更好地生成图像。 - - -首先,加载 [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) 模型: - -```python -from diffusers import DiffusionPipeline - -model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" -pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True) -``` - -本教程将使用的提示词是 [`portrait photo of a old warrior chief`] ,但是你可以随心所欲的想象和构造自己的提示词: - -```python -prompt = "portrait photo of a old warrior chief" -``` - -## 速度 - - - -💡 如果你没有 GPU, 你可以从像 [Colab](https://colab.research.google.com/) 这样的 GPU 提供商获取免费的 GPU ! 
- - - -加速推理的最简单方法之一是将 pipeline 放在 GPU 上 ,就像使用任何 PyTorch 模块一样: - -```python -pipeline = pipeline.to("cuda") -``` - -为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reusing_seeds): - -```python -import torch - -generator = torch.Generator("cuda").manual_seed(0) -``` - -现在,你可以生成一个图像: - -```python -image = pipeline(prompt, generator=generator).images[0] -image -``` - -
- -
- -在 T4 GPU 上,这个过程大概要30秒(如果你的 GPU 比 T4 好,可能会更快)。在默认情况下,[`DiffusionPipeline`] 使用完整的 `float32` 精度进行 50 步推理。你可以通过降低精度(如 `float16` )或者减少推理步数来加速整个过程 - - -让我们把模型的精度降低至 `float16` ,然后生成一张图像: - -```python -import torch - -pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True) -pipeline = pipeline.to("cuda") -generator = torch.Generator("cuda").manual_seed(0) -image = pipeline(prompt, generator=generator).images[0] -image -``` - -
- -
- -这一次,生成图像只花了约 11 秒,比之前快了近 3 倍! - - - -💡 我们强烈建议把 pipeline 精度降低至 `float16` , 到目前为止, 我们很少看到输出质量有任何下降。 - - - -另一个选择是减少推理步数。 你可以选择一个更高效的调度器 (*scheduler*) 可以减少推理步数同时保证输出质量。您可以在 [DiffusionPipeline] 中通过调用compatibles方法找到与当前模型兼容的调度器 (*scheduler*)。 - -```python -pipeline.scheduler.compatibles -[ - diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler, - diffusers.schedulers.scheduling_unipc_multistep.UniPCMultistepScheduler, - diffusers.schedulers.scheduling_k_dpm_2_discrete.KDPM2DiscreteScheduler, - diffusers.schedulers.scheduling_deis_multistep.DEISMultistepScheduler, - diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler, - diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler, - diffusers.schedulers.scheduling_ddpm.DDPMScheduler, - diffusers.schedulers.scheduling_dpmsolver_singlestep.DPMSolverSinglestepScheduler, - diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete.KDPM2AncestralDiscreteScheduler, - diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler, - diffusers.schedulers.scheduling_pndm.PNDMScheduler, - diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler, - diffusers.schedulers.scheduling_ddim.DDIMScheduler, -] -``` - -Stable Diffusion 模型默认使用的是 [`PNDMScheduler`] ,通常要大概50步推理, 但是像 [`DPMSolverMultistepScheduler`] 这样更高效的调度器只要大概 20 或 25 步推理. 使用 [`ConfigMixin.from_config`] 方法加载新的调度器: - -```python -from diffusers import DPMSolverMultistepScheduler - -pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) -``` - -现在将 `num_inference_steps` 设置为 20: - -```python -generator = torch.Generator("cuda").manual_seed(0) -image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0] -image -``` - -
- -
- -太棒了!你成功把推理时间缩短到 4 秒!⚡️ - -## 内存 - -改善 pipeline 性能的另一个关键是减少内存的使用量,这间接意味着速度更快,因为你经常试图最大化每秒生成的图像数量。要想知道你一次可以生成多少张图片,最简单的方法是尝试不同的batch size,直到出现`OutOfMemoryError` (OOM)。 - -创建一个函数,为每一批要生成的图像分配提示词和 `Generators` 。请务必为每个`Generator` 分配一个种子,以便于复现良好的结果。 - - -```python -def get_inputs(batch_size=1): - generator = [torch.Generator("cuda").manual_seed(i) for i in range(batch_size)] - prompts = batch_size * [prompt] - num_inference_steps = 20 - - return {"prompt": prompts, "generator": generator, "num_inference_steps": num_inference_steps} -``` - -设置 `batch_size=4` ,然后看一看我们消耗了多少内存: - -```python -from diffusers.utils import make_image_grid - -images = pipeline(**get_inputs(batch_size=4)).images -make_image_grid(images, 2, 2) -``` - -除非你有一个更大内存的GPU, 否则上述代码会返回 `OOM` 错误! 大部分内存被 cross-attention 层使用。按顺序运行可以节省大量内存,而不是在批处理中进行。你可以为 pipeline 配置 [`~DiffusionPipeline.enable_attention_slicing`] 函数: - -```python -pipeline.enable_attention_slicing() -``` - -现在尝试把 `batch_size` 增加到 8! - -```python -images = pipeline(**get_inputs(batch_size=8)).images -make_image_grid(images, rows=2, cols=4) -``` - -
- -
- -以前你不能一批生成 4 张图片,而现在你可以在一张图片里面生成八张图片而只需要大概3.5秒!这可能是 T4 GPU 在不牺牲质量的情况运行速度最快的一种方法。 - -## 质量 - -在最后两节中, 你要学习如何通过 `fp16` 来优化 pipeline 的速度, 通过使用性能更高的调度器来减少推理步数, 使用注意力切片(*enabling attention slicing*)方法来节省内存。现在,你将关注的是如何提高图像的质量。 - -### 更好的 checkpoints - -有个显而易见的方法是使用更好的 checkpoints。 Stable Diffusion 模型是一个很好的起点, 自正式发布以来,还发布了几个改进版本。然而, 使用更新的版本并不意味着你会得到更好的结果。你仍然需要尝试不同的 checkpoints ,并做一些研究 (例如使用 [negative prompts](https://minimaxir.com/2022/11/stable-diffusion-negative-prompt/)) 来获得更好的结果。 - -随着该领域的发展, 有越来越多经过微调的高质量的 checkpoints 用来生成不一样的风格. 在 [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) 和 [Diffusers Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) 寻找你感兴趣的一种! - -### 更好的 pipeline 组件 - -也可以尝试用新版本替换当前 pipeline 组件。让我们加载最新的 [autodecoder](https://huggingface.co/stabilityai/stable-diffusion-2-1/tree/main/vae) 从 Stability AI 加载到 pipeline, 并生成一些图像: - -```python -from diffusers import AutoencoderKL - -vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda") -pipeline.vae = vae -images = pipeline(**get_inputs(batch_size=8)).images -make_image_grid(images, rows=2, cols=4) -``` - -
- -
- -### 更好的提示词工程 - -用于生成图像的文本非常重要, 因此被称为 *提示词工程*。 在设计提示词工程应注意如下事项: - -- 我想生成的图像或类似图像如何存储在互联网上? -- 我可以提供哪些额外的细节来引导模型朝着我想要的风格生成? - -考虑到这一点,让我们改进提示词,以包含颜色和更高质量的细节: - -```python -prompt += ", tribal panther make up, blue on red, side profile, looking away, serious eyes" -prompt += " 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta" -``` - -使用新的提示词生成一批图像: - -```python -images = pipeline(**get_inputs(batch_size=8)).images -make_image_grid(images, rows=2, cols=4) -``` - -
- -
- -非常的令人印象深刻! Let's tweak the second image - 把 `Generator` 的种子设置为 `1` - 添加一些关于年龄的主题文本: - -```python -prompts = [ - "portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", - "portrait photo of a old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", - "portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", - "portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", -] - -generator = [torch.Generator("cuda").manual_seed(1) for _ in range(len(prompts))] -images = pipeline(prompt=prompts, generator=generator, num_inference_steps=25).images -make_image_grid(images, 2, 2) -``` - -
- -
- -## 最后 - -在本教程中, 您学习了如何优化[`DiffusionPipeline`]以提高计算和内存效率,以及提高生成输出的质量. 如果你有兴趣让你的 pipeline 更快, 可以看一看以下资源: - -- 学习 [PyTorch 2.0](./optimization/torch2.0) 和 [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) 可以让推理速度提高 5 - 300% . 在 A100 GPU 上, 推理速度可以提高 50% ! -- 如果你没法用 PyTorch 2, 我们建议你安装 [xFormers](./optimization/xformers)。它的内存高效注意力机制(*memory-efficient attention mechanism*)与PyTorch 1.13.1配合使用,速度更快,内存消耗更少。 -- 其他的优化技术, 如:模型卸载(*model offloading*), 包含在 [这份指南](./optimization/fp16). + + +# 有效且高效的扩散 + +[[open-in-colab]] + +让 [`DiffusionPipeline`] 生成特定风格或包含你所想要的内容的图像可能会有些棘手。 通常情况下,你需要多次运行 [`DiffusionPipeline`] 才能得到满意的图像。但是从无到有生成图像是一个计算密集的过程,特别是如果你要一遍又一遍地进行推理运算。 + +这就是为什么从pipeline中获得最高的 *computational* (speed) 和 *memory* (GPU RAM) 非常重要 ,以减少推理周期之间的时间,从而使迭代速度更快。 + + +本教程将指导您如何通过 [`DiffusionPipeline`] 更快、更好地生成图像。 + + +首先,加载 [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) 模型: + +```python +from diffusers import DiffusionPipeline + +model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" +pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True) +``` + +本教程将使用的提示词是 [`portrait photo of a old warrior chief`] ,但是你可以随心所欲的想象和构造自己的提示词: + +```python +prompt = "portrait photo of a old warrior chief" +``` + +## 速度 + +> [!TIP] +> 💡 如果你没有 GPU, 你可以从像 [Colab](https://colab.research.google.com/) 这样的 GPU 提供商获取免费的 GPU ! + +加速推理的最简单方法之一是将 pipeline 放在 GPU 上 ,就像使用任何 PyTorch 模块一样: + +```python +pipeline = pipeline.to("cuda") +``` + +为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reusing_seeds): + +```python +import torch + +generator = torch.Generator("cuda").manual_seed(0) +``` + +现在,你可以生成一个图像: + +```python +image = pipeline(prompt, generator=generator).images[0] +image +``` + +
+ +
+ +在 T4 GPU 上,这个过程大概要30秒(如果你的 GPU 比 T4 好,可能会更快)。在默认情况下,[`DiffusionPipeline`] 使用完整的 `float32` 精度进行 50 步推理。你可以通过降低精度(如 `float16` )或者减少推理步数来加速整个过程 + + +让我们把模型的精度降低至 `float16` ,然后生成一张图像: + +```python +import torch + +pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True) +pipeline = pipeline.to("cuda") +generator = torch.Generator("cuda").manual_seed(0) +image = pipeline(prompt, generator=generator).images[0] +image +``` + +
+ +
+ +这一次,生成图像只花了约 11 秒,比之前快了近 3 倍! + +> [!TIP] +> 💡 我们强烈建议把 pipeline 精度降低至 `float16` , 到目前为止, 我们很少看到输出质量有任何下降。 + +另一个选择是减少推理步数。 你可以选择一个更高效的调度器 (*scheduler*) 可以减少推理步数同时保证输出质量。您可以在 [DiffusionPipeline] 中通过调用compatibles方法找到与当前模型兼容的调度器 (*scheduler*)。 + +```python +pipeline.scheduler.compatibles +[ + diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler, + diffusers.schedulers.scheduling_unipc_multistep.UniPCMultistepScheduler, + diffusers.schedulers.scheduling_k_dpm_2_discrete.KDPM2DiscreteScheduler, + diffusers.schedulers.scheduling_deis_multistep.DEISMultistepScheduler, + diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler, + diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler, + diffusers.schedulers.scheduling_ddpm.DDPMScheduler, + diffusers.schedulers.scheduling_dpmsolver_singlestep.DPMSolverSinglestepScheduler, + diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete.KDPM2AncestralDiscreteScheduler, + diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler, + diffusers.schedulers.scheduling_pndm.PNDMScheduler, + diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler, + diffusers.schedulers.scheduling_ddim.DDIMScheduler, +] +``` + +Stable Diffusion 模型默认使用的是 [`PNDMScheduler`] ,通常要大概50步推理, 但是像 [`DPMSolverMultistepScheduler`] 这样更高效的调度器只要大概 20 或 25 步推理. 使用 [`ConfigMixin.from_config`] 方法加载新的调度器: + +```python +from diffusers import DPMSolverMultistepScheduler + +pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) +``` + +现在将 `num_inference_steps` 设置为 20: + +```python +generator = torch.Generator("cuda").manual_seed(0) +image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0] +image +``` + +
+ +
+ +太棒了!你成功把推理时间缩短到 4 秒!⚡️ + +## 内存 + +改善 pipeline 性能的另一个关键是减少内存的使用量,这间接意味着速度更快,因为你经常试图最大化每秒生成的图像数量。要想知道你一次可以生成多少张图片,最简单的方法是尝试不同的batch size,直到出现`OutOfMemoryError` (OOM)。 + +创建一个函数,为每一批要生成的图像分配提示词和 `Generators` 。请务必为每个`Generator` 分配一个种子,以便于复现良好的结果。 + + +```python +def get_inputs(batch_size=1): + generator = [torch.Generator("cuda").manual_seed(i) for i in range(batch_size)] + prompts = batch_size * [prompt] + num_inference_steps = 20 + + return {"prompt": prompts, "generator": generator, "num_inference_steps": num_inference_steps} +``` + +设置 `batch_size=4` ,然后看一看我们消耗了多少内存: + +```python +from diffusers.utils import make_image_grid + +images = pipeline(**get_inputs(batch_size=4)).images +make_image_grid(images, 2, 2) +``` + +除非你有一个更大内存的GPU, 否则上述代码会返回 `OOM` 错误! 大部分内存被 cross-attention 层使用。按顺序运行可以节省大量内存,而不是在批处理中进行。你可以为 pipeline 配置 [`~DiffusionPipeline.enable_attention_slicing`] 函数: + +```python +pipeline.enable_attention_slicing() +``` + +现在尝试把 `batch_size` 增加到 8! + +```python +images = pipeline(**get_inputs(batch_size=8)).images +make_image_grid(images, rows=2, cols=4) +``` + +
+ +
+ +以前你不能一批生成 4 张图片,而现在你可以在一张图片里面生成八张图片而只需要大概3.5秒!这可能是 T4 GPU 在不牺牲质量的情况运行速度最快的一种方法。 + +## 质量 + +在最后两节中, 你要学习如何通过 `fp16` 来优化 pipeline 的速度, 通过使用性能更高的调度器来减少推理步数, 使用注意力切片(*enabling attention slicing*)方法来节省内存。现在,你将关注的是如何提高图像的质量。 + +### 更好的 checkpoints + +有个显而易见的方法是使用更好的 checkpoints。 Stable Diffusion 模型是一个很好的起点, 自正式发布以来,还发布了几个改进版本。然而, 使用更新的版本并不意味着你会得到更好的结果。你仍然需要尝试不同的 checkpoints ,并做一些研究 (例如使用 [negative prompts](https://minimaxir.com/2022/11/stable-diffusion-negative-prompt/)) 来获得更好的结果。 + +随着该领域的发展, 有越来越多经过微调的高质量的 checkpoints 用来生成不一样的风格. 在 [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) 和 [Diffusers Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) 寻找你感兴趣的一种! + +### 更好的 pipeline 组件 + +也可以尝试用新版本替换当前 pipeline 组件。让我们加载最新的 [autodecoder](https://huggingface.co/stabilityai/stable-diffusion-2-1/tree/main/vae) 从 Stability AI 加载到 pipeline, 并生成一些图像: + +```python +from diffusers import AutoencoderKL + +vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda") +pipeline.vae = vae +images = pipeline(**get_inputs(batch_size=8)).images +make_image_grid(images, rows=2, cols=4) +``` + +
+ +
+ +### 更好的提示词工程 + +用于生成图像的文本非常重要, 因此被称为 *提示词工程*。 在设计提示词工程应注意如下事项: + +- 我想生成的图像或类似图像如何存储在互联网上? +- 我可以提供哪些额外的细节来引导模型朝着我想要的风格生成? + +考虑到这一点,让我们改进提示词,以包含颜色和更高质量的细节: + +```python +prompt += ", tribal panther make up, blue on red, side profile, looking away, serious eyes" +prompt += " 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta" +``` + +使用新的提示词生成一批图像: + +```python +images = pipeline(**get_inputs(batch_size=8)).images +make_image_grid(images, rows=2, cols=4) +``` + +
+ +
+ +非常的令人印象深刻! Let's tweak the second image - 把 `Generator` 的种子设置为 `1` - 添加一些关于年龄的主题文本: + +```python +prompts = [ + "portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", + "portrait photo of a old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", + "portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", + "portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta", +] + +generator = [torch.Generator("cuda").manual_seed(1) for _ in range(len(prompts))] +images = pipeline(prompt=prompts, generator=generator, num_inference_steps=25).images +make_image_grid(images, 2, 2) +``` + +
+ +
+ +## 最后 + +在本教程中, 您学习了如何优化[`DiffusionPipeline`]以提高计算和内存效率,以及提高生成输出的质量. 如果你有兴趣让你的 pipeline 更快, 可以看一看以下资源: + +- 学习 [PyTorch 2.0](./optimization/torch2.0) 和 [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) 可以让推理速度提高 5 - 300% . 在 A100 GPU 上, 推理速度可以提高 50% ! +- 如果你没法用 PyTorch 2, 我们建议你安装 [xFormers](./optimization/xformers)。它的内存高效注意力机制(*memory-efficient attention mechanism*)与PyTorch 1.13.1配合使用,速度更快,内存消耗更少。 +- 其他的优化技术, 如:模型卸载(*model offloading*), 包含在 [这份指南](./optimization/fp16). diff --git a/docs/source/zh/training/controlnet.md b/docs/source/zh/training/controlnet.md index e943177cedaf..84bc3263a842 100644 --- a/docs/source/zh/training/controlnet.md +++ b/docs/source/zh/training/controlnet.md @@ -68,11 +68,8 @@ pip install -r requirements_flax.txt
- - -🤗 Accelerate 是一个支持多GPU/TPU训练和混合精度的库,它能根据硬件环境自动配置训练方案。参阅 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 了解更多。 - - +> [!TIP] +> 🤗 Accelerate 是一个支持多GPU/TPU训练和混合精度的库,它能根据硬件环境自动配置训练方案。参阅 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 了解更多。 初始化🤗 Accelerate环境: @@ -96,11 +93,8 @@ write_basic_config() 最后,如需训练自定义数据集,请参阅 [创建训练数据集](create_dataset) 指南了解数据准备方法。 - - -下文重点解析脚本中的关键模块,但不会覆盖所有实现细节。如需深入了解,建议直接阅读 [脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py),如有疑问欢迎反馈。 - - +> [!TIP] +> 下文重点解析脚本中的关键模块,但不会覆盖所有实现细节。如需深入了解,建议直接阅读 [脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py),如有疑问欢迎反馈。 ## 脚本参数 @@ -135,11 +129,8 @@ accelerate launch train_controlnet.py \ 脚本中的 [`make_train_dataset`](https://github.com/huggingface/diffusers/blob/64603389da01082055a901f2883c4810d1144edb/examples/controlnet/train_controlnet.py#L582) 函数负责数据预处理,除常规的文本标注分词和图像变换外,还包含条件图像的特效处理: - - -在TPU上流式加载数据集时,🤗 Datasets库可能成为性能瓶颈(因其未针对图像数据优化)。建议考虑 [WebDataset](https://webdataset.github.io/webdataset/)、[TorchData](https://github.com/pytorch/data) 或 [TensorFlow Datasets](https://www.tensorflow.org/datasets/tfless_tfds) 等高效数据格式。 - - +> [!TIP] +> 在TPU上流式加载数据集时,🤗 Datasets库可能成为性能瓶颈(因其未针对图像数据优化)。建议考虑 [WebDataset](https://webdataset.github.io/webdataset/)、[TorchData](https://github.com/pytorch/data) 或 [TensorFlow Datasets](https://www.tensorflow.org/datasets/tfless_tfds) 等高效数据格式。 ```py conditioning_image_transforms = transforms.Compose( @@ -304,11 +295,8 @@ tensorboard --logdir runs/fill-circle-100steps-20230411_165612/ 在 [http://localhost:6006/#profile](http://localhost:6006/#profile) 查看分析结果。 - - -若遇到插件版本冲突,建议重新安装TensorFlow和Tensorboard。注意性能分析插件仍处实验阶段,部分视图可能不完整。`trace_viewer` 会截断超过1M的事件记录,在编译步骤分析时可能导致设备轨迹丢失。 - - +> [!WARNING] +> 若遇到插件版本冲突,建议重新安装TensorFlow和Tensorboard。注意性能分析插件仍处实验阶段,部分视图可能不完整。`trace_viewer` 会截断超过1M的事件记录,在编译步骤分析时可能导致设备轨迹丢失。 ```bash python3 train_controlnet_flax.py \ diff --git a/docs/source/zh/training/distributed_inference.md b/docs/source/zh/training/distributed_inference.md index e0537735b2ba..60297371d6be 100644 --- a/docs/source/zh/training/distributed_inference.md +++ b/docs/source/zh/training/distributed_inference.md @@ -43,11 +43,8 @@ with distributed_state.split_between_processes(["a dog", "a cat"]) as prompt: accelerate launch run_distributed.py --num_processes=2 ``` - - -参考这个最小示例 [脚本](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) 以在多个 GPU 上运行推理。要了解更多信息,请查看 [使用 🤗 Accelerate 进行分布式推理](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) 指南。 - - +> [!TIP] +> 参考这个最小示例 [脚本](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) 以在多个 GPU 上运行推理。要了解更多信息,请查看 [使用 🤗 Accelerate 进行分布式推理](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) 指南。 ## PyTorch Distributed diff --git a/docs/source/zh/training/dreambooth.md b/docs/source/zh/training/dreambooth.md index 493c5385ff71..cae5e30be011 100644 --- a/docs/source/zh/training/dreambooth.md +++ b/docs/source/zh/training/dreambooth.md @@ -44,11 +44,8 @@ pip install -r requirements_flax.txt
- - -🤗 Accelerate 是一个库,用于帮助您在多个 GPU/TPU 上或使用混合精度进行训练。它会根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 以了解更多信息。 - - +> [!TIP] +> 🤗 Accelerate 是一个库,用于帮助您在多个 GPU/TPU 上或使用混合精度进行训练。它会根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 以了解更多信息。 初始化 🤗 Accelerate 环境: @@ -73,19 +70,13 @@ write_basic_config() 最后,如果您想在自己的数据集上训练模型,请查看 [创建用于训练的数据集](create_dataset) 指南,了解如何创建与 训练脚本。 - - -以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未详细涵盖脚本的每个方面。如果您有兴趣了解更多,请随时阅读[脚本](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py),并告诉我们如果您有任何问题或疑虑。 - - +> [!TIP] +> 以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未详细涵盖脚本的每个方面。如果您有兴趣了解更多,请随时阅读[脚本](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py),并告诉我们如果您有任何问题或疑虑。 ## 脚本参数 - - -DreamBooth 对训练超参数非常敏感,容易过拟合。阅读 [使用 🧨 Diffusers 训练 Stable Diffusion 与 Dreambooth](https://huggingface.co/blog/dreambooth) 博客文章,了解针对不同主题的推荐设置,以帮助您选择合适的超参数。 - - +> [!WARNING] +> DreamBooth 对训练超参数非常敏感,容易过拟合。阅读 [使用 🧨 Diffusers 训练 Stable Diffusion 与 Dreambooth](https://huggingface.co/blog/dreambooth) 博客文章,了解针对不同主题的推荐设置,以帮助您选择合适的超参数。 训练脚本提供了许多参数来自定义您的训练运行。所有参数及其描述都可以在 [`parse_args()`](https://github.com/huggingface/diffusers/blob/072e00897a7cf4302c347a63ec917b4b8add16d4/examples/dreambooth/train_dreambooth.py#L228) 函数中找到。参数设置了默认值,这些默认值应该开箱即用效果不错,但如果您愿意,也可以在训练命令中设置自己的值。 @@ -359,29 +350,26 @@ python train_dreambooth_flax.py \ 训练完成后,您可以使用新训练的模型进行推理! - - -等不及在训练完成前就尝试您的模型进行推理?🤭 请确保安装了最新版本的 🤗 Accelerate。 - -```py -from diffusers import DiffusionPipeline, UNet2DConditionModel -from transformers import CLIPTextModel -import torch - -unet = UNet2DConditionModel.from_pretrained("path/to/model/checkpoint-100/unet") - -# 如果您使用了 `--args.train_text_encoder` 进行训练,请确保也加载文本编码器 -text_encoder = CLIPTextModel.from_pretrained("path/to/model/checkpoint-100/checkpoint-100/text_encoder") - -pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet, text_encoder=text_encoder, dtype=torch.float16, -).to("cuda") - -image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0] -image.save("dog-bucket.png") -``` - - +> [!TIP] +> 等不及在训练完成前就尝试您的模型进行推理?🤭 请确保安装了最新版本的 🤗 Accelerate。 +> +> ```py +> from diffusers import DiffusionPipeline, UNet2DConditionModel +> from transformers import CLIPTextModel +> import torch +> +> unet = UNet2DConditionModel.from_pretrained("path/to/model/checkpoint-100/unet") +> +> # 如果您使用了 `--args.train_text_encoder` 进行训练,请确保也加载文本编码器 +> text_encoder = CLIPTextModel.from_pretrained("path/to/model/checkpoint-100/checkpoint-100/text_encoder") +> +> pipeline = DiffusionPipeline.from_pretrained( +> "stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet, text_encoder=text_encoder, dtype=torch.float16, +> ).to("cuda") +> +> image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0] +> image.save("dog-bucket.png") +> ``` diff --git a/docs/source/zh/training/instructpix2pix.md b/docs/source/zh/training/instructpix2pix.md index b1b616366ab7..1f9f4eb21ec3 100644 --- a/docs/source/zh/training/instructpix2pix.md +++ b/docs/source/zh/training/instructpix2pix.md @@ -31,11 +31,8 @@ cd examples/instruct_pix2pix pip install -r requirements.txt ``` - - -🤗 Accelerate 是一个库,用于帮助您在多个 GPU/TPU 上或使用混合精度进行训练。它将根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate [快速导览](https://huggingface.co/docs/accelerate/quicktour) 以了解更多信息。 - - +> [!TIP] +> 🤗 Accelerate 是一个库,用于帮助您在多个 GPU/TPU 
上或使用混合精度进行训练。它将根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate [快速导览](https://huggingface.co/docs/accelerate/quicktour) 以了解更多信息。 初始化一个 🤗 Accelerate 环境: @@ -59,11 +56,8 @@ write_basic_config() 最后,如果您想在自己的数据集上训练模型,请查看 [创建用于训练的数据集](create_dataset) 指南,了解如何创建与训练脚本兼容的数据集。 - - -以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未详细涵盖脚本的每个方面。如果您有兴趣了解更多,请随时阅读 [脚本](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py),并告诉我们如果您有任何问题或疑虑。 - - +> [!TIP] +> 以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未详细涵盖脚本的每个方面。如果您有兴趣了解更多,请随时阅读 [脚本](https://github.com/huggingface/diffusers/blob/main/examples/instruct_pix2pix/train_instruct_pix2pix.py),并告诉我们如果您有任何问题或疑虑。 ## 脚本参数 @@ -176,15 +170,12 @@ if args.conditioning_dropout_prob is not None: 将 `MODEL_NAME` 环境变量设置为模型名称(可以是 Hub 上的模型 ID 或本地模型的路径),并将 `DATASET_ID` 设置为 Hub 上数据集的名称。脚本会创建并保存所有组件(特征提取器、调度器、文本编码器、UNet 等)到您的仓库中的一个子文件夹。 - - -为了获得更好的结果,尝试使用更大的数据集进行更长时间的训练。我们只在较小规模的数据集上测试过此训练脚本。 - -
- -要使用 Weights and Biases 监控训练进度,请将 `--report_to=wandb` 参数添加到训练命令中,并使用 `--val_image_url` 指定验证图像,使用 `--validation_prompt` 指定验证提示。这对于调试模型非常有用。 - -
+> [!TIP] +> 为了获得更好的结果,尝试使用更大的数据集进行更长时间的训练。我们只在较小规模的数据集上测试过此训练脚本。 +> +>
+> +> 要使用 Weights and Biases 监控训练进度,请将 `--report_to=wandb` 参数添加到训练命令中,并使用 `--val_image_url` 指定验证图像,使用 `--validation_prompt` 指定验证提示。这对于调试模型非常有用。 如果您在多个 GPU 上训练,请将 `--multi_gpu` 参数添加到 `accelerate launch` 命令中。 diff --git a/docs/source/zh/training/kandinsky.md b/docs/source/zh/training/kandinsky.md index 8da5c0c3a0de..8ef3524ee7c4 100644 --- a/docs/source/zh/training/kandinsky.md +++ b/docs/source/zh/training/kandinsky.md @@ -9,11 +9,8 @@ http://www.apache.org/licenses/LICENSE-2.0 # Kandinsky 2.2 - - -此脚本是实验性的,容易过拟合并遇到灾难性遗忘等问题。尝试探索不同的超参数以在您的数据集上获得最佳结果。 - - +> [!WARNING] +> 此脚本是实验性的,容易过拟合并遇到灾难性遗忘等问题。尝试探索不同的超参数以在您的数据集上获得最佳结果。 Kandinsky 2.2 是一个多语言文本到图像模型,能够生成更逼真的图像。该模型包括一个图像先验模型,用于从文本提示创建图像嵌入,以及一个解码器模型,基于先验模型的嵌入生成图像。这就是为什么在 Diffusers 中您会找到两个独立的脚本用于 Kandinsky 2.2,一个用于训练先验模型,另一个用于训练解码器模型。您可以分别训练这两个模型,但为了获得最佳结果,您应该同时训练先验和解码器模型。 @@ -36,12 +33,9 @@ cd examples/kandinsky2_2/text_to_image pip install -r requirements.txt ``` - - -🤗 Accelerate 是一个帮助您在多个 GPU/TPU 上或使用混合精度进行训练的库。它会根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate 的 [快速入门](https://huggingface.co/docs/accelerate/quicktour -) 了解更多。 - - +> [!TIP] +> 🤗 Accelerate 是一个帮助您在多个 GPU/TPU 上或使用混合精度进行训练的库。它会根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate 的 [快速入门](https://huggingface.co/docs/accelerate/quicktour +> ) 了解更多。 初始化一个 🤗 Accelerate 环境: @@ -65,11 +59,8 @@ write_basic_config() 最后,如果您想在自己的数据集上训练模型,请查看 [创建用于训练的数据集](create_dataset) 指南,了解如何创建与训练脚本兼容的数据集。 - - -以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未详细涵盖脚本的每个方面。如果您有兴趣了解更多,请随时阅读脚本,并让我们知道您有任何疑问或顾虑。 - - +> [!TIP] +> 以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未详细涵盖脚本的每个方面。如果您有兴趣了解更多,请随时阅读脚本,并让我们知道您有任何疑问或顾虑。 ## 脚本参数 @@ -209,12 +200,9 @@ model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_k 如果您在多个GPU上训练,请在 `accelerate launch` 命令中添加 `--multi_gpu` 参数。 - - -要使用Weights & Biases监控训练进度,请在训练命令中添加 `--report_to=wandb` 参数。您还需要 -建议在训练命令中添加 `--validation_prompt` 以跟踪结果。这对于调试模型和查看中间结果非常有用。 - - +> [!TIP] +> 要使用Weights & Biases监控训练进度,请在训练命令中添加 `--report_to=wandb` 参数。您还需要 +> 建议在训练命令中添加 `--validation_prompt` 以跟踪结果。这对于调试模型和查看中间结果非常有用。 @@ -284,11 +272,8 @@ prompt="A robot naruto, 4k photo" image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0] ``` - - -可以随意将 `kandinsky-community/kandinsky-2-2-decoder` 替换为您自己训练的 decoder 检查点! - - +> [!TIP] +> 可以随意将 `kandinsky-community/kandinsky-2-2-decoder` 替换为您自己训练的 decoder 检查点! diff --git a/docs/source/zh/training/lora.md b/docs/source/zh/training/lora.md index a7b7abb32d00..ce29365450bd 100644 --- a/docs/source/zh/training/lora.md +++ b/docs/source/zh/training/lora.md @@ -12,19 +12,13 @@ specific language governing permissions and limitations under the License. 
# LoRA 低秩适配 - - -当前功能处于实验阶段,API可能在未来版本中变更。 - - +> [!WARNING] +> 当前功能处于实验阶段,API可能在未来版本中变更。 [LoRA(大语言模型的低秩适配)](https://hf.co/papers/2106.09685) 是一种轻量级训练技术,能显著减少可训练参数量。其原理是通过向模型注入少量新权重参数,仅训练这些新增参数。这使得LoRA训练速度更快、内存效率更高,并生成更小的模型权重文件(通常仅数百MB),便于存储和分享。LoRA还可与DreamBooth等其他训练技术结合以加速训练过程。 - - -LoRA具有高度通用性,目前已支持以下应用场景:[DreamBooth](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py)、[Kandinsky 2.2](https://github.com/huggingface/diffusers/blob/main/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py)、[Stable Diffusion XL](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora_sdxl.py)、[文生图](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)以及[Wuerstchen](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py)。 - - +> [!TIP] +> LoRA具有高度通用性,目前已支持以下应用场景:[DreamBooth](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py)、[Kandinsky 2.2](https://github.com/huggingface/diffusers/blob/main/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_decoder.py)、[Stable Diffusion XL](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora_sdxl.py)、[文生图](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)以及[Wuerstchen](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py)。 本指南将通过解析[train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)脚本,帮助您深入理解其工作原理,并掌握如何针对具体需求进行定制化修改。 @@ -57,11 +51,8 @@ pip install -r requirements_flax.txt - - -🤗 Accelerate是一个支持多GPU/TPU训练和混合精度计算的库,它能根据硬件环境自动配置训练方案。参阅🤗 Accelerate[快速入门](https://huggingface.co/docs/accelerate/quicktour)了解更多。 - - +> [!TIP] +> 🤗 Accelerate是一个支持多GPU/TPU训练和混合精度计算的库,它能根据硬件环境自动配置训练方案。参阅🤗 Accelerate[快速入门](https://huggingface.co/docs/accelerate/quicktour)了解更多。 初始化🤗 Accelerate环境: @@ -85,11 +76,8 @@ write_basic_config() 如需训练自定义数据集,请参考[创建训练数据集指南](create_dataset)了解数据准备流程。 - - -以下章节重点解析训练脚本中与LoRA相关的核心部分,但不会涵盖所有实现细节。如需完整理解,建议直接阅读[脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py),如有疑问欢迎反馈。 - - +> [!TIP] +> 以下章节重点解析训练脚本中与LoRA相关的核心部分,但不会涵盖所有实现细节。如需完整理解,建议直接阅读[脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py),如有疑问欢迎反馈。 ## 脚本参数 @@ -177,11 +165,8 @@ optimizer = optimizer_cls( 多GPU训练请添加`--multi_gpu`参数。 - - -在11GB显存的2080 Ti显卡上完整训练约需5小时。 - - +> [!WARNING] +> 在11GB显存的2080 Ti显卡上完整训练约需5小时。 ```bash export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5" diff --git a/docs/source/zh/training/text2image.md b/docs/source/zh/training/text2image.md index 193b839e9b93..4465adbe2ad7 100644 --- a/docs/source/zh/training/text2image.md +++ b/docs/source/zh/training/text2image.md @@ -12,11 +12,8 @@ specific language governing permissions and limitations under the License. # 文生图 - - -文生图训练脚本目前处于实验阶段,容易出现过拟合和灾难性遗忘等问题。建议尝试不同超参数以获得最佳数据集适配效果。 - - +> [!WARNING] +> 文生图训练脚本目前处于实验阶段,容易出现过拟合和灾难性遗忘等问题。建议尝试不同超参数以获得最佳数据集适配效果。 Stable Diffusion 等文生图模型能够根据文本提示生成对应图像。 @@ -49,11 +46,8 @@ pip install -r requirements_flax.txt
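The LoRA guide patched above finishes by loading the trained adapter for inference. A minimal sketch, assuming the adapter was saved by the training command shown there; the local path and weight filename are placeholders:

```py
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Point this at the --output_dir used when launching train_text_to_image_lora.py.
pipeline.load_lora_weights("path/to/lora/output_dir", weight_name="pytorch_lora_weights.safetensors")

image = pipeline("A naruto with blue eyes").images[0]
```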
- - -🤗 Accelerate 是支持多GPU/TPU训练和混合精度的工具库,能根据硬件环境自动配置训练参数。参阅 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 了解更多。 - - +> [!TIP] +> 🤗 Accelerate 是支持多GPU/TPU训练和混合精度的工具库,能根据硬件环境自动配置训练参数。参阅 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 了解更多。 初始化 🤗 Accelerate 环境: @@ -79,11 +73,8 @@ write_basic_config() ## 脚本参数 - - -以下重点介绍脚本中影响训练效果的关键参数,如需完整参数说明可查阅 [脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)。如有疑问欢迎反馈。 - - +> [!TIP] +> 以下重点介绍脚本中影响训练效果的关键参数,如需完整参数说明可查阅 [脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)。如有疑问欢迎反馈。 训练脚本提供丰富参数供自定义训练流程,所有参数及说明详见 [`parse_args()`](https://github.com/huggingface/diffusers/blob/8959c5b9dec1c94d6ba482c94a58d2215c5fd026/examples/text_to_image/train_text_to_image.py#L193) 函数。该函数为每个参数提供默认值(如批次大小、学习率等),也可通过命令行参数覆盖。 @@ -160,11 +151,8 @@ def preprocess_train(examples): 以 [火影忍者BLIP标注数据集](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 为例训练生成火影角色。设置环境变量 `MODEL_NAME` 和 `dataset_name` 指定模型和数据集(Hub或本地路径)。多GPU训练需在 `accelerate launch` 命令中添加 `--multi_gpu` 参数。 - - -使用本地数据集时,设置 `TRAIN_DIR` 和 `OUTPUT_DIR` 环境变量为数据集路径和模型保存路径。 - - +> [!TIP] +> 使用本地数据集时,设置 `TRAIN_DIR` 和 `OUTPUT_DIR` 环境变量为数据集路径和模型保存路径。 ```bash export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5" @@ -194,11 +182,8 @@ Flax训练方案在TPU/GPU上效率更高(由 [@duongna211](https://github.com 设置环境变量 `MODEL_NAME` 和 `dataset_name` 指定模型和数据集(Hub或本地路径)。 - - -使用本地数据集时,设置 `TRAIN_DIR` 和 `OUTPUT_DIR` 环境变量为数据集路径和模型保存路径。 - - +> [!TIP] +> 使用本地数据集时,设置 `TRAIN_DIR` 和 `OUTPUT_DIR` 环境变量为数据集路径和模型保存路径。 ```bash export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5" diff --git a/docs/source/zh/training/text_inversion.md b/docs/source/zh/training/text_inversion.md index 2945699c6141..eda9f911441b 100644 --- a/docs/source/zh/training/text_inversion.md +++ b/docs/source/zh/training/text_inversion.md @@ -45,11 +45,8 @@ pip install -r requirements_flax.txt
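The text-to-image guide above saves the fully fine-tuned pipeline to `--output_dir`, so, unlike the LoRA sketch earlier, the whole pipeline is reloaded from that folder. A minimal sketch with a placeholder path:

```py
import torch
from diffusers import DiffusionPipeline

# Replace "path/to/saved_model" with the --output_dir from the training command above.
pipeline = DiffusionPipeline.from_pretrained("path/to/saved_model", torch_dtype=torch.float16).to("cuda")

image = pipeline(prompt="yoda naruto").images[0]
image.save("yoda-naruto.png")
```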
- - -🤗 Accelerate 是一个帮助您在多GPU/TPU或混合精度环境下训练的工具库。它会根据硬件和环境自动配置训练设置。查看🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour)了解更多。 - - +> [!TIP] +> 🤗 Accelerate 是一个帮助您在多GPU/TPU或混合精度环境下训练的工具库。它会根据硬件和环境自动配置训练设置。查看🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour)了解更多。 初始化🤗 Accelerate环境: @@ -73,11 +70,8 @@ write_basic_config() 最后,如果想在自定义数据集上训练模型,请参阅[创建训练数据集](create_dataset)指南,了解如何创建适用于训练脚本的数据集。 - - -以下部分重点介绍训练脚本中需要理解的关键修改点,但未涵盖脚本所有细节。如需深入了解,可随时查阅[脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py),如有疑问欢迎反馈。 - - +> [!TIP] +> 以下部分重点介绍训练脚本中需要理解的关键修改点,但未涵盖脚本所有细节。如需深入了解,可随时查阅[脚本源码](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py),如有疑问欢迎反馈。 ## 脚本参数 @@ -173,11 +167,8 @@ snapshot_download( - `token_identifier.txt`:特殊占位符词汇 - `type_of_concept.txt`:训练概念类型("object"或"style") - - -在单块V100 GPU上完整训练约需1小时。 - - +> [!WARNING] +> 在单块V100 GPU上完整训练约需1小时。 启动脚本前还有最后一步。如果想实时观察训练过程,可以定期保存生成图像。在训练命令中添加以下参数: diff --git a/docs/source/zh/training/wuerstchen.md b/docs/source/zh/training/wuerstchen.md index 8a6abe662439..c80cc944a3d8 100644 --- a/docs/source/zh/training/wuerstchen.md +++ b/docs/source/zh/training/wuerstchen.md @@ -33,11 +33,8 @@ cd examples/wuerstchen/text_to_image pip install -r requirements.txt ``` - - -🤗 Accelerate 是一个帮助您在多个 GPU/TPU 上或使用混合精度进行训练的库。它会根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 以了解更多信息。 - - +> [!TIP] +> 🤗 Accelerate 是一个帮助您在多个 GPU/TPU 上或使用混合精度进行训练的库。它会根据您的硬件和环境自动配置训练设置。查看 🤗 Accelerate [快速入门](https://huggingface.co/docs/accelerate/quicktour) 以了解更多信息。 初始化一个 🤗 Accelerate 环境: @@ -61,11 +58,8 @@ write_basic_config() 最后,如果您想在自己的数据集上训练模型,请查看 [创建训练数据集](create_dataset) 指南,了解如何创建与训练脚本兼容的数据集。 - - -以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未涵盖 [脚本](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_prior.py) 的详细信息。如果您有兴趣了解更多,请随时阅读脚本,并告诉我们您是否有任何问题或疑虑。 - - +> [!TIP] +> 以下部分重点介绍了训练脚本中对于理解如何修改它很重要的部分,但并未涵盖 [脚本](https://github.com/huggingface/diffusers/blob/main/examples/wuerstchen/text_to_image/train_text_to_image_prior.py) 的详细信息。如果您有兴趣了解更多,请随时阅读脚本,并告诉我们您是否有任何问题或疑虑。 ## 脚本参数 @@ -134,11 +128,8 @@ pred_noise = prior(noisy_latents, timesteps, prompt_embeds) 设置`DATASET_NAME`环境变量为Hub中的数据集名称。本指南使用[Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions)数据集,但您也可以创建和训练自己的数据集(参见[创建用于训练的数据集](create_dataset)指南)。 - - -要使用Weights & Biases监控训练进度,请在训练命令中添加`--report_to=wandb`参数。您还需要在训练命令中添加`--validation_prompt`以跟踪结果。这对于调试模型和查看中间结果非常有用。 - - +> [!TIP] +> 要使用Weights & Biases监控训练进度,请在训练命令中添加`--report_to=wandb`参数。您还需要在训练命令中添加`--validation_prompt`以跟踪结果。这对于调试模型和查看中间结果非常有用。 ```bash export DATASET_NAME="lambdalabs/naruto-blip-captions" diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py index 274851e2acf4..3871552672a6 100644 --- a/examples/community/matryoshka.py +++ b/examples/community/matryoshka.py @@ -1475,11 +1475,8 @@ class MatryoshkaFusedAttnProcessor2_0: fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is currently 🧪 experimental in nature and can change in future. - - + > [!WARNING] + > This API is currently 🧪 experimental in nature and can change in future. """ def __init__(self): @@ -2696,11 +2693,8 @@ def fuse_qkv_projections(self): Enables fused QKV projections. 
For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] + > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -2719,11 +2713,8 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] + > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/examples/community/pipeline_stable_diffusion_boxdiff.py b/examples/community/pipeline_stable_diffusion_boxdiff.py index 1133321fccef..07e29b9c05b7 100644 --- a/examples/community/pipeline_stable_diffusion_boxdiff.py +++ b/examples/community/pipeline_stable_diffusion_boxdiff.py @@ -948,11 +948,8 @@ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] + > This API is 🧪 experimental. Args: unet (`bool`, defaults to `True`): To apply fusion on the UNet. @@ -978,11 +975,8 @@ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): """Disable QKV projection fusion if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] + > This API is 🧪 experimental. Args: unet (`bool`, defaults to `True`): To apply fusion on the UNet. diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py index 6728e2a60bb2..6b62b610afa2 100644 --- a/examples/community/pipeline_stable_diffusion_pag.py +++ b/examples/community/pipeline_stable_diffusion_pag.py @@ -940,9 +940,8 @@ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): """ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - This API is 🧪 experimental. - + > [!WARNING] + > This API is 🧪 experimental. Args: unet (`bool`, defaults to `True`): To apply fusion on the UNet. vae (`bool`, defaults to `True`): To apply fusion on the VAE. @@ -966,9 +965,8 @@ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): """Disable QKV projection fusion if enabled. - - This API is 🧪 experimental. - + > [!WARNING] + > This API is 🧪 experimental. Args: unet (`bool`, defaults to `True`): To apply fusion on the UNet. vae (`bool`, defaults to `True`): To apply fusion on the VAE. diff --git a/examples/model_search/pipeline_easy.py b/examples/model_search/pipeline_easy.py index fcce297c3784..ee5dced817ec 100644 --- a/examples/model_search/pipeline_easy.py +++ b/examples/model_search/pipeline_easy.py @@ -1246,12 +1246,9 @@ def from_huggingface(cls, pretrained_model_link_or_path, **kwargs): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with - `hf auth login`. 
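# A minimal usage sketch of the fuse_qkv_projections / unfuse_qkv_projections API whose
# docstrings are rewritten in the hunks above; the checkpoint id is only illustrative.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

pipe.fuse_qkv_projections()    # fuse q/k/v (and cross-attention k/v) projection matrices
image = pipe("an astronaut riding a horse on the moon").images[0]
pipe.unfuse_qkv_projections()  # restore the original, unfused attention projections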
- - + > [!TIP] + > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + > `hf auth login`. Examples: @@ -1355,12 +1352,9 @@ def from_civitai(cls, pretrained_model_link_or_path, **kwargs): class). The overwritten components are passed directly to the pipelines `__init__` method. See example below for more information. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with - `hf auth login`. - - + > [!TIP] + > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + > `hf auth login`. Examples: @@ -1504,12 +1498,9 @@ def from_huggingface(cls, pretrained_model_link_or_path, **kwargs): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with - `hf auth login`. - - + > [!TIP] + > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + > `hf auth login`. Examples: @@ -1614,12 +1605,9 @@ def from_civitai(cls, pretrained_model_link_or_path, **kwargs): class). The overwritten components are passed directly to the pipelines `__init__` method. See example below for more information. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with - `hf auth login`. - - + > [!TIP] + > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + > `hf auth login`. Examples: @@ -1763,12 +1751,9 @@ def from_huggingface(cls, pretrained_model_link_or_path, **kwargs): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with - `hf auth login - - + > [!TIP] + > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + > `hf auth login Examples: @@ -1872,12 +1857,9 @@ def from_civitai(cls, pretrained_model_link_or_path, **kwargs): class). The overwritten components are passed directly to the pipelines `__init__` method. See example below for more information. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with - `hf auth login - - + > [!TIP] + > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + > `hf auth login Examples: diff --git a/src/diffusers/guiders/guider_utils.py b/src/diffusers/guiders/guider_utils.py index a6f2e76dc337..7524b5a3eacc 100644 --- a/src/diffusers/guiders/guider_utils.py +++ b/src/diffusers/guiders/guider_utils.py @@ -247,15 +247,11 @@ def from_pretrained( The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git. - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf - auth login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in + with `hf > auth login`. 
You can also activate the special > + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a > firewalled environment. - - """ config, kwargs, commit_hash = cls.load_config( pretrained_model_name_or_path=pretrained_model_name_or_path, diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py index 0ee32f820b56..3d75a7d875a4 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -544,11 +544,7 @@ def fuse_lora( r""" Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - This is an experimental API. - - + > [!WARNING] > This is an experimental API. Args: components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. @@ -628,11 +624,7 @@ def unfuse_lora(self, components: List[str] = [], **kwargs): Reverses the effect of [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - This is an experimental API. - - + > [!WARNING] > This is an experimental API. Args: components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index 65bdae692070..e25a29e1c00e 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -246,13 +246,8 @@ def lora_state_dict( r""" Return state dict for lora weights and the network alphas. - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - + > [!WARNING] > We support loading A1111 formatted LoRA checkpoints in a limited capacity. > > This function is + experimental and might change in the future. Parameters: pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): @@ -545,11 +540,7 @@ def fuse_lora( r""" Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - This is an experimental API. - - + > [!WARNING] > This is an experimental API. Args: components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. @@ -586,11 +577,7 @@ def unfuse_lora(self, components: List[str] = ["unet", "text_encoder"], **kwargs Reverses the effect of [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - This is an experimental API. - - + > [!WARNING] > This is an experimental API. Args: components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. @@ -698,13 +685,8 @@ def lora_state_dict( r""" Return state dict for lora weights and the network alphas. - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - + > [!WARNING] > We support loading A1111 formatted LoRA checkpoints in a limited capacity. > > This function is + experimental and might change in the future. Parameters: pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): @@ -2007,11 +1989,7 @@ def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder"], * Reverses the effect of [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - This is an experimental API. - - + > [!WARNING] > This is an experimental API. 
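# A short sketch of the fuse_lora / unfuse_lora flow documented above; the base checkpoint,
# LoRA repo id and weight filename are illustrative.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors")
pipe.fuse_lora(lora_scale=0.7)  # bake the LoRA into the base weights for faster inference
image = pipe("pixel art of a corgi astronaut").images[0]
pipe.unfuse_lora()              # undo the fusion and recover the original base weights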
Args: components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 3f59c8da8ea7..5164cf311d3c 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -111,11 +111,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ for module in self.modules(): if isinstance(module, AttentionModuleMixin): diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 990245de1742..66455d733aee 100755 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -3669,11 +3669,7 @@ class FusedAttnProcessor2_0: fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is currently 🧪 experimental in nature and can change in future. - - + > [!WARNING] > This API is currently 🧪 experimental in nature and can change in future. """ def __init__(self): diff --git a/src/diffusers/models/auto_model.py b/src/diffusers/models/auto_model.py index 47f3a992b360..a95b0ae64a8e 100644 --- a/src/diffusers/models/auto_model.py +++ b/src/diffusers/models/auto_model.py @@ -118,15 +118,11 @@ def from_pretrained(cls, pretrained_model_or_path: Optional[Union[str, os.PathLi trust_remote_cocde (`bool`, *optional*, defaults to `False`): Whether to trust remote code - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf - auth login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in + with `hf > auth login`. You can also activate the special > + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a > firewalled environment. - - Example: ```py diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 9a4375a36bdf..d823c2fb8b04 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -532,11 +532,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -556,11 +552,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py index 8d892cb3b697..0641c8bc0114 100644 --- a/src/diffusers/models/controlnets/controlnet_sd3.py +++ b/src/diffusers/models/controlnets/controlnet_sd3.py @@ -270,11 +270,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. 
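# A small sketch of the AutoModel loader whose docstring is touched above; the repo id and
# subfolder mirror the documentation example and are only illustrative.
import torch
from diffusers import AutoModel

unet = AutoModel.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)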
For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -294,11 +290,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/controlnets/controlnet_xs.py b/src/diffusers/models/controlnets/controlnet_xs.py index aabae709e988..bcb4e259867f 100644 --- a/src/diffusers/models/controlnets/controlnet_xs.py +++ b/src/diffusers/models/controlnets/controlnet_xs.py @@ -980,11 +980,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -1004,11 +1000,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 8050afff2767..fd195783212e 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -227,15 +227,9 @@ def from_pretrained( This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`. - - - This only specifies the dtype of the *computation* and does not influence the dtype of model - parameters. - - If you wish to change the dtype of the model parameters, see [`~FlaxModelMixin.to_fp16`] and - [`~FlaxModelMixin.to_bf16`]. - - + > [!TIP] > This only specifies the dtype of the *computation* and does not influence the dtype of model + > parameters. > > If you wish to change the dtype of the model parameters, see + [`~FlaxModelMixin.to_fp16`] and > [`~FlaxModelMixin.to_bf16`]. model_args (sequence of positional arguments, *optional*): All remaining positional arguments are passed to the underlying model's `__init__` method. diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index b3d74954bd26..1af7ba9ac511 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -403,12 +403,8 @@ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Call When this option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed up during training is not guaranteed. - - - ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes - precedent. - - + > [!WARNING] > ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient + attention takes > precedent. Parameters: attention_op (`Callable`, *optional*): @@ -917,15 +913,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P Whether to disable mmap when loading a Safetensors model. 
This option can perform better when the model is on a network mount or hard drive, which may not handle the seeky-ness of mmap very well. - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf - auth login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in + with `hf > auth login`. You can also activate the special > + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a > firewalled environment. - - Example: ```py diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py index 4d7d1ba40e92..bf6d9e1b3803 100644 --- a/src/diffusers/models/transformers/auraflow_transformer_2d.py +++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py @@ -431,11 +431,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -455,11 +451,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py index 50381096903c..9e0afdee6615 100644 --- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py +++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py @@ -397,11 +397,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -421,11 +417,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/transformers/hunyuan_transformer_2d.py b/src/diffusers/models/transformers/hunyuan_transformer_2d.py index f63471878857..fbe9fe8df91c 100644 --- a/src/diffusers/models/transformers/hunyuan_transformer_2d.py +++ b/src/diffusers/models/transformers/hunyuan_transformer_2d.py @@ -324,11 +324,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -348,11 +344,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. 
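# A brief sketch of the offline-style loading referenced in the from_pretrained docstring
# above: with local_files_only=True only the local cache is consulted (repo id illustrative).
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    subfolder="unet",
    local_files_only=True,  # do not call the Hub; fail if the files are not already cached
)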
""" if self.original_attn_processors is not None: diff --git a/src/diffusers/models/transformers/pixart_transformer_2d.py b/src/diffusers/models/transformers/pixart_transformer_2d.py index 40a14bfd9b27..5a22144228ae 100644 --- a/src/diffusers/models/transformers/pixart_transformer_2d.py +++ b/src/diffusers/models/transformers/pixart_transformer_2d.py @@ -258,11 +258,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -282,11 +278,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py index edf77a7df793..762d89c303d7 100644 --- a/src/diffusers/models/transformers/transformer_sd3.py +++ b/src/diffusers/models/transformers/transformer_sd3.py @@ -280,11 +280,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -304,11 +300,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 736deb28c376..33bda8cb1ead 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -872,11 +872,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -895,11 +891,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index bd67ea414ab8..b5151f3c9a1f 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -508,11 +508,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. 
""" self.original_attn_processors = None @@ -532,11 +528,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 8449bf894cc9..7148723a84d8 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -472,11 +472,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -496,11 +492,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 0a112b524911..26616e53bdfd 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -1911,11 +1911,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -1935,11 +1931,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/modular_pipelines/components_manager.py b/src/diffusers/modular_pipelines/components_manager.py index ed847fa41460..9dd8035c44e7 100644 --- a/src/diffusers/modular_pipelines/components_manager.py +++ b/src/diffusers/modular_pipelines/components_manager.py @@ -286,11 +286,7 @@ class ComponentsManager: encoders, etc.) across different modular pipelines. It includes features for duplicate detection, memory management, and component organization. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. Example: ```python diff --git a/src/diffusers/modular_pipelines/flux/modular_pipeline.py b/src/diffusers/modular_pipelines/flux/modular_pipeline.py index 7d869041f2a9..563b0333431f 100644 --- a/src/diffusers/modular_pipelines/flux/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/flux/modular_pipeline.py @@ -25,11 +25,7 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion """ A ModularPipeline for Flux. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
""" default_blocks_name = "FluxAutoBlocks" diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 206d19f17371..037c9e323c6b 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -226,11 +226,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): [`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. """ config_name = "modular_config.json" @@ -525,11 +521,7 @@ class AutoPipelineBlocks(ModularPipelineBlocks): This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. Attributes: block_classes: List of block classes to be used @@ -787,11 +779,7 @@ class SequentialPipelineBlocks(ModularPipelineBlocks): This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. Attributes: block_classes: List of block classes to be used @@ -1146,11 +1134,7 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks): This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. Attributes: block_classes: List of block classes to be used @@ -1433,11 +1417,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin): """ Base class for all Modular pipelines. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. Args: blocks: ModularPipelineBlocks, the blocks to be used in the pipeline @@ -2173,12 +2153,8 @@ def to(self, *args, **kwargs) -> Self: Performs Pipeline dtype and/or device conversion. A torch.dtype and torch.device are inferred from the arguments of `self.to(*args, **kwargs).` - - - If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. Otherwise, - the returned pipeline is a copy of self with the desired torch.dtype and torch.device. - - + > [!TIP] > If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. + Otherwise, > the returned pipeline is a copy of self with the desired torch.dtype and torch.device. 
Here are the ways to call `to`: diff --git a/src/diffusers/modular_pipelines/node_utils.py b/src/diffusers/modular_pipelines/node_utils.py new file mode 100644 index 000000000000..f7ee1dd3097b --- /dev/null +++ b/src/diffusers/modular_pipelines/node_utils.py @@ -0,0 +1,661 @@ +import json +import logging +import os +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch + +from ..configuration_utils import ConfigMixin +from ..image_processor import PipelineImageInput +from .modular_pipeline import ModularPipelineBlocks, SequentialPipelineBlocks +from .modular_pipeline_utils import InputParam + + +logger = logging.getLogger(__name__) + +# YiYi Notes: this is actually for SDXL, put it here for now +SDXL_INPUTS_SCHEMA = { + "prompt": InputParam( + "prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation" + ), + "prompt_2": InputParam( + "prompt_2", + type_hint=Union[str, List[str]], + description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2", + ), + "negative_prompt": InputParam( + "negative_prompt", + type_hint=Union[str, List[str]], + description="The prompt or prompts not to guide the image generation", + ), + "negative_prompt_2": InputParam( + "negative_prompt_2", + type_hint=Union[str, List[str]], + description="The negative prompt or prompts for text_encoder_2", + ), + "cross_attention_kwargs": InputParam( + "cross_attention_kwargs", + type_hint=Optional[dict], + description="Kwargs dictionary passed to the AttentionProcessor", + ), + "clip_skip": InputParam( + "clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder" + ), + "image": InputParam( + "image", + type_hint=PipelineImageInput, + required=True, + description="The image(s) to modify for img2img or inpainting", + ), + "mask_image": InputParam( + "mask_image", + type_hint=PipelineImageInput, + required=True, + description="Mask image for inpainting, white pixels will be repainted", + ), + "generator": InputParam( + "generator", + type_hint=Optional[Union[torch.Generator, List[torch.Generator]]], + description="Generator(s) for deterministic generation", + ), + "height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"), + "width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"), + "num_images_per_prompt": InputParam( + "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt" + ), + "num_inference_steps": InputParam( + "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps" + ), + "timesteps": InputParam( + "timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process" + ), + "sigmas": InputParam( + "sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process" + ), + "denoising_end": InputParam( + "denoising_end", + type_hint=Optional[float], + description="Fraction of denoising process to complete before termination", + ), + # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999 + "strength": InputParam( + "strength", type_hint=float, default=0.3, description="How much to transform the reference image" + ), + "denoising_start": InputParam( + "denoising_start", type_hint=Optional[float], description="Starting point of the denoising process" + ), + "latents": InputParam( + 
"latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation" + ), + "padding_mask_crop": InputParam( + "padding_mask_crop", + type_hint=Optional[Tuple[int, int]], + description="Size of margin in crop for image and mask", + ), + "original_size": InputParam( + "original_size", + type_hint=Optional[Tuple[int, int]], + description="Original size of the image for SDXL's micro-conditioning", + ), + "target_size": InputParam( + "target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning" + ), + "negative_original_size": InputParam( + "negative_original_size", + type_hint=Optional[Tuple[int, int]], + description="Negative conditioning based on image resolution", + ), + "negative_target_size": InputParam( + "negative_target_size", + type_hint=Optional[Tuple[int, int]], + description="Negative conditioning based on target resolution", + ), + "crops_coords_top_left": InputParam( + "crops_coords_top_left", + type_hint=Tuple[int, int], + default=(0, 0), + description="Top-left coordinates for SDXL's micro-conditioning", + ), + "negative_crops_coords_top_left": InputParam( + "negative_crops_coords_top_left", + type_hint=Tuple[int, int], + default=(0, 0), + description="Negative conditioning crop coordinates", + ), + "aesthetic_score": InputParam( + "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image" + ), + "negative_aesthetic_score": InputParam( + "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score" + ), + "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"), + "output_type": InputParam( + "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)" + ), + "ip_adapter_image": InputParam( + "ip_adapter_image", + type_hint=PipelineImageInput, + required=True, + description="Image(s) to be used as IP adapter", + ), + "control_image": InputParam( + "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition" + ), + "control_guidance_start": InputParam( + "control_guidance_start", + type_hint=Union[float, List[float]], + default=0.0, + description="When ControlNet starts applying", + ), + "control_guidance_end": InputParam( + "control_guidance_end", + type_hint=Union[float, List[float]], + default=1.0, + description="When ControlNet stops applying", + ), + "controlnet_conditioning_scale": InputParam( + "controlnet_conditioning_scale", + type_hint=Union[float, List[float]], + default=1.0, + description="Scale factor for ControlNet outputs", + ), + "guess_mode": InputParam( + "guess_mode", + type_hint=bool, + default=False, + description="Enables ControlNet encoder to recognize input without prompts", + ), + "control_mode": InputParam( + "control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet" + ), +} + +SDXL_INTERMEDIATE_INPUTS_SCHEMA = { + "prompt_embeds": InputParam( + "prompt_embeds", + type_hint=torch.Tensor, + required=True, + description="Text embeddings used to guide image generation", + ), + "negative_prompt_embeds": InputParam( + "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings" + ), + "pooled_prompt_embeds": InputParam( + "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings" + ), + "negative_pooled_prompt_embeds": InputParam( + 
"negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings" + ), + "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"), + "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"), + "preprocess_kwargs": InputParam( + "preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor" + ), + "latents": InputParam( + "latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process" + ), + "timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"), + "num_inference_steps": InputParam( + "num_inference_steps", type_hint=int, required=True, description="Number of denoising steps" + ), + "latent_timestep": InputParam( + "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep" + ), + "image_latents": InputParam( + "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image" + ), + "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"), + "masked_image_latents": InputParam( + "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting" + ), + "add_time_ids": InputParam( + "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning" + ), + "negative_add_time_ids": InputParam( + "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids" + ), + "timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"), + "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"), + "crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"), + "ip_adapter_embeds": InputParam( + "ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter" + ), + "negative_ip_adapter_embeds": InputParam( + "negative_ip_adapter_embeds", + type_hint=List[torch.Tensor], + description="Negative image embeddings for IP-Adapter", + ), + "images": InputParam( + "images", + type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], + required=True, + description="Generated images", + ), +} + +SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA} + + +DEFAULT_PARAM_MAPS = { + "prompt": { + "label": "Prompt", + "type": "string", + "default": "a bear sitting in a chair drinking a milkshake", + "display": "textarea", + }, + "negative_prompt": { + "label": "Negative Prompt", + "type": "string", + "default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality", + "display": "textarea", + }, + "num_inference_steps": { + "label": "Steps", + "type": "int", + "default": 25, + "min": 1, + "max": 1000, + }, + "seed": { + "label": "Seed", + "type": "int", + "default": 0, + "min": 0, + "display": "random", + }, + "width": { + "label": "Width", + "type": "int", + "display": "text", + "default": 1024, + "min": 8, + "max": 8192, + "step": 8, + "group": "dimensions", + }, + "height": { + "label": "Height", + "type": "int", + "display": "text", + "default": 1024, + "min": 8, + "max": 8192, + "step": 8, + "group": "dimensions", + }, + "images": { + "label": "Images", + "type": "image", + "display": "output", + }, + "image": { + "label": "Image", + 
"type": "image", + "display": "input", + }, +} + +DEFAULT_TYPE_MAPS = { + "int": { + "type": "int", + "default": 0, + "min": 0, + }, + "float": { + "type": "float", + "default": 0.0, + "min": 0.0, + }, + "str": { + "type": "string", + "default": "", + }, + "bool": { + "type": "boolean", + "default": False, + }, + "image": { + "type": "image", + }, +} + +DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"] +DEFAULT_CATEGORY = "Modular Diffusers" +DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"] +DEFAULT_PARAMS_GROUPS_KEYS = { + "text_encoders": ["text_encoder", "tokenizer"], + "ip_adapter_embeds": ["ip_adapter_embeds"], + "prompt_embeddings": ["prompt_embeds"], +} + + +def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS): + """ + Get the group name for a given parameter name, if not part of a group, return None e.g. "prompt_embeds" -> + "text_embeds", "text_encoder" -> "text_encoders", "prompt" -> None + """ + if name is None: + return None + for group_name, group_keys in group_params_keys.items(): + for group_key in group_keys: + if group_key in name: + return group_name + return None + + +class ModularNode(ConfigMixin): + """ + A ModularNode is a base class to build UI nodes using diffusers. Currently only supports Mellon. It is a wrapper + around a ModularPipelineBlocks object. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. + """ + + config_name = "node_config.json" + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + trust_remote_code: Optional[bool] = None, + **kwargs, + ): + blocks = ModularPipelineBlocks.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + return cls(blocks, **kwargs) + + def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs): + self.blocks = blocks + + if label is None: + label = self.blocks.__class__.__name__ + # blocks param name -> mellon param name + self.name_mapping = {} + + input_params = {} + # pass or create a default param dict for each input + # e.g. for prompt, + # prompt = { + # "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers + # "label": "Prompt", + # "type": "string", + # "default": "a bear sitting in a chair drinking a milkshake", + # "display": "textarea"} + # if type is not specified, it'll be a "custom" param of its own type + # e.g. you can pass ModularNode(scheduler = {name :"scheduler"}) + # it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}} + # name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}} + inputs = self.blocks.inputs + self.blocks.intermediate_inputs + for inp in inputs: + param = kwargs.pop(inp.name, None) + if param: + # user can pass a param dict for all inputs, e.g. 
ModularNode(prompt = {...}) + input_params[inp.name] = param + mellon_name = param.pop("name", inp.name) + if mellon_name != inp.name: + self.name_mapping[inp.name] = mellon_name + continue + + if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name): + continue + + if inp.name in DEFAULT_PARAM_MAPS: + # first check if it's in the default param map, if so, directly use that + param = DEFAULT_PARAM_MAPS[inp.name].copy() + elif get_group_name(inp.name): + param = get_group_name(inp.name) + if inp.name not in self.name_mapping: + self.name_mapping[inp.name] = param + else: + # if not, check if it's in the SDXL input schema, if so, + # 1. use the type hint to determine the type + # 2. use the default param dict for the type e.g. if "steps" is a "int" type, {"steps": {"type": "int", "default": 0, "min": 0}} + if inp.type_hint is not None: + type_str = str(inp.type_hint).lower() + else: + inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None) + type_str = str(inp_spec.type_hint).lower() if inp_spec else "" + for type_key, type_param in DEFAULT_TYPE_MAPS.items(): + if type_key in type_str: + param = type_param.copy() + param["label"] = inp.name + param["display"] = "input" + break + else: + param = inp.name + # add the param dict to the inp_params dict + input_params[inp.name] = param + + component_params = {} + for comp in self.blocks.expected_components: + param = kwargs.pop(comp.name, None) + if param: + component_params[comp.name] = param + mellon_name = param.pop("name", comp.name) + if mellon_name != comp.name: + self.name_mapping[comp.name] = mellon_name + continue + + to_exclude = False + for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS: + if exclude_key in comp.name: + to_exclude = True + break + if to_exclude: + continue + + if get_group_name(comp.name): + param = get_group_name(comp.name) + if comp.name not in self.name_mapping: + self.name_mapping[comp.name] = param + elif comp.name in DEFAULT_MODEL_KEYS: + param = {"label": comp.name, "type": "diffusers_auto_model", "display": "input"} + else: + param = comp.name + # add the param dict to the model_params dict + component_params[comp.name] = param + + output_params = {} + if isinstance(self.blocks, SequentialPipelineBlocks): + last_block_name = list(self.blocks.sub_blocks.keys())[-1] + outputs = self.blocks.sub_blocks[last_block_name].intermediate_outputs + else: + outputs = self.blocks.intermediate_outputs + + for out in outputs: + param = kwargs.pop(out.name, None) + if param: + output_params[out.name] = param + mellon_name = param.pop("name", out.name) + if mellon_name != out.name: + self.name_mapping[out.name] = mellon_name + continue + + if out.name in DEFAULT_PARAM_MAPS: + param = DEFAULT_PARAM_MAPS[out.name].copy() + param["display"] = "output" + else: + group_name = get_group_name(out.name) + if group_name: + param = group_name + if out.name not in self.name_mapping: + self.name_mapping[out.name] = param + else: + param = out.name + # add the param dict to the outputs dict + output_params[out.name] = param + + if len(kwargs) > 0: + logger.warning(f"Unused kwargs: {kwargs}") + + register_dict = { + "category": category, + "label": label, + "input_params": input_params, + "component_params": component_params, + "output_params": output_params, + "name_mapping": self.name_mapping, + } + self.register_to_config(**register_dict) + + def setup(self, components_manager, collection=None): + self.pipeline = self.blocks.init_pipeline(components_manager=components_manager, collection=collection) + 
self._components_manager = components_manager + + @property + def mellon_config(self): + return self._convert_to_mellon_config() + + def _convert_to_mellon_config(self): + node = {} + node["label"] = self.config.label + node["category"] = self.config.category + + node_param = {} + for inp_name, inp_param in self.config.input_params.items(): + if inp_name in self.name_mapping: + mellon_name = self.name_mapping[inp_name] + else: + mellon_name = inp_name + if isinstance(inp_param, str): + param = { + "label": inp_param, + "type": inp_param, + "display": "input", + } + else: + param = inp_param + + if mellon_name not in node_param: + node_param[mellon_name] = param + else: + logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}") + + for comp_name, comp_param in self.config.component_params.items(): + if comp_name in self.name_mapping: + mellon_name = self.name_mapping[comp_name] + else: + mellon_name = comp_name + if isinstance(comp_param, str): + param = { + "label": comp_param, + "type": comp_param, + "display": "input", + } + else: + param = comp_param + + if mellon_name not in node_param: + node_param[mellon_name] = param + else: + logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}") + + for out_name, out_param in self.config.output_params.items(): + if out_name in self.name_mapping: + mellon_name = self.name_mapping[out_name] + else: + mellon_name = out_name + if isinstance(out_param, str): + param = { + "label": out_param, + "type": out_param, + "display": "output", + } + else: + param = out_param + + if mellon_name not in node_param: + node_param[mellon_name] = param + else: + logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}") + node["params"] = node_param + return node + + def save_mellon_config(self, file_path): + """ + Save the Mellon configuration to a JSON file. + + Args: + file_path (str or Path): Path where the JSON file will be saved + + Returns: + Path: Path to the saved config file + """ + file_path = Path(file_path) + + # Create directory if it doesn't exist + os.makedirs(file_path.parent, exist_ok=True) + + # Create a combined dictionary with module definition and name mapping + config = {"module": self.mellon_config, "name_mapping": self.name_mapping} + + # Save the config to file + with open(file_path, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + + logger.info(f"Mellon config and name mapping saved to {file_path}") + + return file_path + + @classmethod + def load_mellon_config(cls, file_path): + """ + Load a Mellon configuration from a JSON file. 
+ + Args: + file_path (str or Path): Path to the JSON file containing Mellon config + + Returns: + dict: The loaded combined configuration containing 'module' and 'name_mapping' + """ + file_path = Path(file_path) + + if not file_path.exists(): + raise FileNotFoundError(f"Config file not found: {file_path}") + + with open(file_path, "r", encoding="utf-8") as f: + config = json.load(f) + + logger.info(f"Mellon config loaded from {file_path}") + + return config + + def process_inputs(self, **kwargs): + params_components = {} + for comp_name, comp_param in self.config.component_params.items(): + logger.debug(f"component: {comp_name}") + mellon_comp_name = self.name_mapping.get(comp_name, comp_name) + if mellon_comp_name in kwargs: + if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]: + comp = kwargs[mellon_comp_name].pop(comp_name) + else: + comp = kwargs.pop(mellon_comp_name) + if comp: + params_components[comp_name] = self._components_manager.get_one(comp["model_id"]) + + params_run = {} + for inp_name, inp_param in self.config.input_params.items(): + logger.debug(f"input: {inp_name}") + mellon_inp_name = self.name_mapping.get(inp_name, inp_name) + if mellon_inp_name in kwargs: + if isinstance(kwargs[mellon_inp_name], dict) and inp_name in kwargs[mellon_inp_name]: + inp = kwargs[mellon_inp_name].pop(inp_name) + else: + inp = kwargs.pop(mellon_inp_name) + if inp is not None: + params_run[inp_name] = inp + + return_output_names = list(self.config.output_params.keys()) + + return params_components, params_run, return_output_names + + def execute(self, **kwargs): + params_components, params_run, return_output_names = self.process_inputs(**kwargs) + + self.pipeline.update_components(**params_components) + output = self.pipeline(**params_run, output=return_output_names) + return output diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index 3248d131590f..7200169923a5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -97,11 +97,7 @@ class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ A ModularPipeline for QwenImage. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. """ default_blocks_name = "QwenImageAutoBlocks" @@ -153,11 +149,7 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ A ModularPipeline for QwenImage-Edit. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. """ default_blocks_name = "QwenImageEditAutoBlocks" diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py index 29a717f72e59..f2a4c96073ea 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py @@ -47,11 +47,7 @@ class StableDiffusionXLModularPipeline( """ A ModularPipeline for Stable Diffusion XL. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
""" default_blocks_name = "StableDiffusionXLAutoBlocks" diff --git a/src/diffusers/modular_pipelines/wan/modular_pipeline.py b/src/diffusers/modular_pipelines/wan/modular_pipeline.py index da4aada43839..e4adf3d151d6 100644 --- a/src/diffusers/modular_pipelines/wan/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/wan/modular_pipeline.py @@ -30,11 +30,7 @@ class WanModularPipeline( """ A ModularPipeline for Wan. - - - This is an experimental feature and is likely to change in the future. - - + > [!WARNING] > This is an experimental feature and is likely to change in the future. """ default_blocks_name = "WanAutoBlocks" diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index 880984eeb8a0..d265bfdcaf3d 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -407,12 +407,8 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf - auth login`. - - + > [!TIP] > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in + with `hf > auth login`. Examples: @@ -702,12 +698,8 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf - auth login`. - - + > [!TIP] > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in + with `hf > auth login`. Examples: @@ -1012,12 +1004,8 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf - auth login`. - - + > [!TIP] > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in + with `hf > auth login`. Examples: diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 41303d9c5c5a..6de8e5747b02 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -146,16 +146,13 @@ class StableDiffusionControlNetInpaintPipeline( - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters - - - This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting + > [!TIP] > This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting > ([stable-diffusion-v1-5/stable-diffusion-inpainting](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-inpainting)) - as well as default text-to-image Stable Diffusion checkpoints + > as well as default text-to-image Stable Diffusion checkpoints > ([stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5)). 
- Default text-to-image Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned on - those, such as [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint). - - + > Default text-to-image Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned + on > those, such as + [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint). Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py index 1de1d4bde7f5..d4c6f336dfef 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py @@ -394,12 +394,8 @@ def __call__( jit (`bool`, defaults to `False`): Whether to run `pmap` versions of the generation and safety scoring functions. - - - This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a - future release. - - + > [!WARNING] > This argument exists because `__call__` is not yet end-to-end pmap-able. It will be + removed in a > future release. Examples: diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index eda950998d67..397fbc0d85b8 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -1000,11 +1000,7 @@ def fuse_qkv_projections(self): Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ self.original_attn_processors = None @@ -1021,11 +1017,7 @@ def fuse_qkv_projections(self): def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. """ if self.original_attn_processors is not None: diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py index 4c02b3dd6dc7..3daaac328caa 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py @@ -150,17 +150,13 @@ class StableDiffusionControlNetPAGInpaintPipeline( - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters - - - This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting - ([runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)) as well as - default text-to-image Stable Diffusion checkpoints - ([runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)). 
Default text-to-image - Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned on those, such as + > [!TIP] > This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting > + ([runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)) as well as > + default text-to-image Stable Diffusion checkpoints > + ([runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)). Default text-to-image > + Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned on those, such as > [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint). - - Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 3e22c9a84545..61435b80ca5a 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -158,11 +158,7 @@ def prepare_mask_and_masked_image(image, mask): class PaintByExamplePipeline(DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin): _last_supported_version = "0.33.1" r""" - - - 🧪 This is an experimental feature! - - + > [!WARNING] > 🧪 This is an experimental feature! Pipeline for image-guided image inpainting using Stable Diffusion. diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index f69968022ed7..2724c764c771 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -276,12 +276,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P Can be used to overwrite load and saveable variables (the pipeline components) of the specific pipeline class. The overwritten components are passed directly to the pipelines `__init__` method. - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf - auth login`. - - + > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in + with `hf > auth login`. Examples: diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 3f6e53099b38..392d5fb3feb4 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -372,12 +372,8 @@ def to(self, *args, **kwargs) -> Self: Performs Pipeline dtype and/or device conversion. A torch.dtype and torch.device are inferred from the arguments of `self.to(*args, **kwargs).` - - - If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. Otherwise, - the returned pipeline is a copy of self with the desired torch.dtype and torch.device. - - + > [!TIP] > If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. + Otherwise, > the returned pipeline is a copy of self with the desired torch.dtype and torch.device. Here are the ways to call `to`: @@ -627,11 +623,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P `torch.float32` is used. 
custom_pipeline (`str`, *optional*): - - - 🧪 This is an experimental feature and may change in the future. - - + > [!WARNING] > 🧪 This is an experimental feature and may change in the future. Can be either: @@ -716,12 +708,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P dduf_file(`str`, *optional*): Load weights from the specified dduf file. - - - To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `hf - auth login`. - - + > [!TIP] > To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in + with `hf > auth login`. Examples: @@ -1508,11 +1496,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: - A path to a *directory* (`./my_pipeline_directory/`) containing a custom pipeline. The directory must contain a file called `pipeline.py` that defines the custom pipeline. - - - 🧪 This is an experimental feature and may change in the future. - - + > [!WARNING] > 🧪 This is an experimental feature and may change in the future. For more information on how to load and create custom pipelines, take a look at [How to contribute a community pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/contribute_pipeline). @@ -1566,12 +1550,8 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: `os.PathLike`: A path to the downloaded pipeline. - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf - auth login - - + > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in + with `hf > auth login """ cache_dir = kwargs.pop("cache_dir", None) @@ -1944,12 +1924,8 @@ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Call option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed up during training is not guaranteed. - - - ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes - precedent. - - + > [!WARNING] > ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient + attention takes > precedent. Parameters: attention_op (`Callable`, *optional*): @@ -2005,13 +1981,10 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto in slices to compute attention in several steps. For more than one attention head, the computation is performed sequentially over each head. This is useful to save some memory in exchange for a small speed decrease. - - - ⚠️ Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) from PyTorch - 2.0 or xFormers. These attention computations are already very memory efficient so you won't need to enable - this function. If you enable attention slicing with SDPA or xFormers, it can lead to serious slow downs! - - + > [!WARNING] > ⚠️ Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) + from PyTorch > 2.0 or xFormers. These attention computations are already very memory efficient so you won't + need to enable > this function. If you enable attention slicing with SDPA or xFormers, it can lead to serious + slow downs! Args: slice_size (`str` or `int`, *optional*, defaults to `"auto"`): @@ -2288,11 +2261,7 @@ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): Enables fused QKV projections. 
For self-attention modules, all projection matrices (i.e., query, key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. Args: unet (`bool`, defaults to `True`): To apply fusion on the UNet. @@ -2317,11 +2286,7 @@ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): """Disable QKV projection fusion if enabled. - - - This API is 🧪 experimental. - - + > [!WARNING] > This API is 🧪 experimental. Args: unet (`bool`, defaults to `True`): To apply fusion on the UNet. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py index 1afa7698da7c..6befe77aa4b1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py @@ -349,12 +349,8 @@ def __call__( jit (`bool`, defaults to `False`): Whether to run `pmap` versions of the generation and safety scoring functions. - - - This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a - future release. - - + > [!WARNING] > This argument exists because `__call__` is not yet end-to-end pmap-able. It will be + removed in a > future release. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py index 78e3ba239c4e..81656beba7e1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py @@ -389,12 +389,8 @@ def __call__( jit (`bool`, defaults to `False`): Whether to run `pmap` versions of the generation and safety scoring functions. - - - This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a - future release. - - + > [!WARNING] > This argument exists because `__call__` is not yet end-to-end pmap-able. It will be + removed in a > future release. Examples: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py index b7e17ba681a2..5938fe232a71 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py @@ -103,11 +103,7 @@ class FlaxStableDiffusionInpaintPipeline(FlaxDiffusionPipeline): r""" Flax-based pipeline for text-guided image inpainting using Stable Diffusion. - - - 🧪 This is an experimental feature! - - + > [!WARNING] > 🧪 This is an experimental feature! This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). @@ -435,12 +431,8 @@ def __call__( jit (`bool`, defaults to `False`): Whether to run `pmap` versions of the generation and safety scoring functions. - - - This argument exists because `__call__` is not yet end-to-end pmap-able. 
It will be removed in a - future release. - - + > [!WARNING] > This argument exists because `__call__` is not yet end-to-end pmap-able. It will be + removed in a > future release. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index 87bd9f4444ac..65c25ffbe492 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -249,11 +249,7 @@ class StableDiffusionDiffEditPipeline( StableDiffusionLoraLoaderMixin, ): r""" - - - This is an experimental feature! - - + > [!WARNING] > This is an experimental feature! Pipeline for text-guided image inpainting using Stable Diffusion and DiffEdit. diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index df2564a89b1d..feebd6adf8f8 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -81,11 +81,7 @@ class StableDiffusionKDiffusionPipeline( - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - - - This is an experimental pipeline and is likely to change in the future. - - + > [!WARNING] > This is an experimental pipeline and is likely to change in the future. Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py index 6b968e708145..9206ee80a6b6 100644 --- a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py +++ b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py @@ -53,13 +53,9 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic methods the library implements for all schedulers such as loading and saving. - - - For more details on the parameters, see [Appendix E](https://huggingface.co/papers/2206.00364). The grid search - values used to find the optimal `{s_noise, s_churn, s_min, s_max}` for a specific model are described in Table 5 of - the paper. - - + > [!TIP] > For more details on the parameters, see [Appendix E](https://huggingface.co/papers/2206.00364). The grid + search > values used to find the optimal `{s_noise, s_churn, s_min, s_max}` for a specific model are described in + Table 5 of > the paper. 
Args: sigma_min (`float`, defaults to 0.02): diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 0f5062258800..5d81d5eb8ac0 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -268,11 +268,7 @@ def get_scalings_for_boundary_condition(self, sigma): Gets the scalings used in the consistency model parameterization (from Appendix C of the [paper](https://huggingface.co/papers/2303.01469)) to enforce boundary condition. - - - `epsilon` in the equations for `c_skip` and `c_out` is set to `sigma_min`. - - + > [!TIP] > `epsilon` in the equations for `c_skip` and `c_out` is set to `sigma_min`. Args: sigma (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py index 66ed296da8ea..b9567f2c47d5 100644 --- a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py @@ -304,12 +304,8 @@ def convert_model_output( designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an integral of the data prediction model. - - - The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise - prediction and data prediction models. - - + > [!TIP] > The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both + noise > prediction and data prediction models. Args: model_output (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index d07ff8b2007b..8b523cd13f1f 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -630,12 +630,8 @@ def convert_model_output( designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an integral of the data prediction model. - - - The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise - prediction and data prediction models. - - + > [!TIP] > The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both + noise > prediction and data prediction models. Args: model_output (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 9ec958851111..f1a1ac3d8216 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -491,12 +491,8 @@ def convert_model_output( designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an integral of the data prediction model. - - - The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise - prediction and data prediction models. - - + > [!TIP] > The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both + noise > prediction and data prediction models. 
Args: model_output (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 8663210a6244..1ae824973034 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -568,12 +568,8 @@ def convert_model_output( designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an integral of the data prediction model. - - - The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise - prediction and data prediction models. - - + > [!TIP] > The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both + noise > prediction and data prediction models. Args: model_output (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index f1b38aaff56c..e9ba695e1f39 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -370,12 +370,8 @@ def convert_model_output( designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an integral of the data prediction model. - - - The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise - prediction and data prediction models. - - + > [!TIP] > The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both + noise > prediction and data prediction models. Args: model_output (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index 2df7d560ddfb..2979ce193a36 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -500,12 +500,8 @@ def convert_model_output( Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an integral of the data prediction model. - - - The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both - noise prediction and data prediction models. - - + > [!TIP] > The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction + for both > noise prediction and data prediction models. Args: model_output (`torch.Tensor`): diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py index f0e162ea6b1c..a355c7bb1a51 100644 --- a/src/diffusers/schedulers/scheduling_utils.py +++ b/src/diffusers/schedulers/scheduling_utils.py @@ -138,15 +138,11 @@ def from_pretrained( The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git. - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf - auth login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in + with `hf > auth login`. 
You can also activate the special > + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a > firewalled environment. - - """ config, kwargs, commit_hash = cls.load_config( pretrained_model_name_or_path=pretrained_model_name_or_path, diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py index ffbe3b90207b..0534e47d8a30 100644 --- a/src/diffusers/schedulers/scheduling_utils_flax.py +++ b/src/diffusers/schedulers/scheduling_utils_flax.py @@ -120,19 +120,12 @@ def from_pretrained( git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. - + > [!TIP] > It is required to be logged in (`hf auth login`) when you want to use private or [gated > + models](https://huggingface.co/docs/hub/models-gated#gated-models). - It is required to be logged in (`hf auth login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - - - - - Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to - use this method in a firewalled environment. - - + > [!TIP] > Activate the special + ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to > use this method in a + firewalled environment. """ logger.warning( diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index de947a12e200..35bef51229ba 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -290,12 +290,8 @@ def get_cached_module_file( local_files_only (`bool`, *optional*, defaults to `False`): If `True`, will only try to load the tokenizer configuration from local files. - - - You may pass a token in `token` if you are not logged in (`hf auth login`) and want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - + > [!TIP] > You may pass a token in `token` if you are not logged in (`hf auth login`) and want to use private or + [gated > models](https://huggingface.co/docs/hub/models-gated#gated-models). Returns: `str`: The path to the module inside the cache. @@ -440,12 +436,8 @@ def get_class_from_dynamic_module( """ Extracts a class from a module file, present in the local folder or repository of a model. - - - Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should - therefore only be called on trusted repos. - - + > [!WARNING] > Calling this function will execute the code in the module file found locally or downloaded from the + Hub. It should > therefore only be called on trusted repos. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -480,12 +472,8 @@ def get_class_from_dynamic_module( local_files_only (`bool`, *optional*, defaults to `False`): If `True`, will only try to load the tokenizer configuration from local files. - - - You may pass a token in `token` if you are not logged in (`hf auth login`) and want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - + > [!TIP] > You may pass a token in `token` if you are not logged in (`hf auth login`) and want to use private or + [gated > models](https://huggingface.co/docs/hub/models-gated#gated-models). Returns: `type`: The class, dynamically imported from the module. 
diff --git a/src/diffusers/utils/outputs.py b/src/diffusers/utils/outputs.py index 35691496a182..2b20f6120ce3 100644 --- a/src/diffusers/utils/outputs.py +++ b/src/diffusers/utils/outputs.py @@ -43,12 +43,8 @@ class BaseOutput(OrderedDict): tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular Python dictionary. - - - You can't unpack a [`BaseOutput`] directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple - first. - - + > [!WARNING] > You can't unpack a [`BaseOutput`] directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert + it to a tuple > first. """ def __init_subclass__(cls) -> None: From 34fa9dd4c1a77523cb58039147b6c487b0308593 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 19:05:51 +0000 Subject: [PATCH 23/69] remove dependencies to old checkpoints --- scripts/convert_mirage_to_diffusers.py | 229 +++++++++++++++++++++---- 1 file changed, 192 insertions(+), 37 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index 5e2a2ff768f4..eb6de1a37481 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -6,11 +6,12 @@ import argparse import json import os -import shutil import sys import torch from safetensors.torch import save_file +from dataclasses import dataclass, asdict +from typing import Tuple, Dict sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) @@ -18,35 +19,53 @@ from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel from diffusers.pipelines.mirage import MiragePipeline +@dataclass(frozen=True) +class MirageBase: + context_in_dim: int = 2304 + hidden_size: int = 1792 + mlp_ratio: float = 3.5 + num_heads: int = 28 + depth: int = 16 + axes_dim: Tuple[int, int] = (32, 32) + theta: int = 10_000 + time_factor: float = 1000.0 + time_max_period: int = 10_000 -def load_reference_config(vae_type: str) -> dict: - """Load transformer config from existing pipeline checkpoint.""" +@dataclass(frozen=True) +class MirageFlux(MirageBase): + in_channels: int = 16 + patch_size: int = 2 + + +@dataclass(frozen=True) +class MirageDCAE(MirageBase): + in_channels: int = 32 + patch_size: int = 1 + + +def build_config(vae_type: str) -> dict: if vae_type == "flux": - config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated/transformer/config.json" + cfg = MirageFlux() elif vae_type == "dc-ae": - config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated/transformer/config.json" + cfg = MirageDCAE() else: raise ValueError(f"Unsupported VAE type: {vae_type}. 
Use 'flux' or 'dc-ae'") - if not os.path.exists(config_path): - raise FileNotFoundError(f"Reference config not found: {config_path}") - - with open(config_path, "r") as f: - config = json.load(f) + config_dict = asdict(cfg) + config_dict["axes_dim"] = list(config_dict["axes_dim"]) # type: ignore[index] + return config_dict - print(f"✓ Loaded {vae_type} config: in_channels={config['in_channels']}") - return config -def create_parameter_mapping() -> dict: +def create_parameter_mapping(depth: int) -> dict: """Create mapping from old parameter names to new diffusers names.""" # Key mappings for structural changes mapping = {} # RMSNorm: scale -> weight - for i in range(16): # 16 layers + for i in range(depth): mapping[f"blocks.{i}.qk_norm.query_norm.scale"] = f"blocks.{i}.qk_norm.query_norm.weight" mapping[f"blocks.{i}.qk_norm.key_norm.scale"] = f"blocks.{i}.qk_norm.key_norm.weight" mapping[f"blocks.{i}.k_norm.scale"] = f"blocks.{i}.k_norm.weight" @@ -57,12 +76,12 @@ def create_parameter_mapping() -> dict: return mapping -def convert_checkpoint_parameters(old_state_dict: dict) -> dict: +def convert_checkpoint_parameters(old_state_dict: Dict[str, torch.Tensor], depth: int) -> Dict[str, torch.Tensor]: """Convert old checkpoint parameters to new diffusers format.""" print("Converting checkpoint parameters...") - mapping = create_parameter_mapping() + mapping = create_parameter_mapping(depth) converted_state_dict = {} # First, print available keys to understand structure @@ -135,7 +154,8 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi print(f"✓ Loaded checkpoint with {len(state_dict)} parameters") # Convert parameter names if needed - converted_state_dict = convert_checkpoint_parameters(state_dict) + model_depth = int(config.get("depth", 16)) + converted_state_dict = convert_checkpoint_parameters(state_dict, depth=model_depth) # Create transformer with config print("Creating MirageTransformer2DModel...") @@ -156,28 +176,164 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi return transformer -def copy_pipeline_components(vae_type: str, output_path: str): - """Copy VAE, scheduler, text encoder, and tokenizer from reference pipeline.""" + + +def create_scheduler_config(output_path: str): + """Create FlowMatchEulerDiscreteScheduler config.""" + + scheduler_config = { + "_class_name": "FlowMatchEulerDiscreteScheduler", + "num_train_timesteps": 1000, + "shift": 1.0 + } + + scheduler_path = os.path.join(output_path, "scheduler") + os.makedirs(scheduler_path, exist_ok=True) + + with open(os.path.join(scheduler_path, "scheduler_config.json"), "w") as f: + json.dump(scheduler_config, f, indent=2) + + print("✓ Created scheduler config") + + +def create_vae_config(vae_type: str, output_path: str): + """Create VAE config based on type.""" if vae_type == "flux": - ref_pipeline = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated" + vae_config = { + "_class_name": "AutoencoderKL", + "latent_channels": 16, + "block_out_channels": [128, 256, 512, 512], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D" + ], + "up_block_types": [ + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D" + ], + "scaling_factor": 0.3611, + "shift_factor": 0.1159, + "use_post_quant_conv": False, + "use_quant_conv": False + } else: # dc-ae - ref_pipeline = 
"/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated" + vae_config = { + "_class_name": "AutoencoderDC", + "latent_channels": 32, + "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], + "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], + "encoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock" + ], + "decoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock" + ], + "encoder_layers_per_block": [2, 2, 2, 3, 3, 3], + "decoder_layers_per_block": [3, 3, 3, 3, 3, 3], + "encoder_qkv_multiscales": [[], [], [], [5], [5], [5]], + "decoder_qkv_multiscales": [[], [], [], [5], [5], [5]], + "scaling_factor": 0.41407, + "upsample_block_type": "interpolate" + } + + vae_path = os.path.join(output_path, "vae") + os.makedirs(vae_path, exist_ok=True) + + with open(os.path.join(vae_path, "config.json"), "w") as f: + json.dump(vae_config, f, indent=2) + + print("✓ Created VAE config") + + +def create_text_encoder_config(output_path: str): + """Create T5GemmaEncoder config.""" + + text_encoder_config = { + "model_name": "google/t5gemma-2b-2b-ul2", + "model_max_length": 256, + "use_attn_mask": True, + "use_last_hidden_state": True + } - components = ["vae", "scheduler", "text_encoder", "tokenizer"] + text_encoder_path = os.path.join(output_path, "text_encoder") + os.makedirs(text_encoder_path, exist_ok=True) + + with open(os.path.join(text_encoder_path, "config.json"), "w") as f: + json.dump(text_encoder_config, f, indent=2) + + print("✓ Created text encoder config") + + +def create_tokenizer_config(output_path: str): + """Create GemmaTokenizerFast config and files.""" + + tokenizer_config = { + "add_bos_token": False, + "add_eos_token": False, + "added_tokens_decoder": { + "0": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "1": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "2": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "3": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "106": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "107": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True} + }, + "additional_special_tokens": ["", ""], + "bos_token": "", + "clean_up_tokenization_spaces": False, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": False, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": False + } - for component in components: - src_path = os.path.join(ref_pipeline, component) - dst_path = os.path.join(output_path, component) + special_tokens_map = { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "" + } - if os.path.exists(src_path): - if os.path.isdir(src_path): - shutil.copytree(src_path, dst_path, dirs_exist_ok=True) - else: - shutil.copy2(src_path, dst_path) - print(f"✓ Copied {component}") - else: - print(f"⚠ Component not found: {src_path}") + tokenizer_path = os.path.join(output_path, 
"tokenizer") + os.makedirs(tokenizer_path, exist_ok=True) + + with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w") as f: + json.dump(tokenizer_config, f, indent=2) + + with open(os.path.join(tokenizer_path, "special_tokens_map.json"), "w") as f: + json.dump(special_tokens_map, f, indent=2) + + print("✓ Created tokenizer config (Note: tokenizer.json and tokenizer.model files need to be provided separately)") + + +def create_pipeline_components(vae_type: str, output_path: str): + """Create all pipeline components with proper configs.""" + + create_scheduler_config(output_path) + create_vae_config(vae_type, output_path) + create_text_encoder_config(output_path) + create_tokenizer_config(output_path) def create_model_index(vae_type: str, output_path: str): @@ -211,8 +367,7 @@ def main(args): if not os.path.exists(args.checkpoint_path): raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint_path}") - # Load reference config based on VAE type - config = load_reference_config(args.vae_type) + config = build_config(args.vae_type) # Create output directory os.makedirs(args.output_path, exist_ok=True) @@ -234,8 +389,8 @@ def main(args): save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) print(f"✓ Saved transformer to {transformer_path}") - # Copy other pipeline components - copy_pipeline_components(args.vae_type, args.output_path) + # Create other pipeline components + create_pipeline_components(args.vae_type, args.output_path) # Create model index create_model_index(args.vae_type, args.output_path) From 5cc965a7570022b959bc38f8dd167e2eaed18254 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 22:30:58 +0200 Subject: [PATCH 24/69] remove old checkpoints dependency --- scripts/convert_mirage_to_diffusers.py | 170 ++---------------- .../pipelines/mirage/pipeline_mirage.py | 68 +++++-- 2 files changed, 63 insertions(+), 175 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index eb6de1a37481..2ddb708bc704 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -84,13 +84,6 @@ def convert_checkpoint_parameters(old_state_dict: Dict[str, torch.Tensor], depth mapping = create_parameter_mapping(depth) converted_state_dict = {} - # First, print available keys to understand structure - print("Available keys in checkpoint:") - for key in sorted(old_state_dict.keys())[:10]: # Show first 10 keys - print(f" {key}") - if len(old_state_dict) > 10: - print(f" ... 
and {len(old_state_dict) - 10} more") - for key, value in old_state_dict.items(): new_key = key @@ -196,172 +189,37 @@ def create_scheduler_config(output_path: str): print("✓ Created scheduler config") -def create_vae_config(vae_type: str, output_path: str): - """Create VAE config based on type.""" - - if vae_type == "flux": - vae_config = { - "_class_name": "AutoencoderKL", - "latent_channels": 16, - "block_out_channels": [128, 256, 512, 512], - "down_block_types": [ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D" - ], - "up_block_types": [ - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D" - ], - "scaling_factor": 0.3611, - "shift_factor": 0.1159, - "use_post_quant_conv": False, - "use_quant_conv": False - } - else: # dc-ae - vae_config = { - "_class_name": "AutoencoderDC", - "latent_channels": 32, - "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], - "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], - "encoder_block_types": [ - "ResBlock", - "ResBlock", - "ResBlock", - "EfficientViTBlock", - "EfficientViTBlock", - "EfficientViTBlock" - ], - "decoder_block_types": [ - "ResBlock", - "ResBlock", - "ResBlock", - "EfficientViTBlock", - "EfficientViTBlock", - "EfficientViTBlock" - ], - "encoder_layers_per_block": [2, 2, 2, 3, 3, 3], - "decoder_layers_per_block": [3, 3, 3, 3, 3, 3], - "encoder_qkv_multiscales": [[], [], [], [5], [5], [5]], - "decoder_qkv_multiscales": [[], [], [], [5], [5], [5]], - "scaling_factor": 0.41407, - "upsample_block_type": "interpolate" - } - - vae_path = os.path.join(output_path, "vae") - os.makedirs(vae_path, exist_ok=True) - - with open(os.path.join(vae_path, "config.json"), "w") as f: - json.dump(vae_config, f, indent=2) - - print("✓ Created VAE config") - - -def create_text_encoder_config(output_path: str): - """Create T5GemmaEncoder config.""" - - text_encoder_config = { - "model_name": "google/t5gemma-2b-2b-ul2", - "model_max_length": 256, - "use_attn_mask": True, - "use_last_hidden_state": True - } - - text_encoder_path = os.path.join(output_path, "text_encoder") - os.makedirs(text_encoder_path, exist_ok=True) - - with open(os.path.join(text_encoder_path, "config.json"), "w") as f: - json.dump(text_encoder_config, f, indent=2) - - print("✓ Created text encoder config") - - -def create_tokenizer_config(output_path: str): - """Create GemmaTokenizerFast config and files.""" - - tokenizer_config = { - "add_bos_token": False, - "add_eos_token": False, - "added_tokens_decoder": { - "0": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "1": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "2": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "3": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "106": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "107": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True} - }, - "additional_special_tokens": ["", ""], - "bos_token": "", - "clean_up_tokenization_spaces": False, - "eos_token": "", - "extra_special_tokens": {}, - "model_max_length": 256, - "pad_token": "", - "padding_side": "right", - "sp_model_kwargs": {}, - 
"spaces_between_special_tokens": False, - "tokenizer_class": "GemmaTokenizer", - "unk_token": "", - "use_default_system_prompt": False - } - - special_tokens_map = { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "" - } - - tokenizer_path = os.path.join(output_path, "tokenizer") - os.makedirs(tokenizer_path, exist_ok=True) - - with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w") as f: - json.dump(tokenizer_config, f, indent=2) - - with open(os.path.join(tokenizer_path, "special_tokens_map.json"), "w") as f: - json.dump(special_tokens_map, f, indent=2) - - print("✓ Created tokenizer config (Note: tokenizer.json and tokenizer.model files need to be provided separately)") - - -def create_pipeline_components(vae_type: str, output_path: str): - """Create all pipeline components with proper configs.""" - - create_scheduler_config(output_path) - create_vae_config(vae_type, output_path) - create_text_encoder_config(output_path) - create_tokenizer_config(output_path) def create_model_index(vae_type: str, output_path: str): - """Create model_index.json for the pipeline.""" + """Create model_index.json for the pipeline with HuggingFace model references.""" if vae_type == "flux": - vae_class = "AutoencoderKL" + vae_model_name = "black-forest-labs/FLUX.1-dev" + vae_subfolder = "vae" else: # dc-ae - vae_class = "AutoencoderDC" + vae_model_name = "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers" + vae_subfolder = None + + # Text encoder and tokenizer always use T5Gemma + text_model_name = "google/t5gemma-2b-2b-ul2" model_index = { "_class_name": "MiragePipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], - "text_encoder": ["transformers", "T5GemmaEncoder"], - "tokenizer": ["transformers", "GemmaTokenizerFast"], + "text_encoder": text_model_name, + "tokenizer": text_model_name, "transformer": ["diffusers", "MirageTransformer2DModel"], - "vae": ["diffusers", vae_class], + "vae": vae_model_name, + "vae_subfolder": vae_subfolder, } model_index_path = os.path.join(output_path, "model_index.json") with open(model_index_path, "w") as f: json.dump(model_index, f, indent=2) - print("✓ Created model_index.json") - - def main(args): # Validate inputs if not os.path.exists(args.checkpoint_path): @@ -389,10 +247,8 @@ def main(args): save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) print(f"✓ Saved transformer to {transformer_path}") - # Create other pipeline components - create_pipeline_components(args.vae_type, args.output_path) + create_scheduler_config(args.output_path) - # Create model index create_model_index(args.vae_type, args.output_path) # Verify the pipeline can be loaded diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index c4a4783c5f38..e6a13ff226cd 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -247,26 +247,61 @@ class MiragePipeline( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): """ - Override from_pretrained to ensure T5GemmaEncoder is available for loading. + Override from_pretrained to load VAE and text encoder from HuggingFace models. 
- This ensures that T5GemmaEncoder from transformers is accessible in the module namespace - during component loading, which is required for MiragePipeline checkpoints that use - T5GemmaEncoder as the text encoder. + The MiragePipeline checkpoints only store transformer and scheduler locally. + VAE and text encoder are loaded from external HuggingFace models as specified + in model_index.json. """ - # Ensure T5GemmaEncoder is available for loading - import transformers + import json + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel + + model_index_path = os.path.join(pretrained_model_name_or_path, "model_index.json") + if not os.path.exists(model_index_path): + raise ValueError(f"model_index.json not found in {pretrained_model_name_or_path}") + + with open(model_index_path, "r") as f: + model_index = json.load(f) + + vae_model_name = model_index.get("vae") + vae_subfolder = model_index.get("vae_subfolder") + text_model_name = model_index.get("text_encoder") + tokenizer_model_name = model_index.get("tokenizer") + + logger.info(f"Loading VAE from {vae_model_name}...") + if "FLUX" in vae_model_name or "flux" in vae_model_name: + vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder=vae_subfolder) + else: # DC-AE + vae = AutoencoderDC.from_pretrained(vae_model_name) + + logger.info(f"Loading text encoder from {text_model_name}...") + t5gemma_model = T5GemmaModel.from_pretrained(text_model_name) + text_encoder = t5gemma_model.encoder + + logger.info(f"Loading tokenizer from {tokenizer_model_name}...") + tokenizer = GemmaTokenizerFast.from_pretrained(tokenizer_model_name) + tokenizer.model_max_length = 256 + + # Load transformer and scheduler from local checkpoint + logger.info(f"Loading transformer from {pretrained_model_name_or_path}...") + transformer = MirageTransformer2DModel.from_pretrained( + pretrained_model_name_or_path, subfolder="transformer" + ) - if not hasattr(transformers, "T5GemmaEncoder"): - try: - from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder + logger.info(f"Loading scheduler from {pretrained_model_name_or_path}...") + scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( + pretrained_model_name_or_path, subfolder="scheduler" + ) - transformers.T5GemmaEncoder = T5GemmaEncoder - except ImportError: - # T5GemmaEncoder not available in this transformers version - pass + pipeline = cls( + transformer=transformer, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + vae=vae, + ) - # Proceed with standard loading - return super().from_pretrained(pretrained_model_name_or_path, **kwargs) + return pipeline def __init__( self, @@ -283,11 +318,8 @@ def __init__( "MirageTransformer2DModel is not available. Please ensure the transformer_mirage module is properly installed." 
) - # Store standard components self.text_encoder = text_encoder self.tokenizer = tokenizer - - # Initialize text preprocessor self.text_preprocessor = TextPreprocessor() self.register_modules( From d79cd8fffb959ada9bfeb4d3929b7aa2ce69f993 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 20:56:51 +0000 Subject: [PATCH 25/69] move default height and width in checkpoint config --- scripts/convert_mirage_to_diffusers.py | 9 +++++++++ .../pipelines/mirage/pipeline_mirage.py | 16 ++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index 2ddb708bc704..37de253d1448 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -19,6 +19,9 @@ from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel from diffusers.pipelines.mirage import MiragePipeline +DEFAULT_HEIGHT = 512 +DEFAULT_WIDTH = 512 + @dataclass(frozen=True) class MirageBase: context_in_dim: int = 2304 @@ -197,9 +200,13 @@ def create_model_index(vae_type: str, output_path: str): if vae_type == "flux": vae_model_name = "black-forest-labs/FLUX.1-dev" vae_subfolder = "vae" + default_height = DEFAULT_HEIGHT + default_width = DEFAULT_WIDTH else: # dc-ae vae_model_name = "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers" vae_subfolder = None + default_height = DEFAULT_HEIGHT + default_width = DEFAULT_WIDTH # Text encoder and tokenizer always use T5Gemma text_model_name = "google/t5gemma-2b-2b-ul2" @@ -214,6 +221,8 @@ def create_model_index(vae_type: str, output_path: str): "transformer": ["diffusers", "MirageTransformer2DModel"], "vae": vae_model_name, "vae_subfolder": vae_subfolder, + "default_height": default_height, + "default_width": default_width, } model_index_path = os.path.join(output_path, "model_index.json") diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index e6a13ff226cd..9d247eecbd7f 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -31,6 +31,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderDC, AutoencoderKL +from ...models.transformers.transformer_mirage import seq2img from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -46,6 +47,9 @@ except ImportError: MirageTransformer2DModel = None +DEFAULT_HEIGHT = 512 +DEFAULT_WIDTH = 512 + logger = logging.get_logger(__name__) @@ -267,6 +271,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P vae_subfolder = model_index.get("vae_subfolder") text_model_name = model_index.get("text_encoder") tokenizer_model_name = model_index.get("tokenizer") + default_height = model_index.get("default_height", DEFAULT_HEIGHT) + default_width = model_index.get("default_width", DEFAULT_WIDTH) logger.info(f"Loading VAE from {vae_model_name}...") if "FLUX" in vae_model_name or "flux" in vae_model_name: @@ -301,6 +307,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P vae=vae, ) + # Store default dimensions as pipeline attributes + pipeline.default_height = default_height + pipeline.default_width = default_width + return pipeline def __init__( @@ -558,8 +568,8 @@ def __call__( """ # 0. 
Default height and width to transformer config - height = height or 256 - width = width or 256 + height = height or getattr(self, 'default_height', DEFAULT_HEIGHT) + width = width or getattr(self, 'default_width', DEFAULT_WIDTH) # 1. Check inputs self.check_inputs( @@ -642,8 +652,6 @@ def __call__( ) # Convert back to image format - from ...models.transformers.transformer_mirage import seq2img - noise_both = seq2img(img_seq, self.transformer.patch_size, latents_in.shape) # Apply CFG From f2759fd0a8ea934ea0ecea9bfb68f43dffdca5f7 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:26:03 +0000 Subject: [PATCH 26/69] add docstrings --- .../models/transformers/transformer_mirage.py | 367 +++++++++++++++++- .../test_models_transformer_mirage.py | 6 +- 2 files changed, 351 insertions(+), 22 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 923d44d4f1ec..c509f797fb8b 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -33,20 +33,70 @@ logger = logging.get_logger(__name__) -def get_image_ids(bs: int, h: int, w: int, patch_size: int, device: torch.device) -> Tensor: - img_ids = torch.zeros(h // patch_size, w // patch_size, 2, device=device) - img_ids[..., 0] = torch.arange(h // patch_size, device=device)[:, None] - img_ids[..., 1] = torch.arange(w // patch_size, device=device)[None, :] - return img_ids.reshape((h // patch_size) * (w // patch_size), 2).unsqueeze(0).repeat(bs, 1, 1) +def get_image_ids(batch_size: int, height: int, width: int, patch_size: int, device: torch.device) -> Tensor: + r""" + Generates 2D patch coordinate indices for a batch of images. + + Parameters: + batch_size (`int`): + Number of images in the batch. + height (`int`): + Height of the input images (in pixels). + width (`int`): + Width of the input images (in pixels). + patch_size (`int`): + Size of the square patches that the image is divided into. + device (`torch.device`): + The device on which to create the tensor. + + Returns: + `torch.Tensor`: + Tensor of shape `(batch_size, num_patches, 2)` containing the (row, col) + coordinates of each patch in the image grid. + """ + + img_ids = torch.zeros(height // patch_size, width // patch_size, 2, device=device) + img_ids[..., 0] = torch.arange(height // patch_size, device=device)[:, None] + img_ids[..., 1] = torch.arange(width // patch_size, device=device)[None, :] + return img_ids.reshape((height // patch_size) * (width // patch_size), 2).unsqueeze(0).repeat(batch_size, 1, 1) def apply_rope(xq: Tensor, freqs_cis: Tensor) -> Tensor: + r""" + Applies rotary positional embeddings (RoPE) to a query tensor. + + Parameters: + xq (`torch.Tensor`): + Input tensor of shape `(..., dim)` representing the queries. + freqs_cis (`torch.Tensor`): + Precomputed rotary frequency components of shape `(..., dim/2, 2)` + containing cosine and sine pairs. + + Returns: + `torch.Tensor`: + Tensor of the same shape as `xq` with rotary embeddings applied. + """ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] return xq_out.reshape(*xq.shape).type_as(xq) class EmbedND(nn.Module): + r""" + N-dimensional rotary positional embedding. + + This module creates rotary embeddings (RoPE) across multiple axes, where each + axis can have its own embedding dimension. 
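A quick, self-contained illustration of the patch-grid indexing documented above (a sketch, assuming `get_image_ids` is importable from `diffusers.models.transformers.transformer_mirage` once this patch is applied):

```py
import torch

from diffusers.models.transformers.transformer_mirage import get_image_ids

# A 4x4 latent with patch_size=2 gives a 2x2 grid of patches; each patch gets its
# (row, col) coordinate, flattened row-major and repeated per batch item.
ids = get_image_ids(batch_size=1, height=4, width=4, patch_size=2, device=torch.device("cpu"))
print(ids.shape)  # torch.Size([1, 4, 2])
print(ids[0])     # -> [[0., 0.], [0., 1.], [1., 0.], [1., 1.]]
```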
The embeddings are combined and + returned as a single tensor + + Parameters: + dim (int): + Base embedding dimension (must be even). + theta (int): + Scaling factor that controls the frequency spectrum of the rotary embeddings. + axes_dim (list[int]): + List of embedding dimensions for each axis (each must be even). + """ def __init__(self, dim: int, theta: int, axes_dim: list[int]): super().__init__() self.dim = dim @@ -73,6 +123,19 @@ def forward(self, ids: Tensor) -> Tensor: class MLPEmbedder(nn.Module): + r""" + A simple 2-layer MLP used for embedding inputs. + + Parameters: + in_dim (`int`): + Dimensionality of the input features. + hidden_dim (`int`): + Dimensionality of the hidden and output embedding space. + + Returns: + `torch.Tensor`: + Tensor of shape `(..., hidden_dim)` containing the embedded representations. + """ def __init__(self, in_dim: int, hidden_dim: int): super().__init__() self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) @@ -84,6 +147,19 @@ def forward(self, x: Tensor) -> Tensor: class QKNorm(torch.nn.Module): + r""" + Applies RMS normalization to query and key tensors separately before attention + which can help stabilize training and improve numerical precision. + + Parameters: + dim (`int`): + Dimensionality of the query and key vectors. + + Returns: + (`torch.Tensor`, `torch.Tensor`): + A tuple `(q, k)` where both are normalized and cast to the same dtype + as the value tensor `v`. + """ def __init__(self, dim: int): super().__init__() self.query_norm = RMSNorm(dim, eps=1e-6) @@ -103,6 +179,22 @@ class ModulationOut: class Modulation(nn.Module): + r""" + Modulation network that generates scale, shift, and gating parameters. + + Given an input vector, the module projects it through a linear layer to + produce six chunks, which are grouped into two `ModulationOut` objects. + + Parameters: + dim (`int`): + Dimensionality of the input vector. The output will have `6 * dim` + features internally. + + Returns: + (`ModulationOut`, `ModulationOut`): + A tuple of two modulation outputs. Each `ModulationOut` contains + three components (e.g., scale, shift, gate). + """ def __init__(self, dim: int): super().__init__() self.lin = nn.Linear(dim, 6 * dim, bias=True) @@ -115,6 +207,68 @@ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut]: class MirageBlock(nn.Module): + r""" + Multimodal transformer block with text–image cross-attention, modulation, and MLP. + + Parameters: + hidden_size (`int`): + Dimension of the hidden representations. + num_heads (`int`): + Number of attention heads. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Expansion ratio for the hidden dimension inside the MLP. + qk_scale (`float`, *optional*): + Scale factor for queries and keys. If not provided, defaults to + ``head_dim**-0.5``. + + Attributes: + img_pre_norm (`nn.LayerNorm`): + Pre-normalization applied to image tokens before QKV projection. + img_qkv_proj (`nn.Linear`): + Linear projection to produce image queries, keys, and values. + qk_norm (`QKNorm`): + RMS normalization applied separately to image queries and keys. + txt_kv_proj (`nn.Linear`): + Linear projection to produce text keys and values. + k_norm (`RMSNorm`): + RMS normalization applied to text keys. + attention (`Attention`): + Multi-head attention module for cross-attention between image, text, + and optional spatial conditioning tokens. + post_attention_layernorm (`nn.LayerNorm`): + Normalization applied after attention. 
+ gate_proj / up_proj / down_proj (`nn.Linear`): + Feedforward layers forming the gated MLP. + mlp_act (`nn.GELU`): + Nonlinear activation used in the MLP. + modulation (`Modulation`): + Produces scale/shift/gating parameters for modulated layers. + spatial_cond_kv_proj (`nn.Linear`, *optional*): + Projection for optional spatial conditioning tokens. + + Methods: + attn_forward(img, txt, pe, modulation, spatial_conditioning=None, attention_mask=None): + Compute cross-attention between image and text tokens, with optional + spatial conditioning and attention masking. + + Parameters: + img (`torch.Tensor`): + Image tokens of shape `(B, L_img, hidden_size)`. + txt (`torch.Tensor`): + Text tokens of shape `(B, L_txt, hidden_size)`. + pe (`torch.Tensor`): + Rotary positional embeddings to apply to queries and keys. + modulation (`ModulationOut`): + Scale and shift parameters for modulating image tokens. + spatial_conditioning (`torch.Tensor`, *optional*): + Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. + attention_mask (`torch.Tensor`, *optional*): + Boolean mask of shape `(B, L_txt)` where 0 marks padding. + + Returns: + `torch.Tensor`: + Attention output of shape `(B, L_img, hidden_size)`. + """ def __init__( self, hidden_size: int, @@ -163,7 +317,7 @@ def __init__( self.modulation = Modulation(hidden_size) self.spatial_cond_kv_proj: None | nn.Linear = None - def attn_forward( + def _attn_forward( self, img: Tensor, txt: Tensor, @@ -236,7 +390,7 @@ def attn_forward( return attn - def ffn_forward(self, x: Tensor, modulation: ModulationOut) -> Tensor: + def _ffn_forward(self, x: Tensor, modulation: ModulationOut) -> Tensor: x = (1 + modulation.scale) * self.post_attention_layernorm(x) + modulation.shift return self.down_proj(self.mlp_act(self.gate_proj(x)) * self.up_proj(x)) @@ -250,9 +404,36 @@ def forward( attention_mask: Tensor | None = None, **_: dict[str, Any], ) -> Tensor: + r""" + Runs modulation-gated cross-attention and MLP, with residual connections. + + Parameters: + img (`torch.Tensor`): + Image tokens of shape `(B, L_img, hidden_size)`. + txt (`torch.Tensor`): + Text tokens of shape `(B, L_txt, hidden_size)`. + vec (`torch.Tensor`): + Conditioning vector used by `Modulation` to produce scale/shift/gates, + shape `(B, hidden_size)` (or broadcastable). + pe (`torch.Tensor`): + Rotary positional embeddings applied inside attention. + spatial_conditioning (`torch.Tensor`, *optional*): + Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. Used only + if spatial conditioning is enabled in the block. + attention_mask (`torch.Tensor`, *optional*): + Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding. + **_: + Ignored additional keyword arguments for API compatibility. + + Returns: + `torch.Tensor`: + Updated image tokens of shape `(B, L_img, hidden_size)`. + """ + + mod_attn, mod_mlp = self.modulation(vec) - img = img + mod_attn.gate * self.attn_forward( + img = img + mod_attn.gate * self._attn_forward( img, txt, pe, @@ -260,12 +441,39 @@ def forward( spatial_conditioning=spatial_conditioning, attention_mask=attention_mask, ) - img = img + mod_mlp.gate * self.ffn_forward(img, mod_mlp) + img = img + mod_mlp.gate * self._ffn_forward(img, mod_mlp) return img class LastLayer(nn.Module): + r""" + Final projection layer with adaptive LayerNorm modulation. + + This layer applies a normalized and modulated transformation to input tokens + and projects them into patch-level outputs. 
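A minimal, self-contained toy of the gated adaptive-LayerNorm modulation pattern these blocks rely on (illustrative only; the shapes and the `tanh` stand-in for the attention/MLP branch are made up):

```py
import torch
import torch.nn as nn

dim = 8
norm = nn.LayerNorm(dim, elementwise_affine=False)

# In the real model, (shift, scale, gate) come from Modulation(vec).
shift, scale, gate = torch.randn(3, 1, 1, dim)

x = torch.randn(2, 4, dim)        # image tokens (B, L, dim)
branch = torch.tanh               # stand-in for the attention or gated-MLP branch
x = x + gate * branch((1 + scale) * norm(x) + shift)
```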
+ + Parameters: + hidden_size (`int`): + Dimensionality of the input tokens. + patch_size (`int`): + Size of the square image patches. + out_channels (`int`): + Number of output channels per pixel (e.g. RGB = 3). + + Forward Inputs: + x (`torch.Tensor`): + Input tokens of shape `(B, L, hidden_size)`, where `L` is the number of patches. + vec (`torch.Tensor`): + Conditioning vector of shape `(B, hidden_size)` used to generate + shift and scale parameters for adaptive LayerNorm. + + Returns: + `torch.Tensor`: + Projected patch outputs of shape `(B, L, patch_size * patch_size * out_channels)`. + """ + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) @@ -284,12 +492,41 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor: def img2seq(img: Tensor, patch_size: int) -> Tensor: - """Flatten an image into a sequence of patches""" + r""" + Flattens an image tensor into a sequence of non-overlapping patches. + + Parameters: + img (`torch.Tensor`): + Input image tensor of shape `(B, C, H, W)`. + patch_size (`int`): + Size of each square patch. Must evenly divide both `H` and `W`. + + Returns: + `torch.Tensor`: + Flattened patch sequence of shape `(B, L, C * patch_size * patch_size)`, + where `L = (H // patch_size) * (W // patch_size)` is the number of patches. + """ return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2) def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: - """Revert img2seq""" + r""" + Reconstructs an image tensor from a sequence of patches (inverse of `img2seq`). + + Parameters: + seq (`torch.Tensor`): + Patch sequence of shape `(B, L, C * patch_size * patch_size)`, + where `L = (H // patch_size) * (W // patch_size)`. + patch_size (`int`): + Size of each square patch. + shape (`tuple` or `torch.Tensor`): + The original image spatial shape `(H, W)`. If a tensor is provided, + the first two values are interpreted as height and width. + + Returns: + `torch.Tensor`: + Reconstructed image tensor of shape `(B, C, H, W)`. + """ if isinstance(shape, tuple): shape = shape[-2:] elif isinstance(shape, torch.Tensor): @@ -300,7 +537,70 @@ def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: class MirageTransformer2DModel(ModelMixin, ConfigMixin): - """Mirage Transformer model with IP-Adapter support.""" + r""" + Transformer-based 2D model for text to image generation. + It supports attention processor injection and LoRA scaling. + + Parameters: + in_channels (`int`, *optional*, defaults to 16): + Number of input channels in the latent image. + patch_size (`int`, *optional*, defaults to 2): + Size of the square patches used to flatten the input image. + context_in_dim (`int`, *optional*, defaults to 2304): + Dimensionality of the text conditioning input. + hidden_size (`int`, *optional*, defaults to 1792): + Dimension of the hidden representation. + mlp_ratio (`float`, *optional*, defaults to 3.5): + Expansion ratio for the hidden dimension inside MLP blocks. + num_heads (`int`, *optional*, defaults to 28): + Number of attention heads. + depth (`int`, *optional*, defaults to 16): + Number of transformer blocks. + axes_dim (`list[int]`, *optional*): + List of dimensions for each positional embedding axis. Defaults to `[32, 32]`. + theta (`int`, *optional*, defaults to 10000): + Frequency scaling factor for rotary embeddings. 
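For intuition about the two patchify helpers documented above, a small round-trip check (a sketch, assuming `img2seq`/`seq2img` are importable from `diffusers.models.transformers.transformer_mirage`):

```py
import torch

from diffusers.models.transformers.transformer_mirage import img2seq, seq2img

latent = torch.randn(1, 16, 8, 8)                        # (B, C, H, W)
seq = img2seq(latent, patch_size=2)                      # (1, 16, 64): 16 patches of C * p * p features
restored = seq2img(seq, patch_size=2, shape=latent.shape)
assert torch.allclose(latent, restored)                  # non-overlapping unfold/fold is lossless
```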
+ time_factor (`float`, *optional*, defaults to 1000.0): + Scaling factor applied in timestep embeddings. + time_max_period (`int`, *optional*, defaults to 10000): + Maximum frequency period for timestep embeddings. + conditioning_block_ids (`list[int]`, *optional*): + Indices of blocks that receive conditioning. Defaults to all blocks. + **kwargs: + Additional keyword arguments forwarded to the config. + + Attributes: + pe_embedder (`EmbedND`): + Multi-axis rotary embedding generator for positional encodings. + img_in (`nn.Linear`): + Projection layer for image patch tokens. + time_in (`MLPEmbedder`): + Embedding layer for timestep embeddings. + txt_in (`nn.Linear`): + Projection layer for text conditioning. + blocks (`nn.ModuleList`): + Stack of transformer blocks (`MirageBlock`). + final_layer (`LastLayer`): + Projection layer mapping hidden tokens back to patch outputs. + + Methods: + attn_processors: + Returns a dictionary of all attention processors in the model. + set_attn_processor(processor): + Replaces attention processors across all attention layers. + process_inputs(image_latent, txt): + Converts inputs into patch tokens, encodes text, and produces positional encodings. + compute_timestep_embedding(timestep, dtype): + Creates a timestep embedding of dimension 256, scaled and projected. + forward_transformers(image_latent, cross_attn_conditioning, timestep, time_embedding, attention_mask, **block_kwargs): + Runs the sequence of transformer blocks over image and text tokens. + forward(image_latent, timestep, cross_attn_conditioning, micro_conditioning, cross_attn_mask=None, attention_kwargs=None, return_dict=True): + Full forward pass from latent input to reconstructed output image. + + Returns: + `Transformer2DModelOutput` if `return_dict=True` (default), otherwise a tuple containing: + - `sample` (`torch.Tensor`): Reconstructed image of shape `(B, C, H, W)`. 
+ """ config_name = "config.json" _supports_gradient_checkpointing = True @@ -424,8 +724,8 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) - def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: - """Timestep independent stuff""" + def _process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: + txt = self.txt_in(txt) img = img2seq(image_latent, self.patch_size) bs, _, h, w = image_latent.shape @@ -433,7 +733,7 @@ def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[T pe = self.pe_embedder(img_ids) return img, txt, pe - def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: + def _compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: return self.time_in( get_timestep_embedding( timesteps=timestep, @@ -444,7 +744,7 @@ def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Te ).to(dtype) ) - def forward_transformers( + def _forward_transformers( self, image_latent: Tensor, cross_attn_conditioning: Tensor, @@ -460,7 +760,7 @@ def forward_transformers( else: if timestep is None: raise ValueError("Please provide either a timestep or a timestep_embedding") - vec = self.compute_timestep_embedding(timestep, dtype=img.dtype) + vec = self._compute_timestep_embedding(timestep, dtype=img.dtype) for block in self.blocks: img = block(img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs) @@ -478,6 +778,35 @@ def forward( attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: + r""" + Forward pass of the MirageTransformer2DModel. + + The latent image is split into patch tokens, combined with text conditioning, + and processed through a stack of transformer blocks modulated by the timestep. + The output is reconstructed into the latent image space. + + Parameters: + image_latent (`torch.Tensor`): + Input latent image tensor of shape `(B, C, H, W)`. + timestep (`torch.Tensor`): + Timestep tensor of shape `(B,)` or `(1,)`, used for temporal conditioning. + cross_attn_conditioning (`torch.Tensor`): + Text conditioning tensor of shape `(B, L_txt, context_in_dim)`. + micro_conditioning (`torch.Tensor`): + Extra conditioning vector (currently unused, reserved for future use). + cross_attn_mask (`torch.Tensor`, *optional*): + Boolean mask of shape `(B, L_txt)`, where `0` marks padding in the text sequence. + attention_kwargs (`dict`, *optional*): + Additional arguments passed to attention layers. If using the PEFT backend, + the key `"scale"` controls LoRA scaling (default: 1.0). + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a `Transformer2DModelOutput` or a tuple. + + Returns: + `Transformer2DModelOutput` if `return_dict=True`, otherwise a tuple: + + - `sample` (`torch.Tensor`): Output latent image of shape `(B, C, H, W)`. + """ if attention_kwargs is not None: attention_kwargs = attention_kwargs.copy() lora_scale = attention_kwargs.pop("scale", 1.0) @@ -491,8 +820,8 @@ def forward( logger.warning( "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective." 
) - img_seq, txt, pe = self.process_inputs(image_latent, cross_attn_conditioning) - img_seq = self.forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask) + img_seq, txt, pe = self._process_inputs(image_latent, cross_attn_conditioning) + img_seq = self._forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask) output = seq2img(img_seq, self.patch_size, image_latent.shape) if USE_PEFT_BACKEND: # remove `lora_scale` from each PEFT layer diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py index 0085627aa7e4..fe7436debc4c 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -132,7 +132,7 @@ def test_process_inputs(self): model.eval() with torch.no_grad(): - img_seq, txt, pe = model.process_inputs( + img_seq, txt, pe = model._process_inputs( inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) @@ -160,12 +160,12 @@ def test_forward_transformers(self): with torch.no_grad(): # Process inputs first - img_seq, txt, pe = model.process_inputs( + img_seq, txt, pe = model._process_inputs( inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) # Test forward_transformers - output_seq = model.forward_transformers(img_seq, txt, timestep=inputs_dict["timestep"], pe=pe) + output_seq = model._forward_transformers(img_seq, txt, timestep=inputs_dict["timestep"], pe=pe) # Check output shape expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"] ** 2 From 394f725139a57e88f5e0b0d6e458db774606a7d5 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:28:43 +0000 Subject: [PATCH 27/69] if conditions and raised as ValueError instead of asserts --- .../models/transformers/transformer_mirage.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index c509f797fb8b..90ba11fb2d24 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -360,10 +360,12 @@ def _attn_forward( bs, _, l_img, _ = img_q.shape l_txt = txt_k.shape[2] - assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" - assert attention_mask.shape[-1] == l_txt, ( - f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" - ) + if attention_mask.dim() != 2: + raise ValueError(f"Unsupported attention_mask shape: {attention_mask.shape}") + if attention_mask.shape[-1] != l_txt: + raise ValueError( + f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + ) device = img_q.device From 54fb0632d80e8516cc9c716144a609f226cde4e9 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:33:28 +0000 Subject: [PATCH 28/69] small fix --- src/diffusers/pipelines/mirage/pipeline_mirage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index 9d247eecbd7f..50304ae1a3ad 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -640,13 +640,13 @@ def __call__( t_cont = (t.float() / self.scheduler.config.num_train_timesteps).view(1).repeat(2).to(device) # Process inputs for 
transformer - img_seq, txt, pe = self.transformer.process_inputs(latents_in, ca_embed) + img_seq, txt, pe = self.transformer._process_inputs(latents_in, ca_embed) # Forward through transformer layers - img_seq = self.transformer.forward_transformers( + img_seq = self.transformer._forward_transformers( img_seq, txt, - time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), + time_embedding=self.transformer._compute_timestep_embedding(t_cont, img_seq.dtype), pe=pe, attention_mask=ca_mask, ) From c49fafbaaba17d5a9470af42d11ea1a30389fcb2 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:34:47 +0000 Subject: [PATCH 29/69] nit remove try block at import --- src/diffusers/pipelines/mirage/pipeline_mirage.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index 50304ae1a3ad..ced78adec786 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -31,7 +31,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderDC, AutoencoderKL -from ...models.transformers.transformer_mirage import seq2img +from ...models.transformers.transformer_mirage import MirageTransformer2DModel, seq2img from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -42,11 +42,6 @@ from .pipeline_output import MiragePipelineOutput -try: - from ...models.transformers.transformer_mirage import MirageTransformer2DModel -except ImportError: - MirageTransformer2DModel = None - DEFAULT_HEIGHT = 512 DEFAULT_WIDTH = 512 From 7e7df3569204a62f27797b60b7db2e1716c29a3b Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:35:16 +0000 Subject: [PATCH 30/69] mirage pipeline doc --- docs/source/en/api/pipelines/mirage.md | 158 +++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 docs/source/en/api/pipelines/mirage.md diff --git a/docs/source/en/api/pipelines/mirage.md b/docs/source/en/api/pipelines/mirage.md new file mode 100644 index 000000000000..3383bbecae2a --- /dev/null +++ b/docs/source/en/api/pipelines/mirage.md @@ -0,0 +1,158 @@ + + +# MiragePipeline + +
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+ +Mirage is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. + +Key features: + +- **Transformer Architecture**: Uses a modern transformer-based denoising model with attention mechanisms optimized for image generation +- **Flow Matching**: Employs flow matching with Euler discrete scheduling for efficient sampling +- **Flexible VAE Support**: Compatible with both Flux VAE (8x compression, 16 latent channels) and DC-AE (32x compression, 32 latent channels) +- **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding with strong text-image alignment +- **Efficient Architecture**: ~1.3B parameters in the transformer, enabling fast inference while maintaining quality +- **Modular Design**: Text encoder and VAE weights are loaded from HuggingFace, keeping checkpoint sizes small + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. + + + +## Loading the Pipeline + +Mirage checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization: + +```py +from diffusers import MiragePipeline + +# Load pipeline - VAE and text encoder will be loaded from HuggingFace +pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") +pipe.to("cuda") + +prompt = "A digital painting of a rusty, vintage tram on a sandy beach" +image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] +image.save("mirage_output.png") +``` + +### Manual Component Loading + +You can also load components individually: + +```py +import torch +from diffusers import MiragePipeline +from diffusers.models import AutoencoderKL, AutoencoderDC +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from transformers import T5GemmaModel, GemmaTokenizerFast + +# Load transformer +transformer = MirageTransformer2DModel.from_pretrained( + "path/to/checkpoint", subfolder="transformer" +) + +# Load scheduler +scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( + "path/to/checkpoint", subfolder="scheduler" +) + +# Load T5Gemma text encoder +t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") +text_encoder = t5gemma_model.encoder +tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") + +# Load VAE - choose either Flux VAE or DC-AE +# Flux VAE (16 latent channels): +vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") +# Or DC-AE (32 latent channels): +# vae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers") + +pipe = MiragePipeline( + transformer=transformer, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + vae=vae +) +pipe.to("cuda") +``` + +## VAE Variants + +Mirage supports two VAE configurations: + +### Flux VAE (AutoencoderKL) +- **Compression**: 8x spatial compression +- **Latent channels**: 16 +- **Model**: `black-forest-labs/FLUX.1-dev` (subfolder: "vae") +- 
**Use case**: Balanced quality and speed + +### DC-AE (AutoencoderDC) +- **Compression**: 32x spatial compression +- **Latent channels**: 32 +- **Model**: `mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers` +- **Use case**: Higher compression for faster processing + +The VAE type is automatically determined from the checkpoint's `model_index.json` configuration. + +## Generation Parameters + +Key parameters for image generation: + +- **num_inference_steps**: Number of denoising steps (default: 28). More steps generally improve quality at the cost of speed. +- **guidance_scale**: Classifier-free guidance strength (default: 4.0). Higher values produce images more closely aligned with the prompt. +- **height/width**: Output image dimensions (default: 512x512). Can be customized in the checkpoint configuration. + +```py +# Example with custom parameters +image = pipe( + prompt="A serene mountain landscape at sunset", + num_inference_steps=28, + guidance_scale=4.0, + height=1024, + width=1024, + generator=torch.Generator("cuda").manual_seed(42) +).images[0] +``` + +## Memory Optimization + +For memory-constrained environments: + +```py +import torch +from diffusers import MiragePipeline + +pipe = MiragePipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() # Offload components to CPU when not in use + +# Or use sequential CPU offload for even lower memory +pipe.enable_sequential_cpu_offload() +``` + +## MiragePipeline + +[[autodoc]] MiragePipeline + - all + - __call__ + +## MiragePipelineOutput + +[[autodoc]] pipelines.mirage.pipeline_output.MiragePipelineOutput From 814d710e56fb072542120662f0d22e2556168a91 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 1 Oct 2025 09:02:55 +0530 Subject: [PATCH 31/69] [tests] cache non lora pipeline outputs. (#12298) * cache non lora pipeline outputs. * up * up * up * up * Revert "up" This reverts commit 772c32e43397f25919c29bbbe8ef9dc7d581cfb8. * up * Revert "up" This reverts commit cca03df7fce55550ed28b59cadec12d1db188283. * up * up * add . 
* up * up * up * up * up * up --- tests/lora/test_lora_layers_cogview4.py | 3 - tests/lora/test_lora_layers_flux.py | 11 +-- tests/lora/test_lora_layers_wanvace.py | 2 +- tests/lora/utils.py | 109 +++++++++--------------- 4 files changed, 42 insertions(+), 83 deletions(-) diff --git a/tests/lora/test_lora_layers_cogview4.py b/tests/lora/test_lora_layers_cogview4.py index 9c62d2f0b84b..30eb8fbb6367 100644 --- a/tests/lora/test_lora_layers_cogview4.py +++ b/tests/lora/test_lora_layers_cogview4.py @@ -129,9 +129,6 @@ def test_simple_inference_save_pretrained(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index 6c22a3488940..b840d7ac72ce 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -122,9 +122,6 @@ def test_with_alpha_in_state_dict(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == self.output_shape) - pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") @@ -170,8 +167,7 @@ def test_lora_expansion_works_for_absent_keys(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() # Modify the config to have a layer which won't be present in the second LoRA we will load. modified_denoiser_lora_config = copy.deepcopy(denoiser_lora_config) @@ -218,9 +214,7 @@ def test_lora_expansion_works_for_extra_keys(self): pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() # Modify the config to have a layer which won't be present in the first LoRA we will load. 
modified_denoiser_lora_config = copy.deepcopy(denoiser_lora_config) @@ -329,6 +323,7 @@ def get_dummy_inputs(self, with_generator=True): noise = floats_tensor((batch_size, num_channels) + sizes) input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator) + np.random.seed(0) pipeline_inputs = { "prompt": "A painting of a squirrel eating a burger", "control_image": Image.fromarray(np.random.randint(0, 255, size=(32, 32, 3), dtype="uint8")), diff --git a/tests/lora/test_lora_layers_wanvace.py b/tests/lora/test_lora_layers_wanvace.py index c3244e150e13..ab1f57bfc9da 100644 --- a/tests/lora/test_lora_layers_wanvace.py +++ b/tests/lora/test_lora_layers_wanvace.py @@ -169,7 +169,7 @@ def test_lora_exclude_modules_wanvace(self): pipe = self.pipeline_class(**components).to(torch_device) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() self.assertTrue(output_no_lora.shape == self.output_shape) # only supported for `denoiser` now diff --git a/tests/lora/utils.py b/tests/lora/utils.py index ecaa553ce4c4..3d4344bb86a9 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -126,13 +126,20 @@ class PeftLoraLoaderMixinTests: text_encoder_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"] denoiser_target_modules = ["to_q", "to_k", "to_v", "to_out.0"] - def get_dummy_components(self, use_dora=False, lora_alpha=None): + cached_non_lora_output = None + + def get_base_pipe_output(self): + if self.cached_non_lora_output is None: + self.cached_non_lora_output = self._compute_baseline_output() + return self.cached_non_lora_output + + def get_dummy_components(self, scheduler_cls=None, use_dora=False, lora_alpha=None): if self.unet_kwargs and self.transformer_kwargs: raise ValueError("Both `unet_kwargs` and `transformer_kwargs` cannot be specified.") if self.has_two_text_encoders and self.has_three_text_encoders: raise ValueError("Both `has_two_text_encoders` and `has_three_text_encoders` cannot be True.") - scheduler_cls = self.scheduler_cls + scheduler_cls = scheduler_cls if scheduler_cls is not None else self.scheduler_cls rank = 4 lora_alpha = rank if lora_alpha is None else lora_alpha @@ -238,15 +245,16 @@ def get_dummy_inputs(self, with_generator=True): return noise, input_ids, pipeline_inputs - # Copied from: https://colab.research.google.com/gist/sayakpaul/df2ef6e1ae6d8c10a49d859883b10860/scratchpad.ipynb - def get_dummy_tokens(self): - max_seq_length = 77 - - inputs = torch.randint(2, 56, size=(1, max_seq_length), generator=torch.manual_seed(0)) + def _compute_baseline_output(self): + components, _, _ = self.get_dummy_components(self.scheduler_cls) + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) - prepared_inputs = {} - prepared_inputs["input_ids"] = inputs - return prepared_inputs + # Always ensure the inputs are without the `generator`. Make sure to pass the `generator` + # explicitly. 
+ _, _, inputs = self.get_dummy_inputs(with_generator=False) + return pipe(**inputs, generator=torch.manual_seed(0))[0] def _get_lora_state_dicts(self, modules_to_save): state_dicts = {} @@ -316,14 +324,8 @@ def test_simple_inference(self): """ Tests a simple inference and makes sure it works as expected """ - components, text_lora_config, _ = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - _, _, inputs = self.get_dummy_inputs() - output_no_lora = pipe(**inputs)[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() + assert output_no_lora.shape == self.output_shape def test_simple_inference_with_text_lora(self): """ @@ -336,9 +338,7 @@ def test_simple_inference_with_text_lora(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -414,9 +414,6 @@ def test_low_cpu_mem_usage_with_loading(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -466,8 +463,7 @@ def test_simple_inference_with_text_lora_and_scale(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) @@ -503,8 +499,7 @@ def test_simple_inference_with_text_lora_fused(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) @@ -534,8 +529,7 @@ def test_simple_inference_with_text_lora_unloaded(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) @@ -566,9 +560,6 @@ def test_simple_inference_with_text_lora_save_load(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -616,8 
+607,7 @@ def test_simple_inference_with_partial_text_lora(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) @@ -666,9 +656,6 @@ def test_simple_inference_save_pretrained_with_text_lora(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -708,9 +695,6 @@ def test_simple_inference_with_text_denoiser_lora_save_load(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -747,9 +731,7 @@ def test_simple_inference_with_text_denoiser_lora_and_scale(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -790,8 +772,7 @@ def test_simple_inference_with_text_lora_denoiser_fused(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) @@ -825,8 +806,7 @@ def test_simple_inference_with_text_denoiser_lora_unloaded(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) @@ -900,7 +880,7 @@ def test_simple_inference_with_text_denoiser_multi_adapter(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") @@ -1024,7 +1004,7 @@ def test_simple_inference_with_text_denoiser_block_scale(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() 
pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") @@ -1080,7 +1060,7 @@ def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") @@ -1240,7 +1220,7 @@ def test_simple_inference_with_text_denoiser_multi_adapter_delete_adapter(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") @@ -1331,7 +1311,7 @@ def test_simple_inference_with_text_denoiser_multi_adapter_weighted(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") @@ -1551,7 +1531,6 @@ def test_get_list_adapters(self): self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) - @require_peft_version_greater(peft_version="0.6.2") def test_simple_inference_with_text_lora_denoiser_fused_multi( self, expected_atol: float = 1e-3, expected_rtol: float = 1e-3 ): @@ -1565,9 +1544,6 @@ def test_simple_inference_with_text_lora_denoiser_fused_multi( pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") @@ -1641,8 +1617,7 @@ def test_lora_scale_kwargs_match_fusion(self, expected_atol: float = 1e-3, expec pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = self.get_base_pipe_output() if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") @@ -1685,7 +1660,6 @@ def test_lora_scale_kwargs_match_fusion(self, expected_atol: float = 1e-3, expec "LoRA should change the output", ) - @require_peft_version_greater(peft_version="0.9.0") def test_simple_inference_with_dora(self): components, text_lora_config, denoiser_lora_config = self.get_dummy_components(use_dora=True) pipe = self.pipeline_class(**components) @@ -1695,7 +1669,6 @@ def test_simple_inference_with_dora(self): output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] self.assertTrue(output_no_dora_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) output_dora_lora = 
pipe(**inputs, generator=torch.manual_seed(0))[0] @@ -1783,7 +1756,6 @@ def test_simple_inference_with_text_denoiser_lora_unfused_torch_compile(self): pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) @@ -1820,7 +1792,7 @@ def test_logs_info_when_no_lora_keys_found(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - original_out = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() no_op_state_dict = {"lora_foo": torch.tensor(2.0), "lora_bar": torch.tensor(3.0)} logger = logging.get_logger("diffusers.loaders.peft") @@ -1832,7 +1804,7 @@ def test_logs_info_when_no_lora_keys_found(self): denoiser = getattr(pipe, "unet") if self.unet_kwargs is not None else getattr(pipe, "transformer") self.assertTrue(cap_logger.out.startswith(f"No LoRA keys associated to {denoiser.__class__.__name__}")) - self.assertTrue(np.allclose(original_out, out_after_lora_attempt, atol=1e-5, rtol=1e-5)) + self.assertTrue(np.allclose(output_no_lora, out_after_lora_attempt, atol=1e-5, rtol=1e-5)) # test only for text encoder for lora_module in self.pipeline_class._lora_loadable_modules: @@ -1864,9 +1836,7 @@ def test_set_adapters_match_attention_kwargs(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - + output_no_lora = self.get_base_pipe_output() pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) lora_scale = 0.5 @@ -2212,9 +2182,6 @@ def test_lora_adapter_metadata_save_load_inference(self, lora_alpha): pipe = self.pipeline_class(**components).to(torch_device) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline( pipe, text_lora_config=text_lora_config, denoiser_lora_config=denoiser_lora_config ) @@ -2260,7 +2227,7 @@ def test_inference_load_delete_load_adapters(self): pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = self.get_base_pipe_output() if "text_encoder" in self.pipeline_class._lora_loadable_modules: pipe.text_encoder.add_adapter(text_lora_config) From 9ae5b6299d3b1a7b0378dc77c3c69baf521587d2 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 2 Oct 2025 17:46:15 +0530 Subject: [PATCH 32/69] [ci] xfail failing tests in CI. (#12418) xfail failing tests in CI. 
--- tests/pipelines/test_pipelines.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 3a6981361268..5b86423553c5 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -28,6 +28,7 @@ import numpy as np import PIL.Image +import pytest import requests_mock import safetensors.torch import torch @@ -62,10 +63,7 @@ ) from diffusers.pipelines.pipeline_utils import _get_pipeline_class from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import ( - CONFIG_NAME, - WEIGHTS_NAME, -) +from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, is_transformers_version from diffusers.utils.torch_utils import is_compiled_module from ..testing_utils import ( @@ -584,6 +582,7 @@ def test_download_variants_with_sharded_checkpoints(self): assert not any(f.endswith(unexpected_ext) for f in files) assert all(variant in f for f in model_files if f.endswith(model_ext) and variant is not None) + @pytest.mark.xfail(condition=is_transformers_version(">", "4.56.2"), reason="Some import error", strict=True) def test_download_legacy_variants_with_sharded_ckpts_raises_warning(self): repo_id = "hf-internal-testing/tiny-stable-diffusion-pipe-variants-all-kinds" logger = logging.get_logger("diffusers.pipelines.pipeline_utils") @@ -690,6 +689,7 @@ def test_download_bin_variant_does_not_exist_for_model(self): ) assert "Error no file name" in str(error_context.exception) + @pytest.mark.xfail(condition=is_transformers_version(">", "4.56.2"), reason="Some import error", strict=True) def test_local_save_load_index(self): prompt = "hello" for variant in [None, "fp16"]: @@ -1584,6 +1584,7 @@ def test_save_safe_serialization(self): assert pipeline.scheduler is not None assert pipeline.feature_extractor is not None + @pytest.mark.xfail(condition=is_transformers_version(">", "4.56.2"), reason="Some import error", strict=True) def test_no_pytorch_download_when_doing_safetensors(self): # by default we don't download with tempfile.TemporaryDirectory() as tmpdirname: @@ -1603,6 +1604,7 @@ def test_no_pytorch_download_when_doing_safetensors(self): # pytorch does not assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) + @pytest.mark.xfail(condition=is_transformers_version(">", "4.56.2"), reason="Some import error", strict=True) def test_no_safetensors_download_when_doing_pytorch(self): use_safetensors = False From b4297967a04cca6ac4493202c02d81c30d0f9ee8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 2 Oct 2025 20:38:02 +0530 Subject: [PATCH 33/69] [core] conditionally import torch distributed stuff. (#12420) conditionally import torch distributed stuff. 
--- src/diffusers/models/attention_dispatch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/attention_dispatch.py b/src/diffusers/models/attention_dispatch.py index 0a2ad681237b..e1694910997a 100644 --- a/src/diffusers/models/attention_dispatch.py +++ b/src/diffusers/models/attention_dispatch.py @@ -20,7 +20,10 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union import torch -import torch.distributed._functional_collectives as funcol + + +if torch.distributed.is_available(): + import torch.distributed._functional_collectives as funcol from ..utils import ( get_logger, From 7242b5ff627fad93dd85834b0278267b6cbe2d6d Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 2 Oct 2025 20:57:11 +0200 Subject: [PATCH 34/69] FIX Test to ignore warning for enable_lora_hotswap (#12421) I noticed that the test should be for the option check_compiled="ignore" but it was using check_compiled="warn". This has been fixed, now the correct argument is passed. However, the fact that the test passed means that it was incorrect to begin with. The way that logs are collected does not collect the logger.warning call here (not sure why). To amend this, I'm now using assertNoLogs. With this change, the test correctly fails when the wrong argument is passed. --- tests/models/test_modeling_common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 9b1c6b50dc8f..a44ef571c5be 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -25,7 +25,6 @@ import unittest import unittest.mock as mock import uuid -import warnings from collections import defaultdict from typing import Dict, List, Optional, Tuple, Union @@ -2373,14 +2372,15 @@ def test_enable_lora_hotswap_called_after_adapter_added_warning(self): def test_enable_lora_hotswap_called_after_adapter_added_ignore(self): # check possibility to ignore the error/warning + from diffusers.loaders.peft import logger + lora_config = self.get_lora_config(8, 8, target_modules=["to_q"]) init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**init_dict).to(torch_device) model.add_adapter(lora_config) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") # Capture all warnings - model.enable_lora_hotswap(target_rank=32, check_compiled="warn") - self.assertEqual(len(w), 0, f"Expected no warnings, but got: {[str(warn.message) for warn in w]}") + # note: assertNoLogs requires Python 3.10+ + with self.assertNoLogs(logger, level="WARNING"): + model.enable_lora_hotswap(target_rank=32, check_compiled="ignore") def test_enable_lora_hotswap_wrong_check_compiled_argument_raises(self): # check that wrong argument value raises an error From 941ac9c3d9aab9c36fc33c58dac1980442928082 Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Fri, 3 Oct 2025 17:46:47 +0300 Subject: [PATCH 35/69] [training-scripts] Make more examples UV-compatible (follow up on #12000) (#12407) * make qwen and kontext uv compatible * add torchvision * add torchvision * add datasets, bitsandbytes, prodigyopt --------- Co-authored-by: Sayak Paul --- .../train_dreambooth_lora_flux_advanced.py | 4 ++++ .../dreambooth/train_dreambooth_lora_flux.py | 4 ++++ .../train_dreambooth_lora_flux_kontext.py | 18 ++++++++++++++++++ .../train_dreambooth_lora_qwen_image.py | 18 
++++++++++++++++++ .../dreambooth/train_dreambooth_lora_sana.py | 4 ++++ 5 files changed, 48 insertions(+) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py index a46490e8b3bf..5aa33190d4a0 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py @@ -25,6 +25,10 @@ # "Jinja2", # "peft>=0.11.1", # "sentencepiece", +# "torchvision", +# "datasets", +# "bitsandbytes", +# "prodigyopt", # ] # /// diff --git a/examples/dreambooth/train_dreambooth_lora_flux.py b/examples/dreambooth/train_dreambooth_lora_flux.py index bd3a974a17d8..3b6ab814f278 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux.py +++ b/examples/dreambooth/train_dreambooth_lora_flux.py @@ -25,6 +25,10 @@ # "Jinja2", # "peft>=0.11.1", # "sentencepiece", +# "torchvision", +# "datasets", +# "bitsandbytes", +# "prodigyopt", # ] # /// diff --git a/examples/dreambooth/train_dreambooth_lora_flux_kontext.py b/examples/dreambooth/train_dreambooth_lora_flux_kontext.py index 03c05a05e094..fc6df87768ca 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux_kontext.py +++ b/examples/dreambooth/train_dreambooth_lora_flux_kontext.py @@ -14,6 +14,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +# /// script +# dependencies = [ +# "diffusers @ git+https://github.com/huggingface/diffusers.git", +# "torch>=2.0.0", +# "accelerate>=0.31.0", +# "transformers>=4.41.2", +# "ftfy", +# "tensorboard", +# "Jinja2", +# "peft>=0.11.1", +# "sentencepiece", +# "torchvision", +# "datasets", +# "bitsandbytes", +# "prodigyopt", +# ] +# /// + import argparse import copy import itertools diff --git a/examples/dreambooth/train_dreambooth_lora_qwen_image.py b/examples/dreambooth/train_dreambooth_lora_qwen_image.py index feec4da712f3..75eae92dfbd0 100644 --- a/examples/dreambooth/train_dreambooth_lora_qwen_image.py +++ b/examples/dreambooth/train_dreambooth_lora_qwen_image.py @@ -13,6 +13,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and +# /// script +# dependencies = [ +# "diffusers @ git+https://github.com/huggingface/diffusers.git", +# "torch>=2.0.0", +# "accelerate>=0.31.0", +# "transformers>=4.41.2", +# "ftfy", +# "tensorboard", +# "Jinja2", +# "peft>=0.11.1", +# "sentencepiece", +# "torchvision", +# "datasets", +# "bitsandbytes", +# "prodigyopt", +# ] +# /// + import argparse import copy import itertools diff --git a/examples/dreambooth/train_dreambooth_lora_sana.py b/examples/dreambooth/train_dreambooth_lora_sana.py index b188a80916d7..2b0c1ee6697d 100644 --- a/examples/dreambooth/train_dreambooth_lora_sana.py +++ b/examples/dreambooth/train_dreambooth_lora_sana.py @@ -25,6 +25,10 @@ # "Jinja2", # "peft>=0.14.0", # "sentencepiece", +# "torchvision", +# "datasets", +# "bitsandbytes", +# "prodigyopt", # ] # /// From 2b7deffe361b7b0e1d2665a1f9f0bd4daea4927c Mon Sep 17 00:00:00 2001 From: Vladimir Mandic Date: Sat, 4 Oct 2025 23:53:38 -0400 Subject: [PATCH 36/69] fix scale_shift_factor being on cpu for wan and ltx (#12347) * wan fix scale_shift_factor being on cpu * apply device cast to ltx transformer * Apply style fixes --------- Co-authored-by: Dhruv Nair Co-authored-by: github-actions[bot] --- src/diffusers/models/transformers/transformer_ltx.py | 4 +++- src/diffusers/models/transformers/transformer_wan.py | 4 ++-- src/diffusers/models/transformers/transformer_wan_vace.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py index 9f3840690d81..685c73c07c75 100644 --- a/src/diffusers/models/transformers/transformer_ltx.py +++ b/src/diffusers/models/transformers/transformer_ltx.py @@ -353,7 +353,9 @@ def forward( norm_hidden_states = self.norm1(hidden_states) num_ada_params = self.scale_shift_table.shape[0] - ada_values = self.scale_shift_table[None, None] + temb.reshape(batch_size, temb.size(1), num_ada_params, -1) + ada_values = self.scale_shift_table[None, None].to(temb.device) + temb.reshape( + batch_size, temb.size(1), num_ada_params, -1 + ) shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2) norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py index 25c055fb563c..dd75fb124f1a 100644 --- a/src/diffusers/models/transformers/transformer_wan.py +++ b/src/diffusers/models/transformers/transformer_wan.py @@ -682,12 +682,12 @@ def forward( # 5. Output norm, projection & unpatchify if temb.ndim == 3: # batch_size, seq_len, inner_dim (wan 2.2 ti2v) - shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2) + shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2) shift = shift.squeeze(2) scale = scale.squeeze(2) else: # batch_size, inner_dim - shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1) + shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1) # Move the shift and scale tensors to the same device as hidden_states. 
# When using multi-GPU inference via accelerate these will be on the diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py index e5a9c7e0a659..30c38c244ad8 100644 --- a/src/diffusers/models/transformers/transformer_wan_vace.py +++ b/src/diffusers/models/transformers/transformer_wan_vace.py @@ -103,7 +103,7 @@ def forward( control_hidden_states = control_hidden_states + hidden_states shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table + temb.float() + self.scale_shift_table.to(temb.device) + temb.float() ).chunk(6, dim=1) # 1. Self-attention @@ -361,7 +361,7 @@ def forward( hidden_states = hidden_states + control_hint * scale # 6. Output norm, projection & unpatchify - shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1) + shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1) # Move the shift and scale tensors to the same device as hidden_states. # When using multi-GPU inference via accelerate these will be on the From c3675d4c9bb9c02521cd2c1aec198460c1657256 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 5 Oct 2025 21:57:13 +0530 Subject: [PATCH 37/69] [core] support QwenImage Edit Plus in modular (#12416) * up * up * up * up * up * up * remove saves * move things around a bit. * get ready. --- src/diffusers/__init__.py | 4 + src/diffusers/modular_pipelines/__init__.py | 4 + .../modular_pipelines/modular_pipeline.py | 4 +- .../modular_pipelines/qwenimage/__init__.py | 18 +- .../qwenimage/before_denoise.py | 3 +- .../modular_pipelines/qwenimage/encoders.py | 236 +++++++++++++++++- .../qwenimage/modular_blocks.py | 151 ++++++++++- .../qwenimage/modular_pipeline.py | 10 + src/diffusers/pipelines/auto_pipeline.py | 2 + .../dummy_torch_and_transformers_objects.py | 30 +++ 10 files changed, 449 insertions(+), 13 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 8867250deda8..686e8d99dabf 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -390,6 +390,8 @@ "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", "QwenImageEditModularPipeline", + "QwenImageEditPlusAutoBlocks", + "QwenImageEditPlusModularPipeline", "QwenImageModularPipeline", "StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline", @@ -1052,6 +1054,8 @@ QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, + QwenImageEditPlusAutoBlocks, + QwenImageEditPlusModularPipeline, QwenImageModularPipeline, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 65c22b349b1c..2e590594af71 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -52,6 +52,8 @@ "QwenImageModularPipeline", "QwenImageEditModularPipeline", "QwenImageEditAutoBlocks", + "QwenImageEditPlusModularPipeline", + "QwenImageEditPlusAutoBlocks", ] _import_structure["components_manager"] = ["ComponentsManager"] @@ -78,6 +80,8 @@ QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, + QwenImageEditPlusAutoBlocks, + QwenImageEditPlusModularPipeline, QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 037c9e323c6b..e543bf0bb3af 
100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -59,6 +59,7 @@ ("flux", "FluxModularPipeline"), ("qwenimage", "QwenImageModularPipeline"), ("qwenimage-edit", "QwenImageEditModularPipeline"), + ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"), ] ) @@ -1628,7 +1629,8 @@ def from_pretrained( blocks = ModularPipelineBlocks.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs ) - except EnvironmentError: + except EnvironmentError as e: + logger.debug(f"EnvironmentError: {e}") blocks = None cache_dir = kwargs.pop("cache_dir", None) diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index 81cf515730ef..ae4ec4799fbc 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -29,13 +29,20 @@ "EDIT_AUTO_BLOCKS", "EDIT_BLOCKS", "EDIT_INPAINT_BLOCKS", + "EDIT_PLUS_AUTO_BLOCKS", + "EDIT_PLUS_BLOCKS", "IMAGE2IMAGE_BLOCKS", "INPAINT_BLOCKS", "TEXT2IMAGE_BLOCKS", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", + "QwenImageEditPlusAutoBlocks", + ] + _import_structure["modular_pipeline"] = [ + "QwenImageEditModularPipeline", + "QwenImageEditPlusModularPipeline", + "QwenImageModularPipeline", ] - _import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -54,13 +61,20 @@ EDIT_AUTO_BLOCKS, EDIT_BLOCKS, EDIT_INPAINT_BLOCKS, + EDIT_PLUS_AUTO_BLOCKS, + EDIT_PLUS_BLOCKS, IMAGE2IMAGE_BLOCKS, INPAINT_BLOCKS, TEXT2IMAGE_BLOCKS, QwenImageAutoBlocks, QwenImageEditAutoBlocks, + QwenImageEditPlusAutoBlocks, + ) + from .modular_pipeline import ( + QwenImageEditModularPipeline, + QwenImageEditPlusModularPipeline, + QwenImageModularPipeline, ) - from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 606236cfe91b..fdec95dc506e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -203,7 +203,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state.latents = components.pachifier.pack_latents(block_state.latents) self.set_block_state(state, block_state) - return components, state @@ -571,7 +570,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be place after prepare_latents step" + return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. 
Should be placed after prepare_latents step" @property def inputs(self) -> List[InputParam]: diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 2ab83a03ee55..04fb3fdc947b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -128,6 +128,61 @@ def get_qwen_prompt_embeds_edit( return prompt_embeds, encoder_attention_mask +def get_qwen_prompt_embeds_edit_plus( + text_encoder, + processor, + prompt: Union[str, List[str]] = None, + image: Optional[Union[torch.Tensor, List[PIL.Image.Image], PIL.Image.Image]] = None, + prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", + img_template_encode: str = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>", + prompt_template_encode_start_idx: int = 64, + device: Optional[torch.device] = None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + if isinstance(image, list): + base_img_prompt = "" + for i, img in enumerate(image): + base_img_prompt += img_template_encode.format(i + 1) + elif image is not None: + base_img_prompt = img_template_encode.format(1) + else: + base_img_prompt = "" + + template = prompt_template_encode + + drop_idx = prompt_template_encode_start_idx + txt = [template.format(base_img_prompt + e) for e in prompt] + + model_inputs = processor( + text=txt, + images=image, + padding=True, + return_tensors="pt", + ).to(device) + outputs = text_encoder( + input_ids=model_inputs.input_ids, + attention_mask=model_inputs.attention_mask, + pixel_values=model_inputs.pixel_values, + image_grid_thw=model_inputs.image_grid_thw, + output_hidden_states=True, + ) + + hidden_states = outputs.hidden_states[-1] + split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] + max_seq_len = max([e.size(0) for e in split_hidden_states]) + prompt_embeds = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states] + ) + encoder_attention_mask = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(device=device) + return prompt_embeds, encoder_attention_mask + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents def retrieve_latents( encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" @@ -266,6 +321,83 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep): + model_name = "qwenimage" + + def __init__( + self, + input_name: str = "image", + output_name: str = "resized_image", + vae_image_output_name: str = "vae_image", + ): + """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio. 
+ + This block resizes an input image or a list input images and exposes the resized result under configurable + input and output names. Use this when you need to wire the resize step to different image fields (e.g., + "image", "control_image") + + Args: + input_name (str, optional): Name of the image field to read from the + pipeline state. Defaults to "image". + output_name (str, optional): Name of the resized image field to write + back to the pipeline state. Defaults to "resized_image". + vae_image_output_name (str, optional): Name of the image field + to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus + processes the input image(s) differently for the VL and the VAE. + """ + if not isinstance(input_name, str) or not isinstance(output_name, str): + raise ValueError( + f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" + ) + self.condition_image_size = 384 * 384 + self._image_input_name = input_name + self._resized_image_output_name = output_name + self._vae_image_output_name = vae_image_output_name + super().__init__() + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return super().intermediate_outputs + [ + OutputParam( + name=self._vae_image_output_name, + type_hint=List[PIL.Image.Image], + description="The images to be processed which will be further used by the VAE encoder.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + images = getattr(block_state, self._image_input_name) + + if not is_valid_image_imagelist(images): + raise ValueError(f"Images must be image or list of images but are {type(images)}") + + if ( + not isinstance(images, torch.Tensor) + and isinstance(images, PIL.Image.Image) + and not isinstance(images, list) + ): + images = [images] + + # TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s + condition_images = [] + vae_images = [] + for img in images: + image_width, image_height = img.size + condition_width, condition_height, _ = calculate_dimensions( + self.condition_image_size, image_width / image_height + ) + condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width)) + vae_images.append(img) + + setattr(block_state, self._resized_image_output_name, condition_images) + setattr(block_state, self._vae_image_output_name, vae_images) + self.set_block_state(state, block_state) + return components, state + + class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -511,6 +643,61 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep): + model_name = "qwenimage" + + @property + def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec( + name="prompt_template_encode", + default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", + ), + ConfigSpec( + name="img_template_encode", + default="Picture {}: <|vision_start|><|image_pad|><|vision_end|>", + ), + ConfigSpec(name="prompt_template_encode_start_idx", default=64), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + self.check_inputs(block_state.prompt, block_state.negative_prompt) + + device = components._execution_device + + block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds_edit_plus( + components.text_encoder, + components.processor, + prompt=block_state.prompt, + image=block_state.resized_image, + prompt_template_encode=components.config.prompt_template_encode, + img_template_encode=components.config.img_template_encode, + prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + device=device, + ) + + if components.requires_unconditional_embeds: + negative_prompt = block_state.negative_prompt or " " + block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = ( + get_qwen_prompt_embeds_edit_plus( + components.text_encoder, + components.processor, + prompt=negative_prompt, + image=block_state.resized_image, + prompt_template_encode=components.config.prompt_template_encode, + img_template_encode=components.config.img_template_encode, + prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + device=device, + ) + ) + + self.set_block_state(state, block_state) + return components, state + + class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -612,12 +799,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam("resized_image"), - InputParam("image"), - InputParam("height"), - InputParam("width"), - ] + return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -661,6 +843,47 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep): + model_name = "qwenimage-edit-plus" + vae_image_size = 1024 * 1024 + + @property + def description(self) -> str: + return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing." 
+ + @property + def inputs(self) -> List[InputParam]: + return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + if block_state.vae_image is None and block_state.image is None: + raise ValueError("`vae_image` and `image` cannot be None at the same time") + + if block_state.vae_image is None: + image = block_state.image + self.check_inputs( + height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + ) + height = block_state.height or components.default_height + width = block_state.width or components.default_width + block_state.processed_image = components.image_processor.preprocess( + image=image, height=height, width=width + ) + else: + width, height = block_state.vae_image[0].size + image = block_state.vae_image + + block_state.processed_image = components.image_processor.preprocess( + image=image, height=height, width=width + ) + + self.set_block_state(state, block_state) + return components, state + + class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -738,7 +961,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - dtype=dtype, latent_channels=components.num_channels_latents, ) - setattr(block_state, self._image_latents_output_name, image_latents) self.set_block_state(state, block_state) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py index 9126766cc202..83bfcb3da4fd 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py @@ -37,6 +37,9 @@ ) from .encoders import ( QwenImageControlNetVaeEncoderStep, + QwenImageEditPlusProcessImagesInputStep, + QwenImageEditPlusResizeDynamicStep, + QwenImageEditPlusTextEncoderStep, QwenImageEditResizeDynamicStep, QwenImageEditTextEncoderStep, QwenImageInpaintProcessImagesInputStep, @@ -872,7 +875,151 @@ def description(self): ) -# 3. all block presets supported in QwenImage & QwenImage-Edit +#################### QwenImage Edit Plus ##################### + +# 3. QwenImage-Edit Plus + +## 3.1 QwenImage-Edit Plus / edit + +#### QwenImage-Edit Plus vl encoder: take both image and text prompts +QwenImageEditPlusVLEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditPlusResizeDynamicStep()), + ("encode", QwenImageEditPlusTextEncoderStep()), + ] +) + + +class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditPlusVLEncoderBlocks.values() + block_names = QwenImageEditPlusVLEncoderBlocks.keys() + + @property + def description(self) -> str: + return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together."
+ + +#### QwenImage-Edit Plus vae encoder +QwenImageEditPlusVaeEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step + ("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image + ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents + ] +) + + +class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditPlusVaeEncoderBlocks.values() + block_names = QwenImageEditPlusVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that encodes the image inputs into their latent representations." + + +#### QwenImage Edit Plus presets +EDIT_PLUS_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditPlusVLEncoderStep()), + ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), + ("input", QwenImageEditInputStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + ("denoise", QwenImageEditDenoiseStep()), + ("decode", QwenImageDecodeStep()), + ] +) + + +# auto before_denoise step for edit tasks +class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = [QwenImageEditBeforeDenoiseStep] + block_names = ["edit"] + block_trigger_inputs = ["image_latents"] + + @property + def description(self): + return ( + "Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" + + "This is an auto pipeline block that works for edit (img2img) task.\n" + + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" + + " - if `image_latents` is not provided, step will be skipped." + ) + + +## 3.2 QwenImage-Edit Plus/auto encoders + + +class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [ + QwenImageEditPlusVaeEncoderStep, + ] + block_names = ["edit"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encodes the image inputs into their latent representations. \n" + " This is an auto pipeline block that works for edit task.\n" + + " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n" + + " - if `image` is not provided, step will be skipped." + ) + + +## 3.3 QwenImage-Edit/auto blocks & presets + + +class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = [ + QwenImageEditAutoInputStep, + QwenImageEditPlusAutoBeforeDenoiseStep, + QwenImageEditAutoDenoiseStep, + ] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support edit (img2img) workflow for QwenImage Edit Plus:\n" + + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" + ) + + +EDIT_PLUS_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditPlusVLEncoderStep()), + ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()), + ("denoise", QwenImageEditPlusCoreDenoiseStep()), + ("decode", QwenImageAutoDecodeStep()), + ] +) + + +class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = EDIT_PLUS_AUTO_BLOCKS.values() + block_names = EDIT_PLUS_AUTO_BLOCKS.keys() + + @property + def description(self): + return ( + "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n" + + "- for edit (img2img) generation, you need to provide `image`\n" + ) + + +# 3. all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus ALL_BLOCKS = { @@ -880,8 +1027,10 @@ def description(self): "img2img": IMAGE2IMAGE_BLOCKS, "edit": EDIT_BLOCKS, "edit_inpaint": EDIT_INPAINT_BLOCKS, + "edit_plus": EDIT_PLUS_BLOCKS, "inpaint": INPAINT_BLOCKS, "controlnet": CONTROLNET_BLOCKS, "auto": AUTO_BLOCKS, "edit_auto": EDIT_AUTO_BLOCKS, + "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, } diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index 7200169923a5..d9e30864f660 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -196,3 +196,13 @@ def requires_unconditional_embeds(self): requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1 return requires_unconditional_embeds + + +class QwenImageEditPlusModularPipeline(QwenImageEditModularPipeline): + """ + A ModularPipeline for QwenImage-Edit Plus. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "QwenImageEditPlusAutoBlocks" diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index d265bfdcaf3d..8a32d4c367a3 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -95,6 +95,7 @@ QwenImageControlNetPipeline, QwenImageEditInpaintPipeline, QwenImageEditPipeline, + QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, QwenImagePipeline, @@ -186,6 +187,7 @@ ("flux-kontext", FluxKontextPipeline), ("qwenimage", QwenImageImg2ImgPipeline), ("qwenimage-edit", QwenImageEditPipeline), + ("qwenimage-edit-plus", QwenImageEditPlusPipeline), ] ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index bb8fea8c8a8b..cf8037796488 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -77,6 +77,36 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class QwenImageEditPlusAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class QwenImageEditPlusModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class QwenImageModularPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From ce90f9b2db9f459f463a3239dc4c24a04072fd43 Mon Sep 17 00:00:00 2001 From: SahilCarterr <110806554+SahilCarterr@users.noreply.github.com> Date: Mon, 6 Oct 2025 08:24:54 +0530 Subject: [PATCH 38/69] [FIX] Text to image training peft version (#12434) Fix peft error --- examples/text_to_image/requirements.txt | 2 +- examples/text_to_image/requirements_sdxl.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/text_to_image/requirements.txt b/examples/text_to_image/requirements.txt index c3ffa42f0edc..be05fe3fcdc5 100644 --- a/examples/text_to_image/requirements.txt +++ b/examples/text_to_image/requirements.txt @@ -5,4 +5,4 @@ datasets>=2.19.1 ftfy tensorboard Jinja2 -peft==0.7.0 +peft>=0.17.0 diff --git a/examples/text_to_image/requirements_sdxl.txt b/examples/text_to_image/requirements_sdxl.txt index 64cbc9205fd0..4dacc26ce4bb 100644 --- a/examples/text_to_image/requirements_sdxl.txt +++ b/examples/text_to_image/requirements_sdxl.txt @@ -5,4 +5,4 @@ ftfy tensorboard Jinja2 datasets -peft==0.7.0 \ No newline at end of file +peft>=0.17.0 \ No newline at end of file From 7f3e9b8695e80aa1f7a5ee55d025eeb8ee795602 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 6 Oct 2025 13:15:54 +0530 Subject: [PATCH 39/69] make flux ready for mellon (#12419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * make flux ready for mellon * up * Apply suggestions from code review Co-authored-by: Álvaro Somoza --------- 
Co-authored-by: Álvaro Somoza --- .../modular_pipelines/flux/before_denoise.py | 4 +++ .../modular_pipelines/flux/encoders.py | 5 +++ .../modular_pipelines/flux/modular_blocks.py | 31 ++++++++++++++----- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py index 4272066309a2..95858fbf6eb0 100644 --- a/src/diffusers/modular_pipelines/flux/before_denoise.py +++ b/src/diffusers/modular_pipelines/flux/before_denoise.py @@ -252,11 +252,13 @@ def inputs(self) -> List[InputParam]: InputParam( "prompt_embeds", required=True, + kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="Pre-generated text embeddings. Can be generated from text_encoder step.", ), InputParam( "pooled_prompt_embeds", + kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.", ), @@ -279,11 +281,13 @@ def intermediate_outputs(self) -> List[str]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", description="text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", description="pooled text embeddings used to guide the image generation", ), # TODO: support negative embeddings? diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py index 8c49990280ac..16ddecbadb4f 100644 --- a/src/diffusers/modular_pipelines/flux/encoders.py +++ b/src/diffusers/modular_pipelines/flux/encoders.py @@ -181,6 +181,7 @@ def inputs(self) -> List[InputParam]: return [ InputParam("prompt"), InputParam("prompt_2"), + InputParam("max_sequence_length", type_hint=int, default=512, required=False), InputParam("joint_attention_kwargs"), ] @@ -189,16 +190,19 @@ def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( "prompt_embeds", + kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", + kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="pooled text embeddings used to guide the image generation", ), OutputParam( "text_ids", + kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="ids from the text sequence for RoPE", ), @@ -404,6 +408,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip pooled_prompt_embeds=None, device=block_state.device, num_images_per_prompt=1, # TODO: hardcoded for now. 
+ max_sequence_length=block_state.max_sequence_length, lora_scale=block_state.text_encoder_lora_scale, ) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py index 37895bddbf07..ca4f993a11fe 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks.py @@ -84,9 +84,9 @@ def description(self): # before_denoise: all task (text2img, img2img) class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxBeforeDenoiseStep, FluxImg2ImgBeforeDenoiseStep] - block_names = ["text2image", "img2img"] - block_trigger_inputs = [None, "image_latents"] + block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] @property def description(self): @@ -124,16 +124,32 @@ def description(self): return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`" +class FluxCoreDenoiseStep(SequentialPipelineBlocks): + block_classes = [FluxInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. \n" + + " - `FluxInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" + + "This step support text-to-image and image-to-image tasks for Flux:\n" + + " - for image-to-image generation, you need to provide `image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings" + ) + + # text2image class FluxAutoBlocks(SequentialPipelineBlocks): block_classes = [ FluxTextEncoderStep, FluxAutoVaeEncoderStep, - FluxAutoBeforeDenoiseStep, - FluxAutoDenoiseStep, + FluxCoreDenoiseStep, FluxAutoDecodeStep, ] - block_names = ["text_encoder", "image_encoder", "before_denoise", "denoise", "decoder"] + block_names = ["text_encoder", "image_encoder", "denoise", "decode"] @property def description(self): @@ -171,8 +187,7 @@ def description(self): [ ("text_encoder", FluxTextEncoderStep), ("image_encoder", FluxAutoVaeEncoderStep), - ("before_denoise", FluxAutoBeforeDenoiseStep), - ("denoise", FluxAutoDenoiseStep), + ("denoise", FluxCoreDenoiseStep), ("decode", FluxAutoDecodeStep), ] ) From cf4b97b2337ea0144f35ed3ec9146e27ba7a44e8 Mon Sep 17 00:00:00 2001 From: Charles Date: Mon, 6 Oct 2025 17:45:34 +0200 Subject: [PATCH 40/69] [perf] Cache version checks (#12399) --- src/diffusers/utils/import_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 9399ccd2a7a3..97065267b004 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -21,6 +21,7 @@ import os import sys from collections import OrderedDict, defaultdict +from functools import lru_cache as cache from itertools import chain from types import ModuleType from typing import Any, Tuple, Union @@ -673,6 +674,7 @@ def compare_versions(library_or_version: Union[str, Version], operation: str, re # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338 +@cache def is_torch_version(operation: str, 
version: str): """ Compares the current PyTorch version to a given reference with an operation. @@ -686,6 +688,7 @@ def is_torch_version(operation: str, version: str): return compare_versions(parse(_torch_version), operation, version) +@cache def is_torch_xla_version(operation: str, version: str): """ Compares the current torch_xla version to a given reference with an operation. @@ -701,6 +704,7 @@ def is_torch_xla_version(operation: str, version: str): return compare_versions(parse(_torch_xla_version), operation, version) +@cache def is_transformers_version(operation: str, version: str): """ Compares the current Transformers version to a given reference with an operation. @@ -716,6 +720,7 @@ def is_transformers_version(operation: str, version: str): return compare_versions(parse(_transformers_version), operation, version) +@cache def is_hf_hub_version(operation: str, version: str): """ Compares the current Hugging Face Hub version to a given reference with an operation. @@ -731,6 +736,7 @@ def is_hf_hub_version(operation: str, version: str): return compare_versions(parse(_hf_hub_version), operation, version) +@cache def is_accelerate_version(operation: str, version: str): """ Compares the current Accelerate version to a given reference with an operation. @@ -746,6 +752,7 @@ def is_accelerate_version(operation: str, version: str): return compare_versions(parse(_accelerate_version), operation, version) +@cache def is_peft_version(operation: str, version: str): """ Compares the current PEFT version to a given reference with an operation. @@ -761,6 +768,7 @@ def is_peft_version(operation: str, version: str): return compare_versions(parse(_peft_version), operation, version) +@cache def is_bitsandbytes_version(operation: str, version: str): """ Args: @@ -775,6 +783,7 @@ def is_bitsandbytes_version(operation: str, version: str): return compare_versions(parse(_bitsandbytes_version), operation, version) +@cache def is_gguf_version(operation: str, version: str): """ Compares the current Accelerate version to a given reference with an operation. @@ -790,6 +799,7 @@ def is_gguf_version(operation: str, version: str): return compare_versions(parse(_gguf_version), operation, version) +@cache def is_torchao_version(operation: str, version: str): """ Compares the current torchao version to a given reference with an operation. @@ -805,6 +815,7 @@ def is_torchao_version(operation: str, version: str): return compare_versions(parse(_torchao_version), operation, version) +@cache def is_k_diffusion_version(operation: str, version: str): """ Compares the current k-diffusion version to a given reference with an operation. @@ -820,6 +831,7 @@ def is_k_diffusion_version(operation: str, version: str): return compare_versions(parse(_k_diffusion_version), operation, version) +@cache def is_optimum_quanto_version(operation: str, version: str): """ Compares the current Accelerate version to a given reference with an operation. @@ -835,6 +847,7 @@ def is_optimum_quanto_version(operation: str, version: str): return compare_versions(parse(_optimum_quanto_version), operation, version) +@cache def is_nvidia_modelopt_version(operation: str, version: str): """ Compares the current Nvidia ModelOpt version to a given reference with an operation. 
@@ -850,6 +863,7 @@ def is_nvidia_modelopt_version(operation: str, version: str): return compare_versions(parse(_nvidia_modelopt_version), operation, version) +@cache def is_xformers_version(operation: str, version: str): """ Compares the current xformers version to a given reference with an operation. @@ -865,6 +879,7 @@ def is_xformers_version(operation: str, version: str): return compare_versions(parse(_xformers_version), operation, version) +@cache def is_sageattention_version(operation: str, version: str): """ Compares the current sageattention version to a given reference with an operation. @@ -880,6 +895,7 @@ def is_sageattention_version(operation: str, version: str): return compare_versions(parse(_sageattention_version), operation, version) +@cache def is_flash_attn_version(operation: str, version: str): """ Compares the current flash-attention version to a given reference with an operation. From 0974b4c6067165434fa715654b355b41beb5fceb Mon Sep 17 00:00:00 2001 From: Changseop Yeom <89627517+braintrue@users.noreply.github.com> Date: Tue, 7 Oct 2025 06:24:05 +0900 Subject: [PATCH 41/69] [i18n-KO] Fix typo and update translation in ethical_guidelines.md (#12435) --- .../ko/conceptual/ethical_guidelines.md | 42 +++++++++---------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/docs/source/ko/conceptual/ethical_guidelines.md b/docs/source/ko/conceptual/ethical_guidelines.md index b8c55048bf27..63fc4a7741de 100644 --- a/docs/source/ko/conceptual/ethical_guidelines.md +++ b/docs/source/ko/conceptual/ethical_guidelines.md @@ -14,51 +14,47 @@ specific language governing permissions and limitations under the License. ## 서문 [[preamble]] -[Diffusers](https://huggingface.co/docs/diffusers/index)는 사전 훈련된 diffusion 모델을 제공하며 추론 및 훈련을 위한 모듈식 툴박스로 사용됩니다. +[Diffusers](https://huggingface.co/docs/diffusers/index)는 사전 훈련된 diffusion 모델을 제공하며, 추론과 훈련을 위한 모듈형 툴박스로 활용됩니다. -이 기술의 실제 적용과 사회에 미칠 수 있는 부정적인 영향을 고려하여 Diffusers 라이브러리의 개발, 사용자 기여 및 사용에 윤리 지침을 제공하는 것이 중요하다고 생각합니다. - -이이 기술을 사용함에 따른 위험은 여전히 검토 중이지만, 몇 가지 예를 들면: 예술가들에 대한 저작권 문제; 딥 페이크의 악용; 부적절한 맥락에서의 성적 콘텐츠 생성; 동의 없는 사칭; 소수자 집단의 억압을 영속화하는 유해한 사회적 편견 등이 있습니다. - -우리는 위험을 지속적으로 추적하고 커뮤니티의 응답과 소중한 피드백에 따라 다음 지침을 조정할 것입니다. +이 기술의 실제 적용 사례와 사회에 미칠 수 있는 잠재적 부정적 영향을 고려할 때, Diffusers 라이브러리의 개발, 사용자 기여, 사용에 윤리 지침을 제공하는 것이 중요하다고 생각합니다. +이 기술 사용과 관련된 위험은 여전히 검토 중이지만, 예를 들면: 예술가의 저작권 문제, 딥페이크 악용, 부적절한 맥락에서의 성적 콘텐츠 생성, 비동의 사칭, 소수자 집단 억압을 영속화하는 유해한 사회적 편견 등이 있습니다. +우리는 이러한 위험을 지속적으로 추적하고, 커뮤니티의 반응과 소중한 피드백에 따라 아래 지침을 조정할 것입니다. ## 범위 [[scope]] -Diffusers 커뮤니티는 프로젝트의 개발에 다음과 같은 윤리 지침을 적용하며, 특히 윤리적 문제와 관련된 민감한 주제에 대한 커뮤니티의 기여를 조정하는 데 도움을 줄 것입니다. - +Diffusers 커뮤니티는 프로젝트 개발에 다음 윤리 지침을 적용하며, 특히 윤리적 문제와 관련된 민감한 주제에 대해 커뮤니티의 기여를 조정하는 데 도움을 줄 것입니다. ## 윤리 지침 [[ethical-guidelines]] -다음 윤리 지침은 일반적으로 적용되지만, 민감한 윤리적 문제와 관련하여 기술적 선택을 할 때 이를 우선적으로 적용할 것입니다. 나아가, 해당 기술의 최신 동향과 관련된 새로운 위험이 발생함에 따라 이러한 윤리 원칙을 조정할 것을 약속드립니다. - -- **투명성**: 우리는 PR을 관리하고, 사용자에게 우리의 선택을 설명하며, 기술적 의사결정을 내릴 때 투명성을 유지할 것을 약속합니다. +다음 윤리 지침은 일반적으로 적용되지만, 윤리적으로 민감한 문제와 관련된 기술적 선택을 할 때 우선적으로 적용됩니다. 또한, 해당 기술의 최신 동향과 관련된 새로운 위험이 발생함에 따라 이러한 윤리 원칙을 지속적으로 조정할 것을 약속합니다. -- **일관성**: 우리는 프로젝트 관리에서 사용자들에게 동일한 수준의 관심을 보장하고 기술적으로 안정되고 일관된 상태를 유지할 것을 약속합니다. +- **투명성**: 우리는 PR 관리, 사용자에게 선택의 이유 설명, 기술적 의사결정 과정에서 투명성을 유지할 것을 약속합니다. -- **간결성**: Diffusers 라이브러리를 사용하고 활용하기 쉽게 만들기 위해, 프로젝트의 목표를 간결하고 일관성 있게 유지할 것을 약속합니다. +- **일관성**: 프로젝트 관리에서 모든 사용자에게 동일한 수준의 관심을 보장하고, 기술적으로 안정적이고 일관된 상태를 유지할 것을 약속합니다. -- **접근성**: Diffusers 프로젝트는 기술적 전문 지식 없어도 프로젝트 운영에 참여할 수 있는 기여자의 진입장벽을 낮춥니다. 
이를 통해 연구 결과물이 커뮤니티에 더 잘 접근할 수 있게 됩니다. +- **간결성**: Diffusers 라이브러리를 쉽게 사용하고 활용할 수 있도록, 프로젝트의 목표를 간결하고 일관성 있게 유지할 것을 약속합니다. -- **재현성**: 우리는 Diffusers 라이브러리를 통해 제공되는 업스트림(upstream) 코드, 모델 및 데이터셋의 재현성에 대해 투명하게 공개할 것을 목표로 합니다. +- **접근성**: Diffusers 프로젝트는 기술적 전문지식이 없어도 기여할 수 있도록 진입장벽을 낮춥니다. 이를 통해 연구 결과물이 커뮤니티에 더 잘 접근될 수 있습니다. -- **책임**: 우리는 커뮤니티와 팀워크를 통해, 이 기술의 잠재적인 위험과 위험을 예측하고 완화하는 데 대한 공동 책임을 가지고 있습니다. +- **재현성**: 우리는 Diffusers 라이브러리를 통해 제공되는 업스트림 코드, 모델, 데이터셋의 재현성에 대해 투명하게 공개하는 것을 목표로 합니다. +- **책임**: 커뮤니티와 팀워크를 통해, 이 기술의 잠재적 위험을 예측하고 완화하는 데 공동 책임을 집니다. ## 구현 사례: 안전 기능과 메커니즘 [[examples-of-implementations-safety-features-and-mechanisms]] -팀은 diffusion 기술과 관련된 잠재적인 윤리 및 사회적 위험에 대처하기 위한 기술적 및 비기술적 도구를 제공하고자 하고 있습니다. 또한, 커뮤니티의 참여는 이러한 기능의 구현하고 우리와 함께 인식을 높이는 데 매우 중요합니다. +팀은 diffusion 기술과 관련된 잠재적 윤리 및 사회적 위험에 대응하기 위해 기술적·비기술적 도구를 제공하고자 노력하고 있습니다. 또한, 커뮤니티의 참여는 이러한 기능 구현과 인식 제고에 매우 중요합니다. -- [**커뮤니티 탭**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): 이를 통해 커뮤니티는 프로젝트에 대해 토론하고 더 나은 협력을 할 수 있습니다. +- [**커뮤니티 탭**](https://huggingface.co/docs/hub/repositories-pull-requests-discussions): 커뮤니티가 프로젝트에 대해 토론하고 더 나은 협업을 할 수 있도록 지원합니다. -- **편향 탐색 및 평가**: Hugging Face 팀은 Stable Diffusion 모델의 편향성을 대화형으로 보여주는 [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer)을 제공합니다. 이런 의미에서, 우리는 편향 탐색 및 평가를 지원하고 장려합니다. +- **편향 탐색 및 평가**: Hugging Face 팀은 Stable Diffusion 모델의 편향성을 대화형으로 보여주는 [space](https://huggingface.co/spaces/society-ethics/DiffusionBiasExplorer)를 제공합니다. 우리는 이러한 편향 탐색과 평가를 지원하고 장려합니다. - **배포에서의 안전 유도** - - [**안전한 Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): 이는 필터되지 않은 웹 크롤링 데이터셋으로 훈련된 Stable Diffusion과 같은 모델이 부적절한 변질에 취약한 문제를 완화합니다. 관련 논문: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105). + - [**안전한 Stable Diffusion**](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_safe): 필터링되지 않은 웹 크롤링 데이터셋으로 훈련된 Stable Diffusion과 같은 모델이 부적절하게 변질되는 문제를 완화합니다. 관련 논문: [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://huggingface.co/papers/2211.05105). - - [**안전 검사기**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): 이미지가 생성된 후에 이미자가 임베딩 공간에서 일련의 하드코딩된 유해 개념의 클래스일 확률을 확인하고 비교합니다. 유해 개념은 역공학을 방지하기 위해 의도적으로 숨겨져 있습니다. + - [**안전 검사기**](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py): 생성된 이미지가 임베딩 공간에서 하드코딩된 유해 개념 클래스와 일치할 확률을 확인하고 비교합니다. 유해 개념은 역공학을 방지하기 위해 의도적으로 숨겨져 있습니다. -- **Hub에서의 단계적인 배포**: 특히 민감한 상황에서는 일부 리포지토리에 대한 접근을 제한해야 합니다. 이 단계적인 배포는 중간 단계로, 리포지토리 작성자가 사용에 대한 더 많은 통제력을 갖게 합니다. +- **Hub에서의 단계적 배포**: 특히 민감한 상황에서는 일부 리포지토리에 대한 접근을 제한할 수 있습니다. 단계적 배포는 리포지토리 작성자가 사용에 대해 더 많은 통제권을 갖도록 하는 중간 단계입니다. -- **라이선싱**: [OpenRAILs](https://huggingface.co/blog/open_rail)와 같은 새로운 유형의 라이선싱을 통해 자유로운 접근을 보장하면서도 더 책임 있는 사용을 위한 일련의 제한을 둘 수 있습니다. +- **라이선싱**: [OpenRAILs](https://huggingface.co/blog/open_rail)와 같은 새로운 유형의 라이선스를 통해 자유로운 접근을 보장하면서도 보다 책임 있는 사용을 위한 일련의 제한을 둘 수 있습니다. 
From 2d69bacb007bc3e21f2cb9c7b4dd89fb71ccdcf8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 7 Oct 2025 13:51:20 +0530 Subject: [PATCH 42/69] handle offload_state_dict when initing transformers models (#12438) --- src/diffusers/pipelines/pipeline_loading_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index b7a3e08105ff..dd542145d3fa 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -838,6 +838,9 @@ def load_sub_model( else: loading_kwargs["low_cpu_mem_usage"] = False + if is_transformers_model and is_transformers_version(">=", "4.57.0"): + loading_kwargs.pop("offload_state_dict") + if ( quantization_config is not None and isinstance(quantization_config, PipelineQuantizationConfig) From de03851e2f9cfd75b797a8d271798f9fa59fccd7 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 7 Oct 2025 14:20:17 +0000 Subject: [PATCH 43/69] update doc --- docs/source/en/api/pipelines/mirage.md | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/docs/source/en/api/pipelines/mirage.md b/docs/source/en/api/pipelines/mirage.md index 3383bbecae2a..f0117795a989 100644 --- a/docs/source/en/api/pipelines/mirage.md +++ b/docs/source/en/api/pipelines/mirage.md @@ -22,18 +22,12 @@ Mirage is a text-to-image diffusion model using a transformer-based architecture Key features: -- **Transformer Architecture**: Uses a modern transformer-based denoising model with attention mechanisms optimized for image generation -- **Flow Matching**: Employs flow matching with Euler discrete scheduling for efficient sampling +- **Simplified MMDIT architecture**: Uses a simplified MMDIT architecture for image generation where text tokens are not updated through the transformer blocks +- **Flow Matching**: Employs flow matching with discrete scheduling for efficient sampling - **Flexible VAE Support**: Compatible with both Flux VAE (8x compression, 16 latent channels) and DC-AE (32x compression, 32 latent channels) -- **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding with strong text-image alignment +- **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding offering multiple language support - **Efficient Architecture**: ~1.3B parameters in the transformer, enabling fast inference while maintaining quality -- **Modular Design**: Text encoder and VAE weights are loaded from HuggingFace, keeping checkpoint sizes small - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
- - ## Loading the Pipeline @@ -46,7 +40,7 @@ from diffusers import MiragePipeline pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") pipe.to("cuda") -prompt = "A digital painting of a rusty, vintage tram on a sandy beach" +prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light" image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] image.save("mirage_output.png") ``` @@ -123,11 +117,11 @@ Key parameters for image generation: ```py # Example with custom parameters image = pipe( - prompt="A serene mountain landscape at sunset", + prompt="A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light", num_inference_steps=28, guidance_scale=4.0, - height=1024, - width=1024, + height=512, + width=512, generator=torch.Generator("cuda").manual_seed(42) ).images[0] ``` From a69aa4bb5bd6d4b5d5a0c6611e7c014df683b4a4 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 7 Oct 2025 14:25:27 +0000 Subject: [PATCH 44/69] rename model to photon --- .../en/api/pipelines/{mirage.md => photon.md} | 34 ++++++++-------- ...sers.py => convert_photon_to_diffusers.py} | 34 ++++++++-------- src/diffusers/__init__.py | 2 +- src/diffusers/models/__init__.py | 2 +- src/diffusers/models/attention_processor.py | 20 +++++----- src/diffusers/models/transformers/__init__.py | 2 +- ...former_mirage.py => transformer_photon.py} | 14 +++---- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/mirage/__init__.py | 5 --- src/diffusers/pipelines/photon/__init__.py | 5 +++ .../{mirage => photon}/pipeline_output.py | 4 +- .../pipeline_photon.py} | 40 +++++++++---------- ...e.py => test_models_transformer_photon.py} | 14 +++---- 13 files changed, 89 insertions(+), 89 deletions(-) rename docs/source/en/api/pipelines/{mirage.md => photon.md} (86%) rename scripts/{convert_mirage_to_diffusers.py => convert_photon_to_diffusers.py} (92%) rename src/diffusers/models/transformers/{transformer_mirage.py => transformer_photon.py} (99%) delete mode 100644 src/diffusers/pipelines/mirage/__init__.py create mode 100644 src/diffusers/pipelines/photon/__init__.py rename src/diffusers/pipelines/{mirage => photon}/pipeline_output.py (93%) rename src/diffusers/pipelines/{mirage/pipeline_mirage.py => photon/pipeline_photon.py} (95%) rename tests/models/transformers/{test_models_transformer_mirage.py => test_models_transformer_photon.py} (95%) diff --git a/docs/source/en/api/pipelines/mirage.md b/docs/source/en/api/pipelines/photon.md similarity index 86% rename from docs/source/en/api/pipelines/mirage.md rename to docs/source/en/api/pipelines/photon.md index f0117795a989..f8f7098545f8 100644 --- a/docs/source/en/api/pipelines/mirage.md +++ b/docs/source/en/api/pipelines/photon.md @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. --> -# MiragePipeline +# PhotonPipeline
LoRA
-Mirage is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. +Photon is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. Key features: @@ -31,18 +31,18 @@ Key features: ## Loading the Pipeline -Mirage checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization: +Photon checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization: ```py -from diffusers import MiragePipeline +from diffusers import PhotonPipeline # Load pipeline - VAE and text encoder will be loaded from HuggingFace -pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") +pipe = PhotonPipeline.from_pretrained("path/to/photon_checkpoint") pipe.to("cuda") prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light" image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] -image.save("mirage_output.png") +image.save("photon_output.png") ``` ### Manual Component Loading @@ -51,14 +51,14 @@ You can also load components individually: ```py import torch -from diffusers import MiragePipeline +from diffusers import PhotonPipeline from diffusers.models import AutoencoderKL, AutoencoderDC -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from transformers import T5GemmaModel, GemmaTokenizerFast # Load transformer -transformer = MirageTransformer2DModel.from_pretrained( +transformer = PhotonTransformer2DModel.from_pretrained( "path/to/checkpoint", subfolder="transformer" ) @@ -78,7 +78,7 @@ vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="v # Or DC-AE (32 latent channels): # vae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers") -pipe = MiragePipeline( +pipe = PhotonPipeline( transformer=transformer, scheduler=scheduler, text_encoder=text_encoder, @@ -90,7 +90,7 @@ pipe.to("cuda") ## VAE Variants -Mirage supports two VAE configurations: +Photon supports two VAE configurations: ### Flux VAE (AutoencoderKL) - **Compression**: 8x spatial compression @@ -132,21 +132,21 @@ For memory-constrained environments: ```py import torch -from diffusers import MiragePipeline +from diffusers import PhotonPipeline -pipe = MiragePipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) +pipe = PhotonPipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) pipe.enable_model_cpu_offload() # Offload components to CPU when not in use # Or use sequential CPU offload for even lower memory pipe.enable_sequential_cpu_offload() ``` -## MiragePipeline +## PhotonPipeline -[[autodoc]] MiragePipeline +[[autodoc]] PhotonPipeline - all - __call__ -## MiragePipelineOutput +## PhotonPipelineOutput -[[autodoc]] 
pipelines.mirage.pipeline_output.MiragePipelineOutput +[[autodoc]] pipelines.photon.pipeline_output.PhotonPipelineOutput diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_photon_to_diffusers.py similarity index 92% rename from scripts/convert_mirage_to_diffusers.py rename to scripts/convert_photon_to_diffusers.py index 37de253d1448..ad04463e019f 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_photon_to_diffusers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Script to convert Mirage checkpoint from original codebase to diffusers format. +Script to convert Photon checkpoint from original codebase to diffusers format. """ import argparse @@ -16,14 +16,14 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel -from diffusers.pipelines.mirage import MiragePipeline +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel +from diffusers.pipelines.photon import PhotonPipeline DEFAULT_HEIGHT = 512 DEFAULT_WIDTH = 512 @dataclass(frozen=True) -class MirageBase: +class PhotonBase: context_in_dim: int = 2304 hidden_size: int = 1792 mlp_ratio: float = 3.5 @@ -36,22 +36,22 @@ class MirageBase: @dataclass(frozen=True) -class MirageFlux(MirageBase): +class PhotonFlux(PhotonBase): in_channels: int = 16 patch_size: int = 2 @dataclass(frozen=True) -class MirageDCAE(MirageBase): +class PhotonDCAE(PhotonBase): in_channels: int = 32 patch_size: int = 1 def build_config(vae_type: str) -> dict: if vae_type == "flux": - cfg = MirageFlux() + cfg = PhotonFlux() elif vae_type == "dc-ae": - cfg = MirageDCAE() + cfg = PhotonDCAE() else: raise ValueError(f"Unsupported VAE type: {vae_type}. Use 'flux' or 'dc-ae'") @@ -125,8 +125,8 @@ def convert_checkpoint_parameters(old_state_dict: Dict[str, torch.Tensor], depth return converted_state_dict -def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> MirageTransformer2DModel: - """Create and load MirageTransformer2DModel from old checkpoint.""" +def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> PhotonTransformer2DModel: + """Create and load PhotonTransformer2DModel from old checkpoint.""" print(f"Loading checkpoint from: {checkpoint_path}") @@ -154,8 +154,8 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi converted_state_dict = convert_checkpoint_parameters(state_dict, depth=model_depth) # Create transformer with config - print("Creating MirageTransformer2DModel...") - transformer = MirageTransformer2DModel(**config) + print("Creating PhotonTransformer2DModel...") + transformer = PhotonTransformer2DModel(**config) # Load state dict print("Loading converted parameters...") @@ -212,13 +212,13 @@ def create_model_index(vae_type: str, output_path: str): text_model_name = "google/t5gemma-2b-2b-ul2" model_index = { - "_class_name": "MiragePipeline", + "_class_name": "PhotonPipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], "text_encoder": text_model_name, "tokenizer": text_model_name, - "transformer": ["diffusers", "MirageTransformer2DModel"], + "transformer": ["diffusers", "PhotonTransformer2DModel"], "vae": vae_model_name, "vae_subfolder": vae_subfolder, "default_height": default_height, @@ -262,7 +262,7 @@ def main(args): # Verify the pipeline can be loaded try: - pipeline = 
MiragePipeline.from_pretrained(args.output_path) + pipeline = PhotonPipeline.from_pretrained(args.output_path) print("Pipeline loaded successfully!") print(f"Transformer: {type(pipeline.transformer).__name__}") print(f"VAE: {type(pipeline.vae).__name__}") @@ -285,10 +285,10 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Convert Mirage checkpoint to diffusers format") + parser = argparse.ArgumentParser(description="Convert Photon checkpoint to diffusers format") parser.add_argument( - "--checkpoint_path", type=str, required=True, help="Path to the original Mirage checkpoint (.pth file)" + "--checkpoint_path", type=str, required=True, help="Path to the original Photon checkpoint (.pth file)" ) parser.add_argument( diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 6fc6ac5f3ebd..13b0ac8d64b0 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -224,7 +224,7 @@ "LTXVideoTransformer3DModel", "Lumina2Transformer2DModel", "LuminaNextDiT2DModel", - "MirageTransformer2DModel", + "PhotonTransformer2DModel", "MochiTransformer3DModel", "ModelMixin", "MotionAdapter", diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 279e69216b1b..86e32c1eec3e 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -93,7 +93,7 @@ _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"] _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"] _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"] - _import_structure["transformers.transformer_mirage"] = ["MirageTransformer2DModel"] + _import_structure["transformers.transformer_photon"] = ["PhotonTransformer2DModel"] _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"] _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"] _import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"] diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 08e80e4329ba..23ec72a8f657 100755 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -5609,15 +5609,15 @@ def __new__(cls, *args, **kwargs): return processor -class MirageAttnProcessor2_0: +class PhotonAttnProcessor2_0: r""" - Processor for implementing Mirage-style attention with multi-source tokens and RoPE. - Properly integrates with diffusers Attention module while handling Mirage-specific logic. + Processor for implementing Photon-style attention with multi-source tokens and RoPE. + Properly integrates with diffusers Attention module while handling Photon-specific logic. """ def __init__(self): if not hasattr(torch.nn.functional, "scaled_dot_product_attention"): - raise ImportError("MirageAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.") + raise ImportError("PhotonAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.") def __call__( self, @@ -5629,9 +5629,9 @@ def __call__( **kwargs, ) -> torch.Tensor: """ - Apply Mirage attention using standard diffusers interface. + Apply Photon attention using standard diffusers interface. 
- Expected tensor formats from MirageBlock.attn_forward(): + Expected tensor formats from PhotonBlock.attn_forward(): - hidden_states: Image queries with RoPE applied [B, H, L_img, D] - encoder_hidden_states: Packed key+value tensors [B, H, L_all, 2*D] (concatenated keys and values from text + image + spatial conditioning) @@ -5640,15 +5640,15 @@ def __call__( if encoder_hidden_states is None: raise ValueError( - "MirageAttnProcessor2_0 requires 'encoder_hidden_states' containing packed key+value tensors. " - "This should be provided by MirageBlock.attn_forward()." + "PhotonAttnProcessor2_0 requires 'encoder_hidden_states' containing packed key+value tensors. " + "This should be provided by PhotonBlock.attn_forward()." ) # Unpack the combined key+value tensor # encoder_hidden_states is [B, H, L_all, 2*D] containing [keys, values] key, value = encoder_hidden_states.chunk(2, dim=-1) # Each [B, H, L_all, D] - # Apply scaled dot-product attention with Mirage's processed tensors + # Apply scaled dot-product attention with Photon's processed tensors # hidden_states is image queries [B, H, L_img, D] attn_output = torch.nn.functional.scaled_dot_product_attention( hidden_states.contiguous(), key.contiguous(), value.contiguous(), attn_mask=attention_mask @@ -5714,7 +5714,7 @@ def __call__( PAGHunyuanAttnProcessor2_0, PAGCFGHunyuanAttnProcessor2_0, LuminaAttnProcessor2_0, - MirageAttnProcessor2_0, + PhotonAttnProcessor2_0, FusedAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0, diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index ebe0d0c9b8e1..652f6d811393 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -29,7 +29,7 @@ from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel from .transformer_ltx import LTXVideoTransformer3DModel from .transformer_lumina2 import Lumina2Transformer2DModel - from .transformer_mirage import MirageTransformer2DModel + from .transformer_photon import PhotonTransformer2DModel from .transformer_mochi import MochiTransformer3DModel from .transformer_omnigen import OmniGenTransformer2DModel from .transformer_qwenimage import QwenImageTransformer2DModel diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_photon.py similarity index 99% rename from src/diffusers/models/transformers/transformer_mirage.py rename to src/diffusers/models/transformers/transformer_photon.py index 90ba11fb2d24..9ec6e9756c20 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_photon.py @@ -23,7 +23,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers -from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 +from ..attention_processor import Attention, AttentionProcessor, PhotonAttnProcessor2_0 from ..embeddings import get_timestep_embedding from ..modeling_outputs import Transformer2DModelOutput from ..modeling_utils import ModelMixin @@ -206,7 +206,7 @@ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut]: return ModulationOut(*out[:3]), ModulationOut(*out[3:]) -class MirageBlock(nn.Module): +class PhotonBlock(nn.Module): r""" Multimodal transformer block with text–image cross-attention, modulation, and MLP. 
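A minimal sketch (editorial aside, not part of the patch) of the packed key/value convention described in the `PhotonAttnProcessor2_0` docstring above: the block concatenates keys and values along the last dimension into a single `[B, H, L_all, 2*D]` tensor, and the processor splits them back with `chunk` before scaled dot-product attention. Tensor names and sizes here are illustrative assumptions, not the actual `PhotonBlock.attn_forward()` implementation.

```py
import torch
import torch.nn.functional as F

# Illustrative shapes only: batch, heads, image tokens, text tokens, head dim.
B, H, L_img, L_txt, D = 2, 4, 16, 8, 32

query = torch.randn(B, H, L_img, D)          # image queries (RoPE already applied)
key = torch.randn(B, H, L_txt + L_img, D)    # keys over text + image tokens
value = torch.randn(B, H, L_txt + L_img, D)  # values over text + image tokens

# Pack keys and values into one [B, H, L_all, 2*D] tensor, as the processor expects ...
packed_kv = torch.cat([key, value], dim=-1)

# ... which it unpacks with .chunk(2, dim=-1) before scaled dot-product attention.
k, v = packed_kv.chunk(2, dim=-1)
out = F.scaled_dot_product_attention(query, k, v)
assert out.shape == (B, H, L_img, D)
```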
@@ -304,7 +304,7 @@ def __init__( dim_head=self.head_dim, bias=False, out_bias=False, - processor=MirageAttnProcessor2_0(), + processor=PhotonAttnProcessor2_0(), ) # mlp @@ -538,7 +538,7 @@ def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size) -class MirageTransformer2DModel(ModelMixin, ConfigMixin): +class PhotonTransformer2DModel(ModelMixin, ConfigMixin): r""" Transformer-based 2D model for text to image generation. It supports attention processor injection and LoRA scaling. @@ -581,7 +581,7 @@ class MirageTransformer2DModel(ModelMixin, ConfigMixin): txt_in (`nn.Linear`): Projection layer for text conditioning. blocks (`nn.ModuleList`): - Stack of transformer blocks (`MirageBlock`). + Stack of transformer blocks (`PhotonBlock`). final_layer (`LastLayer`): Projection layer mapping hidden tokens back to patch outputs. @@ -656,7 +656,7 @@ def __init__( self.blocks = nn.ModuleList( [ - MirageBlock( + PhotonBlock( self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, @@ -781,7 +781,7 @@ def forward( return_dict: bool = True, ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: r""" - Forward pass of the MirageTransformer2DModel. + Forward pass of the PhotonTransformer2DModel. The latent image is split into patch tokens, combined with text conditioning, and processed through a stack of transformer blocks modulated by the timestep. diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 7b7ebb633c3b..ae0d90c48c63 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -144,7 +144,7 @@ "FluxKontextPipeline", "FluxKontextInpaintPipeline", ] - _import_structure["mirage"] = ["MiragePipeline"] + _import_structure["photon"] = ["PhotonPipeline"] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ "AudioLDM2Pipeline", diff --git a/src/diffusers/pipelines/mirage/__init__.py b/src/diffusers/pipelines/mirage/__init__.py deleted file mode 100644 index cba951057370..000000000000 --- a/src/diffusers/pipelines/mirage/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .pipeline_mirage import MiragePipeline -from .pipeline_output import MiragePipelineOutput - - -__all__ = ["MiragePipeline", "MiragePipelineOutput"] diff --git a/src/diffusers/pipelines/photon/__init__.py b/src/diffusers/pipelines/photon/__init__.py new file mode 100644 index 000000000000..d1dd5b2cbf53 --- /dev/null +++ b/src/diffusers/pipelines/photon/__init__.py @@ -0,0 +1,5 @@ +from .pipeline_photon import PhotonPipeline +from .pipeline_output import PhotonPipelineOutput + + +__all__ = ["PhotonPipeline", "PhotonPipelineOutput"] diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/photon/pipeline_output.py similarity index 93% rename from src/diffusers/pipelines/mirage/pipeline_output.py rename to src/diffusers/pipelines/photon/pipeline_output.py index e41c8e3bea00..ca0674d94b6c 100644 --- a/src/diffusers/pipelines/mirage/pipeline_output.py +++ b/src/diffusers/pipelines/photon/pipeline_output.py @@ -22,9 +22,9 @@ @dataclass -class MiragePipelineOutput(BaseOutput): +class PhotonPipelineOutput(BaseOutput): """ - Output class for Mirage pipelines. + Output class for Photon pipelines. 
Args: images (`List[PIL.Image.Image]` or `np.ndarray`) diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/photon/pipeline_photon.py similarity index 95% rename from src/diffusers/pipelines/mirage/pipeline_mirage.py rename to src/diffusers/pipelines/photon/pipeline_photon.py index ced78adec786..ce3479fedcdd 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/photon/pipeline_photon.py @@ -31,7 +31,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderDC, AutoencoderKL -from ...models.transformers.transformer_mirage import MirageTransformer2DModel, seq2img +from ...models.transformers.transformer_photon import PhotonTransformer2DModel, seq2img from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -39,7 +39,7 @@ ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline -from .pipeline_output import MiragePipelineOutput +from .pipeline_output import PhotonPipelineOutput DEFAULT_HEIGHT = 512 @@ -49,7 +49,7 @@ class TextPreprocessor: - """Text preprocessing utility for MiragePipeline.""" + """Text preprocessing utility for PhotonPipeline.""" def __init__(self): """Initialize text preprocessor.""" @@ -179,15 +179,15 @@ def clean_text(self, text: str) -> str: Examples: ```py >>> import torch - >>> from diffusers import MiragePipeline + >>> from diffusers import PhotonPipeline >>> from diffusers.models import AutoencoderKL, AutoencoderDC >>> from transformers import T5GemmaModel, GemmaTokenizerFast >>> # Load pipeline directly with from_pretrained - >>> pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") + >>> pipe = PhotonPipeline.from_pretrained("path/to/photon_checkpoint") >>> # Or initialize pipeline components manually - >>> transformer = MirageTransformer2DModel.from_pretrained("path/to/transformer") + >>> transformer = PhotonTransformer2DModel.from_pretrained("path/to/transformer") >>> scheduler = FlowMatchEulerDiscreteScheduler() >>> # Load T5Gemma encoder >>> t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") @@ -195,7 +195,7 @@ def clean_text(self, text: str) -> str: >>> tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") >>> vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") - >>> pipe = MiragePipeline( + >>> pipe = PhotonPipeline( ... transformer=transformer, ... scheduler=scheduler, ... text_encoder=text_encoder, @@ -205,26 +205,26 @@ def clean_text(self, text: str) -> str: >>> pipe.to("cuda") >>> prompt = "A digital painting of a rusty, vintage tram on a sandy beach" >>> image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] - >>> image.save("mirage_output.png") + >>> image.save("photon_output.png") ``` """ -class MiragePipeline( +class PhotonPipeline( DiffusionPipeline, LoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin, ): r""" - Pipeline for text-to-image generation using Mirage Transformer. + Pipeline for text-to-image generation using Photon Transformer. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
Args: - transformer ([`MirageTransformer2DModel`]): - The Mirage transformer model to denoise the encoded image latents. + transformer ([`PhotonTransformer2DModel`]): + The Photon transformer model to denoise the encoded image latents. scheduler ([`FlowMatchEulerDiscreteScheduler`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. text_encoder ([`T5EncoderModel`]): @@ -248,7 +248,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P """ Override from_pretrained to load VAE and text encoder from HuggingFace models. - The MiragePipeline checkpoints only store transformer and scheduler locally. + The PhotonPipeline checkpoints only store transformer and scheduler locally. VAE and text encoder are loaded from external HuggingFace models as specified in model_index.json. """ @@ -285,7 +285,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Load transformer and scheduler from local checkpoint logger.info(f"Loading transformer from {pretrained_model_name_or_path}...") - transformer = MirageTransformer2DModel.from_pretrained( + transformer = PhotonTransformer2DModel.from_pretrained( pretrained_model_name_or_path, subfolder="transformer" ) @@ -310,7 +310,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P def __init__( self, - transformer: MirageTransformer2DModel, + transformer: PhotonTransformer2DModel, scheduler: FlowMatchEulerDiscreteScheduler, text_encoder: Union[T5EncoderModel, Any], tokenizer: Union[T5TokenizerFast, GemmaTokenizerFast, AutoTokenizer], @@ -318,9 +318,9 @@ def __init__( ): super().__init__() - if MirageTransformer2DModel is None: + if PhotonTransformer2DModel is None: raise ImportError( - "MirageTransformer2DModel is not available. Please ensure the transformer_mirage module is properly installed." + "PhotonTransformer2DModel is not available. Please ensure the transformer_photon module is properly installed." ) self.text_encoder = text_encoder @@ -544,7 +544,7 @@ def __call__( The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.mirage.MiragePipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.photon.PhotonPipelineOutput`] instead of a plain tuple. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self, step, timestep, callback_kwargs)`. @@ -557,7 +557,7 @@ def __call__( Examples: Returns: - [`~pipelines.mirage.MiragePipelineOutput`] or `tuple`: [`~pipelines.mirage.MiragePipelineOutput`] if + [`~pipelines.photon.PhotonPipelineOutput`] or `tuple`: [`~pipelines.photon.PhotonPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. 
""" @@ -683,4 +683,4 @@ def __call__( if not return_dict: return (image,) - return MiragePipelineOutput(images=image) + return PhotonPipelineOutput(images=image) diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_photon.py similarity index 95% rename from tests/models/transformers/test_models_transformer_mirage.py rename to tests/models/transformers/test_models_transformer_photon.py index fe7436debc4c..2f08484d230c 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_photon.py @@ -17,7 +17,7 @@ import torch -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from ...testing_utils import enable_full_determinism, torch_device from ..test_modeling_common import ModelTesterMixin @@ -26,8 +26,8 @@ enable_full_determinism() -class MirageTransformerTests(ModelTesterMixin, unittest.TestCase): - model_class = MirageTransformer2DModel +class PhotonTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = PhotonTransformer2DModel main_input_name = "image_latent" @property @@ -92,7 +92,7 @@ def test_forward_signature(self): def test_model_initialization(self): # Test model initialization - model = MirageTransformer2DModel( + model = PhotonTransformer2DModel( in_channels=16, patch_size=2, context_in_dim=1792, @@ -121,7 +121,7 @@ def test_model_with_dict_config(self): "theta": 10_000, } - model = MirageTransformer2DModel.from_config(config_dict) + model = PhotonTransformer2DModel.from_config(config_dict) self.assertEqual(model.config.in_channels, 16) self.assertEqual(model.config.hidden_size, 1792) @@ -193,7 +193,7 @@ def test_attention_mask(self): def test_invalid_config(self): # Test invalid configuration - hidden_size not divisible by num_heads with self.assertRaises(ValueError): - MirageTransformer2DModel( + PhotonTransformer2DModel( in_channels=16, patch_size=2, context_in_dim=1792, @@ -207,7 +207,7 @@ def test_invalid_config(self): # Test invalid axes_dim that doesn't sum to pe_dim with self.assertRaises(ValueError): - MirageTransformer2DModel( + PhotonTransformer2DModel( in_channels=16, patch_size=2, context_in_dim=1792, From 1066de8c699db994ecd6beadd7d5293ffc3ead49 Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Tue, 7 Oct 2025 18:27:15 +0300 Subject: [PATCH 45/69] [Qwen LoRA training] fix bug when offloading (#12440) * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled * fix bug when offload and cache_latents both enabled --- examples/dreambooth/train_dreambooth_lora_qwen_image.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dreambooth/train_dreambooth_lora_qwen_image.py b/examples/dreambooth/train_dreambooth_lora_qwen_image.py index 75eae92dfbd0..56de160d6f29 100644 --- a/examples/dreambooth/train_dreambooth_lora_qwen_image.py +++ b/examples/dreambooth/train_dreambooth_lora_qwen_image.py @@ -1338,7 +1338,7 @@ def compute_text_embeddings(prompt, 
text_encoding_pipeline): batch["pixel_values"] = batch["pixel_values"].to( accelerator.device, non_blocking=True, dtype=vae.dtype ) - latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist) + latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist) if train_dataset.custom_instance_prompts: with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload): prompt_embeds, prompt_embeds_mask = compute_text_embeddings( From 2dc31677e12fe175950f28fd5a0c0703594e7ce4 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 8 Oct 2025 09:22:34 +0530 Subject: [PATCH 46/69] Align Flux modular more and more with Qwen modular (#12445) * start * fix * up --- .../modular_pipelines/flux/before_denoise.py | 329 ++++++------------ .../modular_pipelines/flux/denoise.py | 12 +- .../modular_pipelines/flux/encoders.py | 232 ++++++------ .../modular_pipelines/flux/inputs.py | 236 +++++++++++++ .../modular_pipelines/flux/modular_blocks.py | 185 ++++++---- 5 files changed, 573 insertions(+), 421 deletions(-) create mode 100644 src/diffusers/modular_pipelines/flux/inputs.py diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py index 95858fbf6eb0..5f3193af0e35 100644 --- a/src/diffusers/modular_pipelines/flux/before_denoise.py +++ b/src/diffusers/modular_pipelines/flux/before_denoise.py @@ -13,12 +13,12 @@ # limitations under the License. import inspect -from typing import Any, List, Optional, Tuple, Union +from typing import List, Optional, Union import numpy as np import torch -from ...models import AutoencoderKL +from ...pipelines import FluxPipeline from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import logging from ...utils.torch_utils import randn_tensor @@ -104,48 +104,6 @@ def calculate_shift( return mu -# Adapted from the original implementation. -def prepare_latents_img2img( - vae, scheduler, image, timestep, batch_size, num_channels_latents, height, width, dtype, device, generator -): - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) - latent_channels = vae.config.latent_channels - - # VAE applies 8x compression on images but we must also account for packing which requires - # latent height and width to be divisible by 2. - height = 2 * (int(height) // (vae_scale_factor * 2)) - width = 2 * (int(width) // (vae_scale_factor * 2)) - shape = (batch_size, num_channels_latents, height, width) - latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) - - image = image.to(device=device, dtype=dtype) - if image.shape[1] != latent_channels: - image_latents = _encode_vae_image(image=image, generator=generator) - else: - image_latents = image - if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // image_latents.shape[0] - image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) - elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
- ) - else: - image_latents = torch.cat([image_latents], dim=0) - - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - latents = scheduler.scale_noise(image_latents, timestep, noise) - latents = _pack_latents(latents, batch_size, num_channels_latents, height, width) - return latents, latent_image_ids - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents def retrieve_latents( encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" @@ -160,6 +118,7 @@ def retrieve_latents( raise AttributeError("Could not access latents of provided encoder_output") +# TODO: align this with Qwen patchifier def _pack_latents(latents, batch_size, num_channels_latents, height, width): latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) latents = latents.permute(0, 2, 4, 1, 3, 5) @@ -168,35 +127,6 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width): return latents -def _prepare_latent_image_ids(batch_size, height, width, device, dtype): - latent_image_ids = torch.zeros(height, width, 3) - latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None] - latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :] - - latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape - - latent_image_ids = latent_image_ids.reshape( - latent_image_id_height * latent_image_id_width, latent_image_id_channels - ) - - return latent_image_ids.to(device=device, dtype=dtype) - - -# Cannot use "# Copied from" because it introduces weird indentation errors. -def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator): - if isinstance(generator, list): - image_latents = [ - retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0]) - ] - image_latents = torch.cat(image_latents, dim=0) - else: - image_latents = retrieve_latents(vae.encode(image), generator=generator) - - image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor - - return image_latents - - def _get_initial_timesteps_and_optionals( transformer, scheduler, @@ -231,96 +161,6 @@ def _get_initial_timesteps_and_optionals( return timesteps, num_inference_steps, sigmas, guidance -class FluxInputStep(ModularPipelineBlocks): - model_name = "flux" - - @property - def description(self) -> str: - return ( - "Input processing step that:\n" - " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" - " 2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_images_per_prompt`\n\n" - "All input tensors are expected to have either batch_size=1 or match the batch_size\n" - "of prompt_embeds. The tensors will be duplicated across the batch dimension to\n" - "have a final batch_size of batch_size * num_images_per_prompt." - ) - - @property - def inputs(self) -> List[InputParam]: - return [ - InputParam("num_images_per_prompt", default=1), - InputParam( - "prompt_embeds", - required=True, - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="Pre-generated text embeddings. Can be generated from text_encoder step.", - ), - InputParam( - "pooled_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.", - ), - # TODO: support negative embeddings? 
- ] - - @property - def intermediate_outputs(self) -> List[str]: - return [ - OutputParam( - "batch_size", - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", - ), - OutputParam( - "dtype", - type_hint=torch.dtype, - description="Data type of model tensor inputs (determined by `prompt_embeds`)", - ), - OutputParam( - "prompt_embeds", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="text embeddings used to guide the image generation", - ), - OutputParam( - "pooled_prompt_embeds", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="pooled text embeddings used to guide the image generation", - ), - # TODO: support negative embeddings? - ] - - def check_inputs(self, components, block_state): - if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None: - if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]: - raise ValueError( - "`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but" - f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`" - f" {block_state.pooled_prompt_embeds.shape}." - ) - - @torch.no_grad() - def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: - # TODO: consider adding negative embeddings? - block_state = self.get_block_state(state) - self.check_inputs(components, block_state) - - block_state.batch_size = block_state.prompt_embeds.shape[0] - block_state.dtype = block_state.prompt_embeds.dtype - - _, seq_len, _ = block_state.prompt_embeds.shape - block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1) - block_state.prompt_embeds = block_state.prompt_embeds.view( - block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 - ) - self.set_block_state(state, block_state) - - return components, state - - class FluxSetTimestepsStep(ModularPipelineBlocks): model_name = "flux" @@ -389,6 +229,10 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip block_state.sigmas = sigmas block_state.guidance = guidance + # We set the index here to remove DtoH sync, helpful especially during compilation. 
+ # Check out more details here: https://github.com/huggingface/diffusers/pull/11696 + components.scheduler.set_begin_index(0) + self.set_block_state(state, block_state) return components, state @@ -432,11 +276,6 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=int, description="The number of denoising steps to perform at inference time", ), - OutputParam( - "latent_timestep", - type_hint=torch.Tensor, - description="The timestep that represents the initial noise level for image-to-image generation", - ), OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used."), ] @@ -484,8 +323,6 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip block_state.sigmas = sigmas block_state.guidance = guidance - block_state.latent_timestep = timesteps[:1].repeat(batch_size) - self.set_block_state(state, block_state) return components, state @@ -524,11 +361,6 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process" ), - OutputParam( - "latent_image_ids", - type_hint=torch.Tensor, - description="IDs computed from the image sequence needed for RoPE", - ), ] @staticmethod @@ -552,20 +384,13 @@ def prepare_latents( generator, latents=None, ): - # Couldn't use the `prepare_latents` method directly from Flux because I decided to copy over - # the packing methods here. So, for example, `comp._pack_latents()` won't work if we were - # to go with the "# Copied from ..." approach. Or maybe there's a way? - - # VAE applies 8x compression on images but we must also account for packing which requires - # latent height and width to be divisible by 2. height = 2 * (int(height) // (comp.vae_scale_factor * 2)) width = 2 * (int(width) // (comp.vae_scale_factor * 2)) shape = (batch_size, num_channels_latents, height, width) if latents is not None: - latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) - return latents.to(device=device, dtype=dtype), latent_image_ids + return latents.to(device=device, dtype=dtype) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -573,12 +398,11 @@ def prepare_latents( f" size of {batch_size}. Make sure the batch size matches the length of the generators." ) + # TODO: move packing latents code to a patchifier latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) latents = _pack_latents(latents, batch_size, num_channels_latents, height, width) - latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) - - return latents, latent_image_ids + return latents @torch.no_grad() def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: @@ -587,12 +411,11 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip block_state.height = block_state.height or components.default_height block_state.width = block_state.width or components.default_width block_state.device = components._execution_device - block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this? 
block_state.num_channels_latents = components.num_channels_latents self.check_inputs(components, block_state) batch_size = block_state.batch_size * block_state.num_images_per_prompt - block_state.latents, block_state.latent_image_ids = self.prepare_latents( + block_state.latents = self.prepare_latents( components, batch_size, block_state.num_channels_latents, @@ -613,81 +436,123 @@ class FluxImg2ImgPrepareLatentsStep(ModularPipelineBlocks): model_name = "flux" @property - def expected_components(self) -> List[ComponentSpec]: - return [ComponentSpec("vae", AutoencoderKL), ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] + def description(self) -> str: + return "Step that adds noise to image latents for image-to-image. Should be run after `set_timesteps`," + " `prepare_latents`. Both noise and image latents should already be patchified." @property - def description(self) -> str: - return "Step that prepares the latents for the image-to-image generation process" + def expected_components(self) -> List[ComponentSpec]: + return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] @property - def inputs(self) -> List[Tuple[str, Any]]: + def inputs(self) -> List[InputParam]: return [ - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), - InputParam("latents", type_hint=Optional[torch.Tensor]), - InputParam("num_images_per_prompt", type_hint=int, default=1), - InputParam("generator"), InputParam( - "image_latents", + name="latents", required=True, type_hint=torch.Tensor, - description="The latents representing the reference image for image-to-image/inpainting generation. Can be generated in vae_encode step.", + description="The initial random noised, can be generated in prepare latent step.", ), InputParam( - "latent_timestep", + name="image_latents", required=True, type_hint=torch.Tensor, - description="The timestep that represents the initial noise level for image-to-image/inpainting generation. Can be generated in set_timesteps step.", + description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.", ), InputParam( - "batch_size", + name="timesteps", required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.", + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", ), - InputParam("dtype", required=True, type_hint=torch.dtype, description="The dtype of the model inputs"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process" - ), - OutputParam( - "latent_image_ids", + name="initial_noise", type_hint=torch.Tensor, - description="IDs computed from the image sequence needed for RoPE", + description="The initial random noised used for inpainting denoising.", ), ] + @staticmethod + def check_inputs(image_latents, latents): + if image_latents.shape[0] != latents.shape[0]: + raise ValueError( + f"`image_latents` must have have same batch size as `latents`, but got {image_latents.shape[0]} and {latents.shape[0]}" + ) + + if image_latents.ndim != 3: + raise ValueError(f"`image_latents` must have 3 dimensions (patchified), but got {image_latents.ndim}") + @torch.no_grad() def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) - block_state.device = components._execution_device - block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this? - block_state.num_channels_latents = components.num_channels_latents - block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype - block_state.device = components._execution_device + self.check_inputs(image_latents=block_state.image_latents, latents=block_state.latents) - # TODO: implement `check_inputs` - batch_size = block_state.batch_size * block_state.num_images_per_prompt - if block_state.latents is None: - block_state.latents, block_state.latent_image_ids = prepare_latents_img2img( - components.vae, - components.scheduler, - block_state.image_latents, - block_state.latent_timestep, - batch_size, - block_state.num_channels_latents, - block_state.height, - block_state.width, - block_state.dtype, - block_state.device, - block_state.generator, - ) + # prepare latent timestep + latent_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0]) + + # make copy of initial_noise + block_state.initial_noise = block_state.latents + + # scale noise + block_state.latents = components.scheduler.scale_noise( + block_state.image_latents, latent_timestep, block_state.latents + ) + + self.set_block_state(state, block_state) + + return components, state + + +class FluxRoPEInputsStep(ModularPipelineBlocks): + model_name = "flux" + + @property + def description(self) -> str: + return "Step that prepares the RoPE inputs for the denoising process. Should be placed after text encoder and latent preparation steps." 
+ + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam(name="prompt_embeds"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="txt_ids", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the prompt embeds, used for RoPE calculation.", + ), + OutputParam( + name="img_ids", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the image latents, used for RoPE calculation.", + ), + ] + + def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + prompt_embeds = block_state.prompt_embeds + device, dtype = prompt_embeds.device, prompt_embeds.dtype + block_state.txt_ids = torch.zeros(prompt_embeds.shape[1], 3).to( + device=prompt_embeds.device, dtype=prompt_embeds.dtype + ) + + height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2)) + width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2)) + block_state.img_ids = FluxPipeline._prepare_latent_image_ids(None, height // 2, width // 2, device, dtype) self.set_block_state(state, block_state) diff --git a/src/diffusers/modular_pipelines/flux/denoise.py b/src/diffusers/modular_pipelines/flux/denoise.py index ffa0a4456f5d..e482c198e835 100644 --- a/src/diffusers/modular_pipelines/flux/denoise.py +++ b/src/diffusers/modular_pipelines/flux/denoise.py @@ -76,18 +76,17 @@ def inputs(self) -> List[Tuple[str, Any]]: description="Pooled prompt embeddings", ), InputParam( - "text_ids", + "txt_ids", required=True, type_hint=torch.Tensor, description="IDs computed from text sequence needed for RoPE", ), InputParam( - "latent_image_ids", + "img_ids", required=True, type_hint=torch.Tensor, description="IDs computed from image sequence needed for RoPE", ), - # TODO: guidance ] @torch.no_grad() @@ -101,8 +100,8 @@ def __call__( encoder_hidden_states=block_state.prompt_embeds, pooled_projections=block_state.pooled_prompt_embeds, joint_attention_kwargs=block_state.joint_attention_kwargs, - txt_ids=block_state.text_ids, - img_ids=block_state.latent_image_ids, + txt_ids=block_state.txt_ids, + img_ids=block_state.img_ids, return_dict=False, )[0] block_state.noise_pred = noise_pred @@ -195,9 +194,6 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip block_state.num_warmup_steps = max( len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0 ) - # We set the index here to remove DtoH sync, helpful especially during compilation. 
- # Check out more details here: https://github.com/huggingface/diffusers/pull/11696 - components.scheduler.set_begin_index(0) with self.progress_bar(total=block_state.num_inference_steps) as progress_bar: for i, t in enumerate(block_state.timesteps): components, block_state = self.loop_step(components, block_state, i=i, t=t) diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py index 16ddecbadb4f..6368086cbb5f 100644 --- a/src/diffusers/modular_pipelines/flux/encoders.py +++ b/src/diffusers/modular_pipelines/flux/encoders.py @@ -25,7 +25,7 @@ from ...models import AutoencoderKL from ...utils import USE_PEFT_BACKEND, is_ftfy_available, logging, scale_lora_layers, unscale_lora_layers from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import FluxModularPipeline @@ -67,89 +67,148 @@ def retrieve_latents( raise AttributeError("Could not access latents of provided encoder_output") -class FluxVaeEncoderStep(ModularPipelineBlocks): - model_name = "flux" +def encode_vae_image(vae: AutoencoderKL, image: torch.Tensor, generator: torch.Generator, sample_mode="sample"): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode) + + image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor + + return image_latents + + +class FluxProcessImagesInputStep(ModularPipelineBlocks): + model_name = "Flux" @property def description(self) -> str: - return "Vae Encoder step that encode the input image into a latent representation" + return "Image Preprocess step. 
Resizing is needed in Flux Kontext (will be implemented later.)" @property def expected_components(self) -> List[ComponentSpec]: return [ - ComponentSpec("vae", AutoencoderKL), ComponentSpec( "image_processor", VaeImageProcessor, - config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}), + config=FrozenDict({"vae_scale_factor": 16}), default_creation_method="from_config", ), ] @property def inputs(self) -> List[InputParam]: - return [ - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("generator"), - InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"), - InputParam( - "preprocess_kwargs", - type_hint=Optional[dict], - description="A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]", - ), - ] + return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - "image_latents", - type_hint=torch.Tensor, - description="The latents representing the reference image for image-to-image/inpainting generation", - ) + OutputParam(name="processed_image"), ] @staticmethod - # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image with self.vae->vae - def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator): - if isinstance(generator, list): - image_latents = [ - retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0]) - ] - image_latents = torch.cat(image_latents, dim=0) + def check_inputs(height, width, vae_scale_factor): + if height is not None and height % (vae_scale_factor * 2) != 0: + raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}") + + if width is not None and width % (vae_scale_factor * 2) != 0: + raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}") + + @torch.no_grad() + def __call__(self, components: FluxModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + if block_state.resized_image is None and block_state.image is None: + raise ValueError("`resized_image` and `image` cannot be None at the same time") + + if block_state.resized_image is None: + image = block_state.image + self.check_inputs( + height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + ) + height = block_state.height or components.default_height + width = block_state.width or components.default_width else: - image_latents = retrieve_latents(vae.encode(image), generator=generator) + width, height = block_state.resized_image[0].size + image = block_state.resized_image + + block_state.processed_image = components.image_processor.preprocess(image=image, height=height, width=width) + + self.set_block_state(state, block_state) + return components, state - image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor - return image_latents +class FluxVaeEncoderDynamicStep(ModularPipelineBlocks): + model_name = "flux" + + def __init__( + self, + input_name: str = "processed_image", + output_name: str = "image_latents", + ): + """Initialize a VAE encoder step for converting images to latent representations. 
+ + Both the input and output names are configurable so this block can be configured to process to different image + inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents"). + + Args: + input_name (str, optional): Name of the input image tensor. Defaults to "processed_image". + Examples: "processed_image" or "processed_control_image" + output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents". + Examples: "image_latents" or "control_image_latents" + + Examples: + # Basic usage with default settings (includes image processor): # FluxImageVaeEncoderDynamicStep() + + # Custom input/output names for control image: # FluxImageVaeEncoderDynamicStep( + input_name="processed_control_image", output_name="control_image_latents" + ) + """ + self._image_input_name = input_name + self._image_latents_output_name = output_name + super().__init__() + + @property + def description(self) -> str: + return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n" + + @property + def expected_components(self) -> List[ComponentSpec]: + components = [ComponentSpec("vae", AutoencoderKL)] + return components + + @property + def inputs(self) -> List[InputParam]: + inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")] + return inputs + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + self._image_latents_output_name, + type_hint=torch.Tensor, + description="The latents representing the reference image", + ) + ] @torch.no_grad() def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) - block_state.preprocess_kwargs = block_state.preprocess_kwargs or {} - block_state.device = components._execution_device - block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype - block_state.image = components.image_processor.preprocess( - block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs - ) - block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype) + device = components._execution_device + dtype = components.vae.dtype - block_state.batch_size = block_state.image.shape[0] + image = getattr(block_state, self._image_input_name) + image = image.to(device=device, dtype=dtype) - # if generator is a list, make sure the length of it matches the length of images (both should be batch_size) - if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch" - f" size of {block_state.batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - block_state.image_latents = self._encode_vae_image( - components.vae, image=block_state.image, generator=block_state.generator - ) + # Encode image into latents + image_latents = encode_vae_image(image=image, vae=components.vae, generator=block_state.generator) + setattr(block_state, self._image_latents_output_name, image_latents) self.set_block_state(state, block_state) @@ -161,7 +220,7 @@ class FluxTextEncoderStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Text Encoder step that generate text_embeddings to guide the video generation" + return "Text Encoder step that generate text_embeddings to guide the image generation" @property def expected_components(self) -> List[ComponentSpec]: @@ -172,10 +231,6 @@ def expected_components(self) -> List[ComponentSpec]: ComponentSpec("tokenizer_2", T5TokenizerFast), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [] - @property def inputs(self) -> List[InputParam]: return [ @@ -200,12 +255,6 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=torch.Tensor, description="pooled text embeddings used to guide the image generation", ), - OutputParam( - "text_ids", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="ids from the text sequence for RoPE", - ), ] @staticmethod @@ -216,16 +265,10 @@ def check_inputs(block_state): @staticmethod def _get_t5_prompt_embeds( - components, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - max_sequence_length: int, - device: torch.device, + components, prompt: Union[str, List[str]], max_sequence_length: int, device: torch.device ): dtype = components.text_encoder_2.dtype - prompt = [prompt] if isinstance(prompt, str) else prompt - batch_size = len(prompt) if isinstance(components, TextualInversionLoaderMixin): prompt = components.maybe_convert_prompt(prompt, components.tokenizer_2) @@ -251,23 +294,11 @@ def _get_t5_prompt_embeds( prompt_embeds = components.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0] prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) - _, seq_len, _ = prompt_embeds.shape - - # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - return prompt_embeds @staticmethod - def _get_clip_prompt_embeds( - components, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - device: torch.device, - ): + def _get_clip_prompt_embeds(components, prompt: Union[str, List[str]], device: torch.device): prompt = [prompt] if isinstance(prompt, str) else prompt - batch_size = len(prompt) if isinstance(components, TextualInversionLoaderMixin): prompt = components.maybe_convert_prompt(prompt, components.tokenizer) @@ -297,10 +328,6 @@ def _get_clip_prompt_embeds( prompt_embeds = prompt_embeds.pooler_output prompt_embeds = prompt_embeds.to(dtype=components.text_encoder.dtype, device=device) - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt) - prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1) - return prompt_embeds @staticmethod @@ -309,34 +336,11 @@ def encode_prompt( prompt: Union[str, List[str]], prompt_2: Union[str, List[str]], device: Optional[torch.device] = None, - num_images_per_prompt: int = 1, prompt_embeds: 
Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, max_sequence_length: int = 512, lora_scale: Optional[float] = None, ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in all text-encoders - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - """ device = device or components._execution_device # set lora scale so that monkey patched LoRA @@ -361,12 +365,10 @@ def encode_prompt( components, prompt=prompt, device=device, - num_images_per_prompt=num_images_per_prompt, ) prompt_embeds = FluxTextEncoderStep._get_t5_prompt_embeds( components, prompt=prompt_2, - num_images_per_prompt=num_images_per_prompt, max_sequence_length=max_sequence_length, device=device, ) @@ -381,10 +383,7 @@ def encode_prompt( # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(components.text_encoder_2, lora_scale) - dtype = components.text_encoder.dtype if components.text_encoder is not None else torch.bfloat16 - text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) - - return prompt_embeds, pooled_prompt_embeds, text_ids + return prompt_embeds, pooled_prompt_embeds @torch.no_grad() def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: @@ -400,14 +399,13 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip if block_state.joint_attention_kwargs is not None else None ) - (block_state.prompt_embeds, block_state.pooled_prompt_embeds, block_state.text_ids) = self.encode_prompt( + block_state.prompt_embeds, block_state.pooled_prompt_embeds = self.encode_prompt( components, prompt=block_state.prompt, prompt_2=None, prompt_embeds=None, pooled_prompt_embeds=None, device=block_state.device, - num_images_per_prompt=1, # TODO: hardcoded for now. max_sequence_length=block_state.max_sequence_length, lora_scale=block_state.text_encoder_lora_scale, ) diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py new file mode 100644 index 000000000000..f9192655d1ac --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/inputs.py @@ -0,0 +1,236 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import torch + +from ...pipelines import FluxPipeline +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import InputParam, OutputParam + +# TODO: consider making these common utilities for modular if they are not pipeline-specific. +from ..qwenimage.inputs import calculate_dimension_from_latents, repeat_tensor_to_batch_size +from .modular_pipeline import FluxModularPipeline + + +class FluxTextInputStep(ModularPipelineBlocks): + model_name = "flux" + + @property + def description(self) -> str: + return ( + "Text input processing step that standardizes text embeddings for the pipeline.\n" + "This step:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("num_images_per_prompt", default=1), + InputParam( + "prompt_embeds", + required=True, + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Pre-generated text embeddings. Can be generated from text_encoder step.", + ), + InputParam( + "pooled_prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.", + ), + # TODO: support negative embeddings? + ] + + @property + def intermediate_outputs(self) -> List[str]: + return [ + OutputParam( + "batch_size", + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", + ), + OutputParam( + "dtype", + type_hint=torch.dtype, + description="Data type of model tensor inputs (determined by `prompt_embeds`)", + ), + OutputParam( + "prompt_embeds", + type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", + description="text embeddings used to guide the image generation", + ), + OutputParam( + "pooled_prompt_embeds", + type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", + description="pooled text embeddings used to guide the image generation", + ), + # TODO: support negative embeddings? + ] + + def check_inputs(self, components, block_state): + if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None: + if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]: + raise ValueError( + "`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but" + f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`" + f" {block_state.pooled_prompt_embeds.shape}." + ) + + @torch.no_grad() + def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: + # TODO: consider adding negative embeddings? 
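+ # The lines below read the block state, validate that `prompt_embeds` and
+ # `pooled_prompt_embeds` agree on batch size, record batch_size/dtype from
+ # `prompt_embeds`, and repeat `prompt_embeds` so its leading dimension becomes
+ # batch_size * num_images_per_prompt.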
+ block_state = self.get_block_state(state) + self.check_inputs(components, block_state) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + + _, seq_len, _ = block_state.prompt_embeds.shape + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 + ) + self.set_block_state(state, block_state) + + return components, state + + +# Adapted from `QwenImageInputsDynamicStep` +class FluxInputsDynamicStep(ModularPipelineBlocks): + model_name = "flux" + + def __init__( + self, + image_latent_inputs: List[str] = ["image_latents"], + additional_batch_inputs: List[str] = [], + ): + if not isinstance(image_latent_inputs, list): + image_latent_inputs = [image_latent_inputs] + if not isinstance(additional_batch_inputs, list): + additional_batch_inputs = [additional_batch_inputs] + + self._image_latent_inputs = image_latent_inputs + self._additional_batch_inputs = additional_batch_inputs + super().__init__() + + @property + def description(self) -> str: + # Functionality section + summary_section = ( + "Input processing step that:\n" + " 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n" + " 2. For additional batch inputs: Expands batch dimensions to match final batch size" + ) + + # Inputs info + inputs_info = "" + if self._image_latent_inputs or self._additional_batch_inputs: + inputs_info = "\n\nConfigured inputs:" + if self._image_latent_inputs: + inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + if self._additional_batch_inputs: + inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + + # Placement guidance + placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." + + return summary_section + inputs_info + placement_section + + @property + def inputs(self) -> List[InputParam]: + inputs = [ + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="batch_size", required=True), + InputParam(name="height"), + InputParam(name="width"), + ] + + # Add image latent inputs + for image_latent_input_name in self._image_latent_inputs: + inputs.append(InputParam(name=image_latent_input_name)) + + # Add additional batch inputs + for input_name in self._additional_batch_inputs: + inputs.append(InputParam(name=input_name)) + + return inputs + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="image_height", type_hint=int, description="The height of the image latents"), + OutputParam(name="image_width", type_hint=int, description="The width of the image latents"), + ] + + def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # Process image latent inputs (height/width calculation, patchify, and batch expansion) + for image_latent_input_name in self._image_latent_inputs: + image_latent_tensor = getattr(block_state, image_latent_input_name) + if image_latent_tensor is None: + continue + + # 1. 
Calculate height/width from latents + height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor) + block_state.height = block_state.height or height + block_state.width = block_state.width or width + + if not hasattr(block_state, "image_height"): + block_state.image_height = height + if not hasattr(block_state, "image_width"): + block_state.image_width = width + + # 2. Patchify the image latent tensor + # TODO: Implement patchifier for Flux. + latent_height, latent_width = image_latent_tensor.shape[2:] + image_latent_tensor = FluxPipeline._pack_latents( + image_latent_tensor, block_state.batch_size, image_latent_tensor.shape[1], latent_height, latent_width + ) + + # 3. Expand batch size + image_latent_tensor = repeat_tensor_to_batch_size( + input_name=image_latent_input_name, + input_tensor=image_latent_tensor, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, image_latent_input_name, image_latent_tensor) + + # Process additional batch inputs (only batch expansion) + for input_name in self._additional_batch_inputs: + input_tensor = getattr(block_state, input_name) + if input_tensor is None: + continue + + # Only expand batch size + input_tensor = repeat_tensor_to_batch_size( + input_name=input_name, + input_tensor=input_tensor, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, input_name, input_tensor) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py index ca4f993a11fe..b40dfe176207 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks.py @@ -18,21 +18,41 @@ from .before_denoise import ( FluxImg2ImgPrepareLatentsStep, FluxImg2ImgSetTimestepsStep, - FluxInputStep, FluxPrepareLatentsStep, + FluxRoPEInputsStep, FluxSetTimestepsStep, ) from .decoders import FluxDecodeStep from .denoise import FluxDenoiseStep -from .encoders import FluxTextEncoderStep, FluxVaeEncoderStep +from .encoders import FluxProcessImagesInputStep, FluxTextEncoderStep, FluxVaeEncoderDynamicStep +from .inputs import FluxInputsDynamicStep, FluxTextInputStep logger = logging.get_logger(__name__) # pylint: disable=invalid-name # vae encoder (run before before_denoise) +FluxImg2ImgVaeEncoderBlocks = InsertableDict( + [ + ("preprocess", FluxProcessImagesInputStep()), + ("encode", FluxVaeEncoderDynamicStep()), + ] +) + + +class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + model_name = "flux" + + block_classes = FluxImg2ImgVaeEncoderBlocks.values() + block_names = FluxImg2ImgVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + + class FluxAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxVaeEncoderStep] + block_classes = [FluxImg2ImgVaeEncoderStep] block_names = ["img2img"] block_trigger_inputs = ["image"] @@ -41,45 +61,48 @@ def description(self): return ( "Vae encoder step that encode the image inputs into their latent representations.\n" + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is provided, step will be skipped." 
+ + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." + + " - if `image` is not provided, step will be skipped." ) -# before_denoise: text2img, img2img -class FluxBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = [ - FluxInputStep, - FluxPrepareLatentsStep, - FluxSetTimestepsStep, +# before_denoise: text2img +FluxBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", FluxPrepareLatentsStep()), + ("set_timesteps", FluxSetTimestepsStep()), + ("prepare_rope_inputs", FluxRoPEInputsStep()), ] - block_names = ["input", "prepare_latents", "set_timesteps"] +) + + +class FluxBeforeDenoiseStep(SequentialPipelineBlocks): + block_classes = FluxBeforeDenoiseBlocks.values() + block_names = FluxBeforeDenoiseBlocks.keys() @property def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is a sequential pipeline blocks:\n" - + " - `FluxInputStep` is used to adjust the batch size of the model inputs\n" - + " - `FluxPrepareLatentsStep` is used to prepare the latents\n" - + " - `FluxSetTimestepsStep` is used to set the timesteps\n" - ) + return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." # before_denoise: img2img +FluxImg2ImgBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", FluxPrepareLatentsStep()), + ("set_timesteps", FluxImg2ImgSetTimestepsStep()), + ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), + ("prepare_rope_inputs", FluxRoPEInputsStep()), + ] +) + + class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = [FluxInputStep, FluxImg2ImgSetTimestepsStep, FluxImg2ImgPrepareLatentsStep] - block_names = ["input", "set_timesteps", "prepare_latents"] + block_classes = FluxImg2ImgBeforeDenoiseBlocks.values() + block_names = FluxImg2ImgBeforeDenoiseBlocks.keys() @property def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step for img2img task.\n" - + "This is a sequential pipeline blocks:\n" - + " - `FluxInputStep` is used to adjust the batch size of the model inputs\n" - + " - `FluxImg2ImgSetTimestepsStep` is used to set the timesteps\n" - + " - `FluxImg2ImgPrepareLatentsStep` is used to prepare the latents\n" - ) + return "Before denoise step that prepare the inputs for the denoise step for img2img task." # before_denoise: all task (text2img, img2img) @@ -113,7 +136,7 @@ def description(self) -> str: ) -# decode: all task (text2img, img2img, inpainting) +# decode: all task (text2img, img2img) class FluxAutoDecodeStep(AutoPipelineBlocks): block_classes = [FluxDecodeStep] block_names = ["non-inpaint"] @@ -124,32 +147,73 @@ def description(self): return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`" +# inputs: text2image/img2img +FluxImg2ImgBlocks = InsertableDict( + [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())] +) + + +class FluxImg2ImgInputStep(SequentialPipelineBlocks): + model_name = "flux" + block_classes = FluxImg2ImgBlocks.values() + block_names = FluxImg2ImgBlocks.keys() + + @property + def description(self): + return "Input step that prepares the inputs for the img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." 
+
+
+class FluxImageAutoInputStep(AutoPipelineBlocks):
+ block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
+ block_names = ["img2img", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Input step that standardizes the inputs for the denoising step, e.g. makes sure inputs have a consistent batch size and are patchified. \n"
+ " This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
+ + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
+ )
+
+
 class FluxCoreDenoiseStep(SequentialPipelineBlocks):
- block_classes = [FluxInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
+ model_name = "flux"
+ block_classes = [FluxImageAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
 block_names = ["input", "before_denoise", "denoise"]
 @property
 def description(self):
 return (
 "Core step that performs the denoising process. \n"
- + " - `FluxInputStep` (input) standardizes the inputs for the denoising step.\n"
+ + " - `FluxImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
 + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
 + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
- + "This step support text-to-image and image-to-image tasks for Flux:\n"
+ + "This step supports text-to-image and image-to-image tasks for Flux:\n"
 + " - for image-to-image generation, you need to provide `image_latents`\n"
- + " - for text-to-image generation, all you need to provide is prompt embeddings"
+ + " - for text-to-image generation, all you need to provide is prompt embeddings."
) -# text2image -class FluxAutoBlocks(SequentialPipelineBlocks): - block_classes = [ - FluxTextEncoderStep, - FluxAutoVaeEncoderStep, - FluxCoreDenoiseStep, - FluxAutoDecodeStep, +# Auto blocks (text2image and img2img) +AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep()), + ("image_encoder", FluxAutoVaeEncoderStep()), + ("denoise", FluxCoreDenoiseStep()), + ("decode", FluxDecodeStep()), ] - block_names = ["text_encoder", "image_encoder", "denoise", "decode"] +) + + +class FluxAutoBlocks(SequentialPipelineBlocks): + model_name = "flux" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() @property def description(self): @@ -162,35 +226,28 @@ def description(self): TEXT2IMAGE_BLOCKS = InsertableDict( [ - ("text_encoder", FluxTextEncoderStep), - ("input", FluxInputStep), - ("prepare_latents", FluxPrepareLatentsStep), - ("set_timesteps", FluxSetTimestepsStep), - ("denoise", FluxDenoiseStep), - ("decode", FluxDecodeStep), + ("text_encoder", FluxTextEncoderStep()), + ("input", FluxTextInputStep()), + ("prepare_latents", FluxPrepareLatentsStep()), + ("set_timesteps", FluxSetTimestepsStep()), + ("prepare_rope_inputs", FluxRoPEInputsStep()), + ("denoise", FluxDenoiseStep()), + ("decode", FluxDecodeStep()), ] ) IMAGE2IMAGE_BLOCKS = InsertableDict( [ - ("text_encoder", FluxTextEncoderStep), - ("image_encoder", FluxVaeEncoderStep), - ("input", FluxInputStep), - ("set_timesteps", FluxImg2ImgSetTimestepsStep), - ("prepare_latents", FluxImg2ImgPrepareLatentsStep), - ("denoise", FluxDenoiseStep), - ("decode", FluxDecodeStep), + ("text_encoder", FluxTextEncoderStep()), + ("vae_encoder", FluxVaeEncoderDynamicStep()), + ("input", FluxImg2ImgInputStep()), + ("prepare_latents", FluxPrepareLatentsStep()), + ("set_timesteps", FluxImg2ImgSetTimestepsStep()), + ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), + ("prepare_rope_inputs", FluxRoPEInputsStep()), + ("denoise", FluxDenoiseStep()), + ("decode", FluxDecodeStep()), ] ) -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep), - ("image_encoder", FluxAutoVaeEncoderStep), - ("denoise", FluxCoreDenoiseStep), - ("decode", FluxAutoDecodeStep), - ] -) - - ALL_BLOCKS = {"text2image": TEXT2IMAGE_BLOCKS, "img2img": IMAGE2IMAGE_BLOCKS, "auto": AUTO_BLOCKS} From 35e538d46a32e6ef588678f478437d594c32f949 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 8 Oct 2025 09:46:18 +0530 Subject: [PATCH 47/69] fix dockerfile definitions. (#12424) * fix dockerfile definitions. * python 3.10 slim. 
* up * up * up * up * up * revert pr_tests.yml changes * up * up * reduce python version for torch 2.1.0 --- .github/workflows/build_docker_images.yml | 1 - .github/workflows/pr_tests.yml | 1 - docker/diffusers-doc-builder/Dockerfile | 80 ++++++++----------- docker/diffusers-pytorch-cpu/Dockerfile | 69 +++++++--------- docker/diffusers-pytorch-cuda/Dockerfile | 43 +++++----- .../diffusers-pytorch-minimum-cuda/Dockerfile | 42 +++++----- .../Dockerfile | 72 ++++++++--------- 7 files changed, 136 insertions(+), 172 deletions(-) diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 583853c6d649..1d7be0d6bce0 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -72,7 +72,6 @@ jobs: image-name: - diffusers-pytorch-cpu - diffusers-pytorch-cuda - - diffusers-pytorch-cuda - diffusers-pytorch-xformers-cuda - diffusers-pytorch-minimum-cuda - diffusers-doc-builder diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 1543b264b0cc..005e89f4056a 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -286,4 +286,3 @@ jobs: with: name: pr_main_test_reports path: reports - diff --git a/docker/diffusers-doc-builder/Dockerfile b/docker/diffusers-doc-builder/Dockerfile index 3a76b3331c17..313eb8b8f85f 100644 --- a/docker/diffusers-doc-builder/Dockerfile +++ b/docker/diffusers-doc-builder/Dockerfile @@ -1,56 +1,42 @@ -FROM ubuntu:20.04 +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 LABEL maintainer="Hugging Face" LABEL repository="diffusers" ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get -y update \ - && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa - -RUN apt install -y bash \ - build-essential \ - git \ - git-lfs \ - curl \ - ca-certificates \ - libsndfile1-dev \ - python3.10 \ - python3-pip \ - libgl1 \ - zip \ - wget \ - python3.10-venv && \ - rm -rf /var/lib/apt/lists - -# make sure to use venv -RUN python3.10 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" +RUN apt-get -y update && apt-get install -y bash \ + build-essential \ + git \ + git-lfs \ + curl \ + ca-certificates \ + libsndfile1-dev \ + libgl1 + +ENV UV_PYTHON=/usr/local/bin/python # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.10 -m uv pip install --no-cache-dir \ - torch \ - torchvision \ - torchaudio \ - invisible_watermark \ - --extra-index-url https://download.pytorch.org/whl/cpu && \ - python3.10 -m uv pip install --no-cache-dir \ - accelerate \ - datasets \ - hf-doc-builder \ - huggingface-hub \ - Jinja2 \ - librosa \ - numpy==1.26.4 \ - scipy \ - tensorboard \ - transformers \ - matplotlib \ - setuptools==69.5.1 \ - bitsandbytes \ - torchao \ - gguf \ - optimum-quanto +RUN pip install uv +RUN uv pip install --no-cache-dir \ + torch \ + torchvision \ + torchaudio \ + --extra-index-url https://download.pytorch.org/whl/cpu + +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]" + +# Extra dependencies +RUN uv pip install --no-cache-dir \ + accelerate \ + numpy==1.26.4 \ + hf_transfer \ + setuptools==69.5.1 \ + bitsandbytes \ + torchao \ + gguf \ + optimum-quanto + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-cpu/Dockerfile 
b/docker/diffusers-pytorch-cpu/Dockerfile index 8d98c52598d2..3fc16d57e8a7 100644 --- a/docker/diffusers-pytorch-cpu/Dockerfile +++ b/docker/diffusers-pytorch-cpu/Dockerfile @@ -1,50 +1,37 @@ -FROM ubuntu:20.04 +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 LABEL maintainer="Hugging Face" LABEL repository="diffusers" ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get -y update \ - && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa - -RUN apt install -y bash \ - build-essential \ - git \ - git-lfs \ - curl \ - ca-certificates \ - libsndfile1-dev \ - python3.10 \ - python3.10-dev \ - python3-pip \ - libgl1 \ - python3.10-venv && \ - rm -rf /var/lib/apt/lists - -# make sure to use venv -RUN python3.10 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" +RUN apt-get -y update && apt-get install -y bash \ + build-essential \ + git \ + git-lfs \ + curl \ + ca-certificates \ + libsndfile1-dev \ + libgl1 + +ENV UV_PYTHON=/usr/local/bin/python # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.10 -m uv pip install --no-cache-dir \ - torch \ - torchvision \ - torchaudio \ - invisible_watermark \ - --extra-index-url https://download.pytorch.org/whl/cpu && \ - python3.10 -m uv pip install --no-cache-dir \ - accelerate \ - datasets \ - hf-doc-builder \ - huggingface-hub \ - Jinja2 \ - librosa \ - numpy==1.26.4 \ - scipy \ - tensorboard \ - transformers matplotlib \ - hf_transfer +RUN pip install uv +RUN uv pip install --no-cache-dir \ + torch \ + torchvision \ + torchaudio \ + --extra-index-url https://download.pytorch.org/whl/cpu + +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]" + +# Extra dependencies +RUN uv pip install --no-cache-dir \ + accelerate \ + numpy==1.26.4 \ + hf_transfer + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index 695f5ed08dc5..1cfcda643a5a 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -2,11 +2,13 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 LABEL maintainer="Hugging Face" LABEL repository="diffusers" +ARG PYTHON_VERSION=3.12 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -y update \ && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa + && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update RUN apt install -y bash \ build-essential \ @@ -16,36 +18,31 @@ RUN apt install -y bash \ ca-certificates \ libsndfile1-dev \ libgl1 \ - python3.10 \ - python3.10-dev \ + python3 \ python3-pip \ - python3.10-venv && \ - rm -rf /var/lib/apt/lists + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* -# make sure to use venv -RUN python3.10 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.10 -m uv pip install --no-cache-dir \ +RUN uv pip install --no-cache-dir \ 
torch \ torchvision \ - torchaudio \ - invisible_watermark && \ - python3.10 -m pip install --no-cache-dir \ + torchaudio + +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]" + +# Extra dependencies +RUN uv pip install --no-cache-dir \ accelerate \ - datasets \ - hf-doc-builder \ - huggingface-hub \ - hf_transfer \ - Jinja2 \ - librosa \ numpy==1.26.4 \ - scipy \ - tensorboard \ - transformers \ - pytorch-lightning \ + pytorch-lightning \ hf_transfer CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-minimum-cuda/Dockerfile b/docker/diffusers-pytorch-minimum-cuda/Dockerfile index 57ca7657acf1..8ca6a157da6b 100644 --- a/docker/diffusers-pytorch-minimum-cuda/Dockerfile +++ b/docker/diffusers-pytorch-minimum-cuda/Dockerfile @@ -2,6 +2,7 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 LABEL maintainer="Hugging Face" LABEL repository="diffusers" +ARG PYTHON_VERSION=3.10 ENV DEBIAN_FRONTEND=noninteractive ENV MINIMUM_SUPPORTED_TORCH_VERSION="2.1.0" ENV MINIMUM_SUPPORTED_TORCHVISION_VERSION="0.16.0" @@ -9,7 +10,8 @@ ENV MINIMUM_SUPPORTED_TORCHAUDIO_VERSION="2.1.0" RUN apt-get -y update \ && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa + && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update RUN apt install -y bash \ build-essential \ @@ -19,35 +21,31 @@ RUN apt install -y bash \ ca-certificates \ libsndfile1-dev \ libgl1 \ - python3.10 \ - python3.10-dev \ + python3 \ python3-pip \ - python3.10-venv && \ - rm -rf /var/lib/apt/lists + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* -# make sure to use venv -RUN python3.10 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.10 -m uv pip install --no-cache-dir \ +RUN uv pip install --no-cache-dir \ torch==$MINIMUM_SUPPORTED_TORCH_VERSION \ torchvision==$MINIMUM_SUPPORTED_TORCHVISION_VERSION \ - torchaudio==$MINIMUM_SUPPORTED_TORCHAUDIO_VERSION \ - invisible_watermark && \ - python3.10 -m pip install --no-cache-dir \ + torchaudio==$MINIMUM_SUPPORTED_TORCHAUDIO_VERSION + +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]" + +# Extra dependencies +RUN uv pip install --no-cache-dir \ accelerate \ - datasets \ - hf-doc-builder \ - huggingface-hub \ - hf_transfer \ - Jinja2 \ - librosa \ numpy==1.26.4 \ - scipy \ - tensorboard \ - transformers \ + pytorch-lightning \ hf_transfer CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/docker/diffusers-pytorch-xformers-cuda/Dockerfile index 1693eb293024..fe1f475504fa 100644 --- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile +++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile @@ -2,50 +2,48 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 LABEL maintainer="Hugging Face" LABEL repository="diffusers" +ARG PYTHON_VERSION=3.12 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -y update \ && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa + && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update RUN apt install -y bash \ - 
build-essential \ - git \ - git-lfs \ - curl \ - ca-certificates \ - libsndfile1-dev \ - libgl1 \ - python3.10 \ - python3.10-dev \ - python3-pip \ - python3.10-venv && \ - rm -rf /var/lib/apt/lists - -# make sure to use venv -RUN python3.10 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" + build-essential \ + git \ + git-lfs \ + curl \ + ca-certificates \ + libsndfile1-dev \ + libgl1 \ + python3 \ + python3-pip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.10 -m pip install --no-cache-dir \ - torch \ - torchvision \ - torchaudio \ - invisible_watermark && \ - python3.10 -m uv pip install --no-cache-dir \ - accelerate \ - datasets \ - hf-doc-builder \ - huggingface-hub \ - hf_transfer \ - Jinja2 \ - librosa \ - numpy==1.26.4 \ - scipy \ - tensorboard \ - transformers \ - xformers \ - hf_transfer +RUN uv pip install --no-cache-dir \ + torch \ + torchvision \ + torchaudio + +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]" + +# Extra dependencies +RUN uv pip install --no-cache-dir \ + accelerate \ + numpy==1.26.4 \ + pytorch-lightning \ + hf_transfer \ + xformers CMD ["/bin/bash"] From 345864eb852b528fd1f4b6ad087fa06e0470006b Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 8 Oct 2025 10:45:39 +0530 Subject: [PATCH 48/69] fix more torch.distributed imports (#12425) * up * unguard. 
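Note on the hunk below: `torch.distributed._functional_collectives` is only importable when the installed torch build ships distributed support, which is what `torch.distributed.is_available()` reports, so the module-level import is now guarded. A minimal sketch of the same pattern (the `funcol = None` fallback is illustrative only; the actual hunk simply skips the import):

    import torch

    if torch.distributed.is_available():
        import torch.distributed._functional_collectives as funcol
    else:
        funcol = None  # callers must check availability before using the collectives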
--- src/diffusers/hooks/context_parallel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/hooks/context_parallel.py b/src/diffusers/hooks/context_parallel.py index 83406d4969b7..915fe453b90b 100644 --- a/src/diffusers/hooks/context_parallel.py +++ b/src/diffusers/hooks/context_parallel.py @@ -17,7 +17,10 @@ from typing import Dict, List, Type, Union import torch -import torch.distributed._functional_collectives as funcol + + +if torch.distributed.is_available(): + import torch.distributed._functional_collectives as funcol from ..models._modeling_parallel import ( ContextParallelConfig, From 9e099a7b45a2c39c3c9fea1eff1546e03702dab0 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 10:20:19 +0200 Subject: [PATCH 49/69] mirage pipeline first commit --- src/diffusers/__init__.py | 1 + src/diffusers/models/__init__.py | 1 + src/diffusers/models/transformers/__init__.py | 1 + .../models/transformers/transformer_mirage.py | 489 ++++++++++++++ src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/mirage/__init__.py | 4 + .../pipelines/mirage/pipeline_mirage.py | 629 ++++++++++++++++++ .../pipelines/mirage/pipeline_output.py | 35 + .../test_models_transformer_mirage.py | 252 +++++++ 9 files changed, 1413 insertions(+) create mode 100644 src/diffusers/models/transformers/transformer_mirage.py create mode 100644 src/diffusers/pipelines/mirage/__init__.py create mode 100644 src/diffusers/pipelines/mirage/pipeline_mirage.py create mode 100644 src/diffusers/pipelines/mirage/pipeline_output.py create mode 100644 tests/models/transformers/test_models_transformer_mirage.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 686e8d99dabf..6c419b6e7ad1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -224,6 +224,7 @@ "LTXVideoTransformer3DModel", "Lumina2Transformer2DModel", "LuminaNextDiT2DModel", + "MirageTransformer2DModel", "MochiTransformer3DModel", "ModelMixin", "MotionAdapter", diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 457f70448af3..279e69216b1b 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -93,6 +93,7 @@ _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"] _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"] _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"] + _import_structure["transformers.transformer_mirage"] = ["MirageTransformer2DModel"] _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"] _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"] _import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"] diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index b60f0636e6dc..ebe0d0c9b8e1 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -29,6 +29,7 @@ from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel from .transformer_ltx import LTXVideoTransformer3DModel from .transformer_lumina2 import Lumina2Transformer2DModel + from .transformer_mirage import MirageTransformer2DModel from .transformer_mochi import MochiTransformer3DModel from .transformer_omnigen import OmniGenTransformer2DModel from .transformer_qwenimage import 
QwenImageTransformer2DModel diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py new file mode 100644 index 000000000000..39c569cbb26b --- /dev/null +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -0,0 +1,489 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, Optional, Union, Tuple +import torch +import math +from torch import Tensor, nn +from torch.nn.functional import fold, unfold +from einops import rearrange +from einops.layers.torch import Rearrange + +from ...configuration_utils import ConfigMixin, register_to_config +from ..modeling_utils import ModelMixin +from ..modeling_outputs import Transformer2DModelOutput +from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers + + +logger = logging.get_logger(__name__) + + +# Mirage Layer Components +def get_image_ids(bs: int, h: int, w: int, patch_size: int, device: torch.device) -> Tensor: + img_ids = torch.zeros(h // patch_size, w // patch_size, 2, device=device) + img_ids[..., 0] = torch.arange(h // patch_size, device=device)[:, None] + img_ids[..., 1] = torch.arange(w // patch_size, device=device)[None, :] + return img_ids.reshape((h // patch_size) * (w // patch_size), 2).unsqueeze(0).repeat(bs, 1, 1) + + +def apply_rope(xq: Tensor, freqs_cis: Tensor) -> Tensor: + xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) + xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] + return xq_out.reshape(*xq.shape).type_as(xq) + + +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + self.rope_rearrange = Rearrange("b n d (i j) -> b n d i j", i=2, j=2) + + def rope(self, pos: Tensor, dim: int, theta: int) -> Tensor: + assert dim % 2 == 0 + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + out = pos.unsqueeze(-1) * omega.unsqueeze(0) + out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1) + out = self.rope_rearrange(out) + return out.float() + + def forward(self, ids: Tensor) -> Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [self.rope(ids[:, :, i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-3, + ) + return emb.unsqueeze(1) + + +def timestep_embedding(t: Tensor, dim: int, max_period: int = 10000, time_factor: float = 1000.0) -> Tensor: + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +class 
MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) + return (x * rrms * self.scale).to(dtype=x_dtype) + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class Modulation(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin = nn.Linear(dim, 6 * dim, bias=True) + nn.init.constant_(self.lin.weight, 0) + nn.init.constant_(self.lin.bias, 0) + + def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut]: + out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(6, dim=-1) + return ModulationOut(*out[:3]), ModulationOut(*out[3:]) + + +class MirageBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: float | None = None, + ): + super().__init__() + + self._fsdp_wrap = True + self._activation_checkpointing = True + + self.hidden_dim = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.hidden_size = hidden_size + + # img qkv + self.img_pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_qkv_proj = nn.Linear(hidden_size, hidden_size * 3, bias=False) + self.attn_out = nn.Linear(hidden_size, hidden_size, bias=False) + self.qk_norm = QKNorm(self.head_dim) + + # txt kv + self.txt_kv_proj = nn.Linear(hidden_size, hidden_size * 2, bias=False) + self.k_norm = RMSNorm(self.head_dim) + + + # mlp + self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.gate_proj = nn.Linear(hidden_size, self.mlp_hidden_dim, bias=False) + self.up_proj = nn.Linear(hidden_size, self.mlp_hidden_dim, bias=False) + self.down_proj = nn.Linear(self.mlp_hidden_dim, hidden_size, bias=False) + self.mlp_act = nn.GELU(approximate="tanh") + + self.modulation = Modulation(hidden_size) + self.spatial_cond_kv_proj: None | nn.Linear = None + + def attn_forward( + self, + img: Tensor, + txt: Tensor, + pe: Tensor, + modulation: ModulationOut, + spatial_conditioning: None | Tensor = None, + attention_mask: None | Tensor = None, + ) -> Tensor: + # image tokens proj and norm + img_mod = (1 + modulation.scale) * self.img_pre_norm(img) + modulation.shift + + img_qkv = self.img_qkv_proj(img_mod) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.qk_norm(img_q, img_k, img_v) + + # txt tokens proj and norm + txt_kv = self.txt_kv_proj(txt) + txt_k, txt_v = rearrange(txt_kv, "B L (K H D) -> K B H L D", K=2, H=self.num_heads) + txt_k = self.k_norm(txt_k) + + # compute attention 
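+ # apply_rope rotates the image queries/keys with the 2D frequencies produced by EmbedND;
+ # text keys/values (and, when configured, spatial-conditioning keys/values) are then
+ # concatenated in front of the image keys/values, so each image token attends to the
+ # joint text+image sequence in a single scaled_dot_product_attention call below.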
+ img_q, img_k = apply_rope(img_q, pe), apply_rope(img_k, pe) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + # optional spatial conditioning tokens + cond_len = 0 + if self.spatial_cond_kv_proj is not None: + assert spatial_conditioning is not None + cond_kv = self.spatial_cond_kv_proj(spatial_conditioning) + cond_k, cond_v = rearrange(cond_kv, "B L (K H D) -> K B H L D", K=2, H=self.num_heads) + cond_k = apply_rope(cond_k, pe) + cond_len = cond_k.shape[2] + k = torch.cat((cond_k, k), dim=2) + v = torch.cat((cond_v, v), dim=2) + + # build additive attention bias + attn_bias: Tensor | None = None + attn_mask: Tensor | None = None + + # build multiplicative 0/1 mask for provided attention_mask over [cond?, text, image] keys + if attention_mask is not None: + bs, _, l_img, _ = img_q.shape + l_txt = txt_k.shape[2] + l_all = k.shape[2] + + assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" + assert ( + attention_mask.shape[-1] == l_txt + ), f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + + device = img_q.device + + ones_img = torch.ones((bs, l_img), dtype=torch.bool, device=device) + cond_mask = torch.ones((bs, cond_len), dtype=torch.bool, device=device) + + mask_parts = [ + cond_mask, + attention_mask.to(torch.bool), + ones_img, + ] + joint_mask = torch.cat(mask_parts, dim=-1) # (B, L_all) + + # repeat across heads and query positions + attn_mask = joint_mask[:, None, None, :].expand(-1, self.num_heads, l_img, -1) # (B,H,L_img,L_all) + + attn = torch.nn.functional.scaled_dot_product_attention( + img_q.contiguous(), k.contiguous(), v.contiguous(), attn_mask=attn_mask + ) + attn = rearrange(attn, "B H L D -> B L (H D)") + attn = self.attn_out(attn) + + return attn + + def ffn_forward(self, x: Tensor, modulation: ModulationOut) -> Tensor: + x = (1 + modulation.scale) * self.post_attention_layernorm(x) + modulation.shift + return self.down_proj(self.mlp_act(self.gate_proj(x)) * self.up_proj(x)) + + def forward( + self, + img: Tensor, + txt: Tensor, + vec: Tensor, + pe: Tensor, + spatial_conditioning: Tensor | None = None, + attention_mask: Tensor | None = None, + **_: dict[str, Any], + ) -> Tensor: + mod_attn, mod_mlp = self.modulation(vec) + + img = img + mod_attn.gate * self.attn_forward( + img, + txt, + pe, + mod_attn, + spatial_conditioning=spatial_conditioning, + attention_mask=attention_mask, + ) + img = img + mod_mlp.gate * self.ffn_forward(img, mod_mlp) + return img + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + nn.init.constant_(self.adaLN_modulation[1].weight, 0) + nn.init.constant_(self.adaLN_modulation[1].bias, 0) + nn.init.constant_(self.linear.weight, 0) + nn.init.constant_(self.linear.bias, 0) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +@dataclass +class MirageParams: + in_channels: int + patch_size: int + context_in_dim: int + hidden_size: int + mlp_ratio: float + num_heads: int + depth: int + axes_dim: list[int] + theta: int + 
time_factor: float = 1000.0 + time_max_period: int = 10_000 + conditioning_block_ids: list[int] | None = None + + +def img2seq(img: Tensor, patch_size: int) -> Tensor: + """Flatten an image into a sequence of patches""" + return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2) + + +def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: + """Revert img2seq""" + if isinstance(shape, tuple): + shape = shape[-2:] + elif isinstance(shape, torch.Tensor): + shape = (int(shape[0]), int(shape[1])) + else: + raise NotImplementedError(f"shape type {type(shape)} not supported") + return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size) + + +class MirageTransformer2DModel(ModelMixin, ConfigMixin): + """Mirage Transformer model with IP-Adapter support.""" + + config_name = "config.json" + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 16, + patch_size: int = 2, + context_in_dim: int = 2304, + hidden_size: int = 1792, + mlp_ratio: float = 3.5, + num_heads: int = 28, + depth: int = 16, + axes_dim: list = None, + theta: int = 10000, + time_factor: float = 1000.0, + time_max_period: int = 10000, + conditioning_block_ids: list = None, + **kwargs + ): + super().__init__() + + if axes_dim is None: + axes_dim = [32, 32] + + # Create MirageParams from the provided arguments + params = MirageParams( + in_channels=in_channels, + patch_size=patch_size, + context_in_dim=context_in_dim, + hidden_size=hidden_size, + mlp_ratio=mlp_ratio, + num_heads=num_heads, + depth=depth, + axes_dim=axes_dim, + theta=theta, + time_factor=time_factor, + time_max_period=time_max_period, + conditioning_block_ids=conditioning_block_ids, + ) + + self.params = params + self.in_channels = params.in_channels + self.patch_size = params.patch_size + self.out_channels = self.in_channels * self.patch_size**2 + + self.time_factor = params.time_factor + self.time_max_period = params.time_max_period + + if params.hidden_size % params.num_heads != 0: + raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + + pe_dim = params.hidden_size // params.num_heads + + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = nn.Linear(self.in_channels * self.patch_size**2, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + + conditioning_block_ids: list[int] = params.conditioning_block_ids or list(range(params.depth)) + + self.blocks = nn.ModuleList( + [ + MirageBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + ) + for i in range(params.depth) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: + """Timestep independent stuff""" + txt = self.txt_in(txt) + img = img2seq(image_latent, self.patch_size) + bs, _, h, w = image_latent.shape + img_ids = get_image_ids(bs, h, w, patch_size=self.patch_size, device=image_latent.device) + pe = self.pe_embedder(img_ids) + return img, txt, pe + + def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: + return 
self.time_in( + timestep_embedding( + t=timestep, dim=256, max_period=self.time_max_period, time_factor=self.time_factor + ).to(dtype) + ) + + def forward_transformers( + self, + image_latent: Tensor, + cross_attn_conditioning: Tensor, + timestep: Optional[Tensor] = None, + time_embedding: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + **block_kwargs: Any, + ) -> Tensor: + img = self.img_in(image_latent) + + if time_embedding is not None: + vec = time_embedding + else: + if timestep is None: + raise ValueError("Please provide either a timestep or a timestep_embedding") + vec = self.compute_timestep_embedding(timestep, dtype=img.dtype) + + for block in self.blocks: + img = block( + img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs + ) + + img = self.final_layer(img, vec) + return img + + def forward( + self, + image_latent: Tensor, + timestep: Tensor, + cross_attn_conditioning: Tensor, + micro_conditioning: Tensor, + cross_attn_mask: None | Tensor = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: + if attention_kwargs is not None: + attention_kwargs = attention_kwargs.copy() + lora_scale = attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None: + logger.warning( + "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective." + ) + img_seq, txt, pe = self.process_inputs(image_latent, cross_attn_conditioning) + img_seq = self.forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask) + output = seq2img(img_seq, self.patch_size, image_latent.shape) + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 190c7871d270..7b7ebb633c3b 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -144,6 +144,7 @@ "FluxKontextPipeline", "FluxKontextInpaintPipeline", ] + _import_structure["mirage"] = ["MiragePipeline"] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ "AudioLDM2Pipeline", diff --git a/src/diffusers/pipelines/mirage/__init__.py b/src/diffusers/pipelines/mirage/__init__.py new file mode 100644 index 000000000000..4fd8ad191b3f --- /dev/null +++ b/src/diffusers/pipelines/mirage/__init__.py @@ -0,0 +1,4 @@ +from .pipeline_mirage import MiragePipeline +from .pipeline_output import MiragePipelineOutput + +__all__ = ["MiragePipeline", "MiragePipelineOutput"] \ No newline at end of file diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py new file mode 100644 index 000000000000..126eab07977c --- /dev/null +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -0,0 +1,629 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os +from typing import Any, Callable, Dict, List, Optional, Union + +import html +import re +import urllib.parse as ul + +import ftfy +import torch +from transformers import ( + AutoTokenizer, + GemmaTokenizerFast, + T5EncoderModel, + T5TokenizerFast, +) + +from ...image_processor import VaeImageProcessor +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, AutoencoderDC +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import MiragePipelineOutput + +try: + from ...models.transformers.transformer_mirage import MirageTransformer2DModel +except ImportError: + MirageTransformer2DModel = None + +logger = logging.get_logger(__name__) + + +class TextPreprocessor: + """Text preprocessing utility for MiragePipeline.""" + + def __init__(self): + """Initialize text preprocessor.""" + self.bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + r"\\" + r"\/" + r"\*" + r"]{1,}" + ) + + def clean_text(self, text: str) -> str: + """Clean text using comprehensive text processing logic.""" + # See Deepfloyd https://github.com/deep-floyd/IF/blob/develop/deepfloyd_if/modules/t5.py + text = str(text) + text = ul.unquote_plus(text) + text = text.strip().lower() + text = re.sub("", "person", text) + + # Remove all urls: + text = re.sub( + r"\b((?:https?|www):(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@))", + "", + text, + ) # regex for urls + + # @ + text = re.sub(r"@[\w\d]+\b", "", text) + + # 31C0—31EF CJK Strokes through 4E00—9FFF CJK Unified Ideographs + text = re.sub(r"[\u31c0-\u31ef]+", "", text) + text = re.sub(r"[\u31f0-\u31ff]+", "", text) + text = re.sub(r"[\u3200-\u32ff]+", "", text) + text = re.sub(r"[\u3300-\u33ff]+", "", text) + text = re.sub(r"[\u3400-\u4dbf]+", "", text) + text = re.sub(r"[\u4dc0-\u4dff]+", "", text) + text = re.sub(r"[\u4e00-\u9fff]+", "", text) + + # все виды тире / all types of dash --> "-" + text = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", + "-", + text, + ) + + # кавычки к одному стандарту + text = re.sub(r"[`´«»""¨]", '"', text) + text = re.sub(r"['']", "'", text) + + # " and & + text = re.sub(r""?", "", text) + text = re.sub(r"&", "", text) + + # ip addresses: + text = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", text) + + # article ids: + text = re.sub(r"\d:\d\d\s+$", "", text) + + # \n + text = re.sub(r"\\n", " ", text) + + # "#123", "#12345..", "123456.." 
+ text = re.sub(r"#\d{1,3}\b", "", text) + text = re.sub(r"#\d{5,}\b", "", text) + text = re.sub(r"\b\d{6,}\b", "", text) + + # filenames: + text = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", text) + + # Clean punctuation + text = re.sub(r"[\"\']{2,}", r'"', text) # """AUSVERKAUFT""" + text = re.sub(r"[\.]{2,}", r" ", text) + + text = re.sub(self.bad_punct_regex, r" ", text) # ***AUSVERKAUFT***, #AUSVERKAUFT + text = re.sub(r"\s+\.\s+", r" ", text) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, text)) > 3: + text = re.sub(regex2, " ", text) + + # Basic cleaning + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + text = text.strip() + + # Clean alphanumeric patterns + text = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", text) # jc6640 + text = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", text) # jc6640vc + text = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", text) # 6640vc231 + + # Common spam patterns + text = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", text) + text = re.sub(r"(free\s)?download(\sfree)?", "", text) + text = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", text) + text = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", text) + text = re.sub(r"\bpage\s+\d+\b", "", text) + + text = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", text) # j2d1a2a... + text = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", text) + + # Final cleanup + text = re.sub(r"\b\s+\:\s+", r": ", text) + text = re.sub(r"(\D[,\./])\b", r"\1 ", text) + text = re.sub(r"\s+", " ", text) + + text.strip() + + text = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", text) + text = re.sub(r"^[\'\_,\-\:;]", r"", text) + text = re.sub(r"[\'\_,\-\:\-\+]$", r"", text) + text = re.sub(r"^\.\S+$", "", text) + + return text.strip() + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import MiragePipeline + >>> from diffusers.models import AutoencoderKL, AutoencoderDC + >>> from transformers import T5GemmaModel, GemmaTokenizerFast + + >>> # Load pipeline directly with from_pretrained + >>> pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") + + >>> # Or initialize pipeline components manually + >>> transformer = MirageTransformer2DModel.from_pretrained("path/to/transformer") + >>> scheduler = FlowMatchEulerDiscreteScheduler() + >>> # Load T5Gemma encoder + >>> t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") + >>> text_encoder = t5gemma_model.encoder + >>> tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") + >>> vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") + + >>> pipe = MiragePipeline( + ... transformer=transformer, + ... scheduler=scheduler, + ... text_encoder=text_encoder, + ... tokenizer=tokenizer, + ... vae=vae + ... ) + >>> pipe.to("cuda") + >>> prompt = "A digital painting of a rusty, vintage tram on a sandy beach" + >>> image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] + >>> image.save("mirage_output.png") + ``` +""" + + +class MiragePipeline( + DiffusionPipeline, + LoraLoaderMixin, + FromSingleFileMixin, + TextualInversionLoaderMixin, +): + r""" + Pipeline for text-to-image generation using Mirage Transformer. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + Args: + transformer ([`MirageTransformer2DModel`]): + The Mirage transformer model to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + text_encoder ([`T5EncoderModel`]): + Standard text encoder model for encoding prompts. + tokenizer ([`T5TokenizerFast` or `GemmaTokenizerFast`]): + Tokenizer for the text encoder. + vae ([`AutoencoderKL`] or [`AutoencoderDC`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + Supports both AutoencoderKL (8x compression) and AutoencoderDC (32x compression). + """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents"] + _optional_components = [] + + # Component configurations for automatic loading + config_name = "model_index.json" + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + """ + Override from_pretrained to ensure T5GemmaEncoder is available for loading. + + This ensures that T5GemmaEncoder from transformers is accessible in the module namespace + during component loading, which is required for MiragePipeline checkpoints that use + T5GemmaEncoder as the text encoder. + """ + # Ensure T5GemmaEncoder is available for loading + import transformers + if not hasattr(transformers, 'T5GemmaEncoder'): + try: + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder + transformers.T5GemmaEncoder = T5GemmaEncoder + except ImportError: + # T5GemmaEncoder not available in this transformers version + pass + + # Proceed with standard loading + return super().from_pretrained(pretrained_model_name_or_path, **kwargs) + + + def __init__( + self, + transformer: MirageTransformer2DModel, + scheduler: FlowMatchEulerDiscreteScheduler, + text_encoder: Union[T5EncoderModel, Any], + tokenizer: Union[T5TokenizerFast, GemmaTokenizerFast, AutoTokenizer], + vae: Union[AutoencoderKL, AutoencoderDC], + ): + super().__init__() + + if MirageTransformer2DModel is None: + raise ImportError( + "MirageTransformer2DModel is not available. Please ensure the transformer_mirage module is properly installed." 
+ ) + + # Store standard components + self.text_encoder = text_encoder + self.tokenizer = tokenizer + + # Initialize text preprocessor + self.text_preprocessor = TextPreprocessor() + + self.register_modules( + transformer=transformer, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + vae=vae, + ) + + # Enhance VAE with universal properties for both AutoencoderKL and AutoencoderDC + self._enhance_vae_properties() + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae.spatial_compression_ratio) + + def _enhance_vae_properties(self): + """Add universal properties to VAE for consistent interface across AutoencoderKL and AutoencoderDC.""" + if not hasattr(self, "vae") or self.vae is None: + return + + # Set spatial_compression_ratio property + if hasattr(self.vae, "spatial_compression_ratio"): + # AutoencoderDC already has this property + pass + elif hasattr(self.vae, "config") and hasattr(self.vae.config, "block_out_channels"): + # AutoencoderKL: calculate from block_out_channels + self.vae.spatial_compression_ratio = 2 ** (len(self.vae.config.block_out_channels) - 1) + else: + # Fallback + self.vae.spatial_compression_ratio = 8 + + # Set scaling_factor property with safe defaults + if hasattr(self.vae, "config"): + self.vae.scaling_factor = getattr(self.vae.config, "scaling_factor", 0.18215) + else: + self.vae.scaling_factor = 0.18215 + + # Set shift_factor property with safe defaults (0.0 for AutoencoderDC) + if hasattr(self.vae, "config"): + shift_factor = getattr(self.vae.config, "shift_factor", None) + if shift_factor is None: # AutoencoderDC case + self.vae.shift_factor = 0.0 + else: + self.vae.shift_factor = shift_factor + else: + self.vae.shift_factor = 0.0 + + # Set latent_channels property (like VaeTower does) + if hasattr(self.vae, "config") and hasattr(self.vae.config, "latent_channels"): + # AutoencoderDC has latent_channels in config + self.vae.latent_channels = int(self.vae.config.latent_channels) + elif hasattr(self.vae, "config") and hasattr(self.vae.config, "in_channels"): + # AutoencoderKL has in_channels in config + self.vae.latent_channels = int(self.vae.config.in_channels) + else: + # Fallback based on VAE type - DC-AE typically has 32, AutoencoderKL has 4/16 + if hasattr(self.vae, "spatial_compression_ratio") and self.vae.spatial_compression_ratio == 32: + self.vae.latent_channels = 32 # DC-AE default + else: + self.vae.latent_channels = 4 # AutoencoderKL default + + @property + def vae_scale_factor(self): + """Compatibility property that returns spatial compression ratio.""" + return getattr(self.vae, "spatial_compression_ratio", 8) + + def prepare_latents( + self, + batch_size: int, + num_channels_latents: int, + height: int, + width: int, + dtype: torch.dtype, + device: torch.device, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.Tensor] = None, + ): + """Prepare initial latents for the diffusion process.""" + if latents is None: + latent_height, latent_width = height // self.vae.spatial_compression_ratio, width // self.vae.spatial_compression_ratio + shape = (batch_size, num_channels_latents, latent_height, latent_width) + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # FlowMatchEulerDiscreteScheduler doesn't use init_noise_sigma scaling + return latents + + def encode_prompt(self, prompt: Union[str, List[str]], device: torch.device): + """Encode text prompt using standard text encoder and tokenizer.""" + if isinstance(prompt, 
str): + prompt = [prompt] + + return self._encode_prompt_standard(prompt, device) + + def _encode_prompt_standard(self, prompt: List[str], device: torch.device): + """Encode prompt using standard text encoder and tokenizer with batch processing.""" + # Clean text using modular preprocessor + cleaned_prompts = [self.text_preprocessor.clean_text(text) for text in prompt] + cleaned_uncond_prompts = [self.text_preprocessor.clean_text("") for _ in prompt] + + # Batch conditional and unconditional prompts together for efficiency + all_prompts = cleaned_prompts + cleaned_uncond_prompts + + # Tokenize all prompts in one batch + tokens = self.tokenizer( + all_prompts, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + + input_ids = tokens["input_ids"].to(device) + attention_mask = tokens["attention_mask"].bool().to(device) + + # Encode all prompts in one batch + with torch.no_grad(): + # Disable autocast like in TextTower + with torch.autocast("cuda", enabled=False): + emb = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + ) + + # Use last hidden state (matching TextTower's use_last_hidden_state=True default) + all_embeddings = emb["last_hidden_state"] + + # Split back into conditional and unconditional + batch_size = len(prompt) + text_embeddings = all_embeddings[:batch_size] + uncond_text_embeddings = all_embeddings[batch_size:] + + cross_attn_mask = attention_mask[:batch_size] + uncond_cross_attn_mask = attention_mask[batch_size:] + + return text_embeddings, cross_attn_mask, uncond_text_embeddings, uncond_cross_attn_mask + + def check_inputs( + self, + prompt: Union[str, List[str]], + height: int, + width: int, + guidance_scale: float, + callback_on_step_end_tensor_inputs: Optional[List[str]] = None, + ): + """Check that all inputs are in correct format.""" + if height % self.vae.spatial_compression_ratio != 0 or width % self.vae.spatial_compression_ratio != 0: + raise ValueError(f"`height` and `width` have to be divisible by {self.vae.spatial_compression_ratio} but are {height} and {width}.") + + if guidance_scale < 1.0: + raise ValueError(f"guidance_scale has to be >= 1.0 but is {guidance_scale}") + + if callback_on_step_end_tensor_inputs is not None and not isinstance(callback_on_step_end_tensor_inputs, list): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be a list but is {callback_on_step_end_tensor_inputs}" + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 28, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. 
+            height (`int`, *optional*, defaults to 256):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 256):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 28):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages the model to generate images that are closely linked to the
+                text `prompt`, usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.mirage.MiragePipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self, step, timestep, callback_kwargs)`.
+                `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include tensors that are listed
+                in the `._callback_tensor_inputs` attribute.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.mirage.MiragePipelineOutput`] or `tuple`: [`~pipelines.mirage.MiragePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        # 0. Default height and width
+        height = height or 256
+        width = width or 256
+
+        # 1.
Check inputs + self.check_inputs( + prompt, + height, + width, + guidance_scale, + callback_on_step_end_tensor_inputs, + ) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError("prompt must be provided as a string or list of strings") + + device = self._execution_device + + # 2. Encode input prompt + text_embeddings, cross_attn_mask, uncond_text_embeddings, uncond_cross_attn_mask = self.encode_prompt( + prompt, device + ) + + # 3. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 4. Prepare latent variables + num_channels_latents = self.vae.latent_channels # From your transformer config + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + text_embeddings.dtype, + device, + generator, + latents, + ) + + # 5. Prepare extra step kwargs + extra_step_kwargs = {} + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_eta: + extra_step_kwargs["eta"] = 0.0 + + # 6. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Duplicate latents for CFG + latents_in = torch.cat([latents, latents], dim=0) + + # Cross-attention batch (uncond, cond) + ca_embed = torch.cat([uncond_text_embeddings, text_embeddings], dim=0) + ca_mask = None + if cross_attn_mask is not None and uncond_cross_attn_mask is not None: + ca_mask = torch.cat([uncond_cross_attn_mask, cross_attn_mask], dim=0) + + # Normalize timestep for the transformer + t_cont = (t.float() / self.scheduler.config.num_train_timesteps).view(1).repeat(2).to(device) + + # Process inputs for transformer + img_seq, txt, pe = self.transformer.process_inputs(latents_in, ca_embed) + + # Forward through transformer layers + img_seq = self.transformer.forward_transformers( + img_seq, txt, time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), + pe=pe, attention_mask=ca_mask + ) + + # Convert back to image format + from ...models.transformers.transformer_mirage import seq2img + noise_both = seq2img(img_seq, self.transformer.patch_size, latents_in.shape) + + # Apply CFG + noise_uncond, noise_text = noise_both.chunk(2, dim=0) + noise_pred = noise_uncond + guidance_scale * (noise_text - noise_uncond) + + # Compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_on_step_end(self, i, t, callback_kwargs) + + # Call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 8. 
Post-processing + if output_type == "latent": + image = latents + else: + # Unscale latents for VAE (supports both AutoencoderKL and AutoencoderDC) + latents = (latents / self.vae.scaling_factor) + self.vae.shift_factor + # Decode using VAE (AutoencoderKL or AutoencoderDC) + image = self.vae.decode(latents, return_dict=False)[0] + # Use standard image processor for post-processing + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return MiragePipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/mirage/pipeline_output.py new file mode 100644 index 000000000000..e5cdb2a40924 --- /dev/null +++ b/src/diffusers/pipelines/mirage/pipeline_output.py @@ -0,0 +1,35 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput + + +@dataclass +class MiragePipelineOutput(BaseOutput): + """ + Output class for Mirage pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] \ No newline at end of file diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py new file mode 100644 index 000000000000..11accdaecbee --- /dev/null +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -0,0 +1,252 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch + +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel, MirageParams + +from ...testing_utils import enable_full_determinism, torch_device +from ..test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class MirageTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = MirageTransformer2DModel + main_input_name = "image_latent" + + @property + def dummy_input(self): + return self.prepare_dummy_input() + + @property + def input_shape(self): + return (16, 4, 4) + + @property + def output_shape(self): + return (16, 4, 4) + + def prepare_dummy_input(self, height=32, width=32): + batch_size = 1 + num_latent_channels = 16 + sequence_length = 16 + embedding_dim = 1792 + + image_latent = torch.randn((batch_size, num_latent_channels, height, width)).to(torch_device) + cross_attn_conditioning = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device) + micro_conditioning = torch.randn((batch_size, embedding_dim)).to(torch_device) + timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size) + + return { + "image_latent": image_latent, + "timestep": timestep, + "cross_attn_conditioning": cross_attn_conditioning, + "micro_conditioning": micro_conditioning, + } + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "in_channels": 16, + "patch_size": 2, + "context_in_dim": 1792, + "hidden_size": 1792, + "mlp_ratio": 3.5, + "num_heads": 28, + "depth": 4, # Smaller depth for testing + "axes_dim": [32, 32], + "theta": 10_000, + } + inputs_dict = self.prepare_dummy_input() + return init_dict, inputs_dict + + def test_forward_signature(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + # Test forward + outputs = model(**inputs_dict) + + self.assertIsNotNone(outputs) + expected_shape = inputs_dict["image_latent"].shape + self.assertEqual(outputs.shape, expected_shape) + + def test_mirage_params_initialization(self): + # Test model initialization + model = MirageTransformer2DModel( + in_channels=16, + patch_size=2, + context_in_dim=1792, + hidden_size=1792, + mlp_ratio=3.5, + num_heads=28, + depth=4, + axes_dim=[32, 32], + theta=10_000, + ) + self.assertEqual(model.config.in_channels, 16) + self.assertEqual(model.config.hidden_size, 1792) + self.assertEqual(model.config.num_heads, 28) + + def test_model_with_dict_config(self): + # Test model initialization with from_config + config_dict = { + "in_channels": 16, + "patch_size": 2, + "context_in_dim": 1792, + "hidden_size": 1792, + "mlp_ratio": 3.5, + "num_heads": 28, + "depth": 4, + "axes_dim": [32, 32], + "theta": 10_000, + } + + model = MirageTransformer2DModel.from_config(config_dict) + self.assertEqual(model.config.in_channels, 16) + self.assertEqual(model.config.hidden_size, 1792) + + def test_process_inputs(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + img_seq, txt, pe = model.process_inputs( + inputs_dict["image_latent"], + inputs_dict["cross_attn_conditioning"] + ) + + # Check shapes + batch_size = inputs_dict["image_latent"].shape[0] + height, width = inputs_dict["image_latent"].shape[2:] + patch_size = init_dict["patch_size"] + expected_seq_len = (height // patch_size) * (width // patch_size) + + 
self.assertEqual(img_seq.shape, (batch_size, expected_seq_len, init_dict["in_channels"] * patch_size**2)) + self.assertEqual(txt.shape, (batch_size, inputs_dict["cross_attn_conditioning"].shape[1], init_dict["hidden_size"])) + # Check that pe has the correct batch size, sequence length and some embedding dimension + self.assertEqual(pe.shape[0], batch_size) # batch size + self.assertEqual(pe.shape[1], 1) # unsqueeze(1) in EmbedND + self.assertEqual(pe.shape[2], expected_seq_len) # sequence length + self.assertEqual(pe.shape[-2:], (2, 2)) # rope rearrange output + + def test_forward_transformers(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + # Process inputs first + img_seq, txt, pe = model.process_inputs( + inputs_dict["image_latent"], + inputs_dict["cross_attn_conditioning"] + ) + + # Test forward_transformers + output_seq = model.forward_transformers( + img_seq, + txt, + timestep=inputs_dict["timestep"], + pe=pe + ) + + # Check output shape + expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"]**2 + self.assertEqual(output_seq.shape, (img_seq.shape[0], img_seq.shape[1], expected_out_channels)) + + def test_attention_mask(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + # Create attention mask + batch_size = inputs_dict["cross_attn_conditioning"].shape[0] + seq_len = inputs_dict["cross_attn_conditioning"].shape[1] + attention_mask = torch.ones((batch_size, seq_len), dtype=torch.bool).to(torch_device) + attention_mask[:, seq_len//2:] = False # Mask second half + + with torch.no_grad(): + outputs = model( + **inputs_dict, + cross_attn_mask=attention_mask + ) + + self.assertIsNotNone(outputs) + expected_shape = inputs_dict["image_latent"].shape + self.assertEqual(outputs.shape, expected_shape) + + def test_invalid_config(self): + # Test invalid configuration - hidden_size not divisible by num_heads + with self.assertRaises(ValueError): + MirageTransformer2DModel( + in_channels=16, + patch_size=2, + context_in_dim=1792, + hidden_size=1793, # Not divisible by 28 + mlp_ratio=3.5, + num_heads=28, + depth=4, + axes_dim=[32, 32], + theta=10_000, + ) + + # Test invalid axes_dim that doesn't sum to pe_dim + with self.assertRaises(ValueError): + MirageTransformer2DModel( + in_channels=16, + patch_size=2, + context_in_dim=1792, + hidden_size=1792, + mlp_ratio=3.5, + num_heads=28, + depth=4, + axes_dim=[30, 30], # Sum = 60, but pe_dim = 1792/28 = 64 + theta=10_000, + ) + + def test_gradient_checkpointing_enable(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + + # Enable gradient checkpointing + model.enable_gradient_checkpointing() + + # Check that _activation_checkpointing is set + for block in model.blocks: + self.assertTrue(hasattr(block, '_activation_checkpointing')) + + def test_from_config(self): + init_dict, _ = self.prepare_init_args_and_inputs_for_common() + + # Create model from config + model = self.model_class.from_config(init_dict) + self.assertIsInstance(model, self.model_class) + self.assertEqual(model.config.in_channels, init_dict["in_channels"]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 6e10ed4938afe86d48d60dd8e97bbab38e737422 Mon Sep 17 00:00:00 2001 From: David Bertoin 
Date: Fri, 26 Sep 2025 11:51:14 +0200 Subject: [PATCH 50/69] use attention processors --- src/diffusers/models/attention_processor.py | 58 +++++++++++++ .../models/transformers/transformer_mirage.py | 86 ++++++++++++++++--- 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 66455d733aee..e4ab33be9784 100755 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -5605,6 +5605,63 @@ def __new__(cls, *args, **kwargs): return processor +class MirageAttnProcessor2_0: + r""" + Processor for implementing Mirage-style attention with multi-source tokens and RoPE. + Properly integrates with diffusers Attention module while handling Mirage-specific logic. + """ + + def __init__(self): + if not hasattr(torch.nn.functional, "scaled_dot_product_attention"): + raise ImportError("MirageAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: "Attention", + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + """ + Apply Mirage attention using standard diffusers interface. + + Expected tensor formats from MirageBlock.attn_forward(): + - hidden_states: Image queries with RoPE applied [B, H, L_img, D] + - encoder_hidden_states: Packed key+value tensors [B, H, L_all, 2*D] + (concatenated keys and values from text + image + spatial conditioning) + - attention_mask: Custom attention mask [B, H, L_img, L_all] or None + """ + + if encoder_hidden_states is None: + raise ValueError( + "MirageAttnProcessor2_0 requires 'encoder_hidden_states' containing packed key+value tensors. " + "This should be provided by MirageBlock.attn_forward()." 
+ ) + + # Unpack the combined key+value tensor + # encoder_hidden_states is [B, H, L_all, 2*D] containing [keys, values] + key, value = encoder_hidden_states.chunk(2, dim=-1) # Each [B, H, L_all, D] + + # Apply scaled dot-product attention with Mirage's processed tensors + # hidden_states is image queries [B, H, L_img, D] + attn_output = torch.nn.functional.scaled_dot_product_attention( + hidden_states.contiguous(), key.contiguous(), value.contiguous(), attn_mask=attention_mask + ) + + # Reshape from [B, H, L_img, D] to [B, L_img, H*D] + batch_size, num_heads, seq_len, head_dim = attn_output.shape + attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, num_heads * head_dim) + + # Apply output projection using the diffusers Attention module + attn_output = attn.to_out[0](attn_output) + if len(attn.to_out) > 1: + attn_output = attn.to_out[1](attn_output) # dropout if present + + return attn_output + + ADDED_KV_ATTENTION_PROCESSORS = ( AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, @@ -5653,6 +5710,7 @@ def __new__(cls, *args, **kwargs): PAGHunyuanAttnProcessor2_0, PAGCFGHunyuanAttnProcessor2_0, LuminaAttnProcessor2_0, + MirageAttnProcessor2_0, FusedAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0, diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 39c569cbb26b..0225b9532aff 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -24,6 +24,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ..modeling_utils import ModelMixin from ..modeling_outputs import Transformer2DModelOutput +from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers @@ -159,13 +160,21 @@ def __init__( # img qkv self.img_pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.img_qkv_proj = nn.Linear(hidden_size, hidden_size * 3, bias=False) - self.attn_out = nn.Linear(hidden_size, hidden_size, bias=False) self.qk_norm = QKNorm(self.head_dim) # txt kv self.txt_kv_proj = nn.Linear(hidden_size, hidden_size * 2, bias=False) self.k_norm = RMSNorm(self.head_dim) + self.attention = Attention( + query_dim=hidden_size, + heads=num_heads, + dim_head=self.head_dim, + bias=False, + out_bias=False, + processor=MirageAttnProcessor2_0(), + ) + # mlp self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) @@ -214,15 +223,11 @@ def attn_forward( k = torch.cat((cond_k, k), dim=2) v = torch.cat((cond_v, v), dim=2) - # build additive attention bias - attn_bias: Tensor | None = None - attn_mask: Tensor | None = None - # build multiplicative 0/1 mask for provided attention_mask over [cond?, text, image] keys + attn_mask: Tensor | None = None if attention_mask is not None: bs, _, l_img, _ = img_q.shape l_txt = txt_k.shape[2] - l_all = k.shape[2] assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" assert ( @@ -244,11 +249,13 @@ def attn_forward( # repeat across heads and query positions attn_mask = joint_mask[:, None, None, :].expand(-1, self.num_heads, l_img, -1) # (B,H,L_img,L_all) - attn = torch.nn.functional.scaled_dot_product_attention( - img_q.contiguous(), k.contiguous(), v.contiguous(), attn_mask=attn_mask + kv_packed = torch.cat([k, v], dim=-1) + + attn = self.attention( + 
hidden_states=img_q, + encoder_hidden_states=kv_packed, + attention_mask=attn_mask, ) - attn = rearrange(attn, "B H L D -> B L (H D)") - attn = self.attn_out(attn) return attn @@ -413,6 +420,65 @@ def __init__( self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: """Timestep independent stuff""" txt = self.txt_in(txt) From 866c6de0e3cde370ce134706cf50d1f342064a42 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 12:50:19 +0200 Subject: [PATCH 51/69] use diffusers rmsnorm --- .../models/transformers/transformer_mirage.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 0225b9532aff..f4199da1edcc 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -26,12 +26,12 @@ from ..modeling_outputs import Transformer2DModelOutput from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers +from ..normalization import RMSNorm logger = logging.get_logger(__name__) -# Mirage Layer Components def get_image_ids(bs: int, h: int, w: int, patch_size: int, device: torch.device) -> Tensor: img_ids = torch.zeros(h // patch_size, w // patch_size, 2, device=device) img_ids[..., 0] = torch.arange(h // patch_size, device=device)[:, None] @@ -93,23 +93,13 @@ def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int): - super().__init__() - self.scale = nn.Parameter(torch.ones(dim)) - - def forward(self, x: Tensor) -> Tensor: - x_dtype = x.dtype - x = x.float() - rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) - return (x * rrms * self.scale).to(dtype=x_dtype) class QKNorm(torch.nn.Module): def __init__(self, dim: int): super().__init__() - self.query_norm = RMSNorm(dim) - self.key_norm = RMSNorm(dim) + self.query_norm = RMSNorm(dim, eps=1e-6) + self.key_norm = RMSNorm(dim, eps=1e-6) def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]: q = self.query_norm(q) @@ -164,7 +154,7 @@ def __init__( # txt kv self.txt_kv_proj = nn.Linear(hidden_size, hidden_size * 2, bias=False) - self.k_norm = RMSNorm(self.head_dim) + self.k_norm = RMSNorm(self.head_dim, eps=1e-6) self.attention = Attention( query_dim=hidden_size, From 4e8b647227d013816903271b233027cc8034d2d1 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 14:26:50 +0200 Subject: [PATCH 52/69] use diffusers timestep embedding method --- .../models/transformers/transformer_mirage.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index f4199da1edcc..916559eb47ac 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -27,6 +27,7 @@ from ..attention_processor import Attention, AttentionProcessor, 
MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ..normalization import RMSNorm +from ..embeddings import get_timestep_embedding logger = logging.get_logger(__name__) @@ -71,15 +72,6 @@ def forward(self, ids: Tensor) -> Tensor: return emb.unsqueeze(1) -def timestep_embedding(t: Tensor, dim: int, max_period: int = 10000, time_factor: float = 1000.0) -> Tensor: - t = time_factor * t - half = dim // 2 - freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device) - args = t[:, None].float() * freqs[None] - embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) - if dim % 2: - embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) - return embedding class MLPEmbedder(nn.Module): @@ -480,8 +472,12 @@ def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[T def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: return self.time_in( - timestep_embedding( - t=timestep, dim=256, max_period=self.time_max_period, time_factor=self.time_factor + get_timestep_embedding( + timesteps=timestep, + embedding_dim=256, + max_period=self.time_max_period, + scale=self.time_factor, + flip_sin_to_cos=True # Match original cos, sin order ).to(dtype) ) From 472ad97e410b9d8a46a002ee45583edb8f02061e Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 15:17:11 +0200 Subject: [PATCH 53/69] remove MirageParams --- .../models/transformers/transformer_mirage.py | 64 +++++-------------- .../pipelines/mirage/pipeline_output.py | 2 +- .../test_models_transformer_mirage.py | 8 +-- 3 files changed, 22 insertions(+), 52 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 916559eb47ac..396e000524ec 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -288,20 +288,6 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor: return x -@dataclass -class MirageParams: - in_channels: int - patch_size: int - context_in_dim: int - hidden_size: int - mlp_ratio: float - num_heads: int - depth: int - axes_dim: list[int] - theta: int - time_factor: float = 1000.0 - time_max_period: int = 10_000 - conditioning_block_ids: list[int] | None = None def img2seq(img: Tensor, patch_size: int) -> Tensor: @@ -348,55 +334,39 @@ def __init__( if axes_dim is None: axes_dim = [32, 32] - # Create MirageParams from the provided arguments - params = MirageParams( - in_channels=in_channels, - patch_size=patch_size, - context_in_dim=context_in_dim, - hidden_size=hidden_size, - mlp_ratio=mlp_ratio, - num_heads=num_heads, - depth=depth, - axes_dim=axes_dim, - theta=theta, - time_factor=time_factor, - time_max_period=time_max_period, - conditioning_block_ids=conditioning_block_ids, - ) - - self.params = params - self.in_channels = params.in_channels - self.patch_size = params.patch_size + # Store parameters directly + self.in_channels = in_channels + self.patch_size = patch_size self.out_channels = self.in_channels * self.patch_size**2 - self.time_factor = params.time_factor - self.time_max_period = params.time_max_period + self.time_factor = time_factor + self.time_max_period = time_max_period - if params.hidden_size % params.num_heads != 0: - raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + if hidden_size % 
num_heads != 0: + raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}") - pe_dim = params.hidden_size // params.num_heads + pe_dim = hidden_size // num_heads - if sum(params.axes_dim) != pe_dim: - raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + if sum(axes_dim) != pe_dim: + raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") - self.hidden_size = params.hidden_size - self.num_heads = params.num_heads - self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.hidden_size = hidden_size + self.num_heads = num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim) self.img_in = nn.Linear(self.in_channels * self.patch_size**2, self.hidden_size, bias=True) self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + self.txt_in = nn.Linear(context_in_dim, self.hidden_size) - conditioning_block_ids: list[int] = params.conditioning_block_ids or list(range(params.depth)) + conditioning_block_ids: list[int] = conditioning_block_ids or list(range(depth)) self.blocks = nn.ModuleList( [ MirageBlock( self.hidden_size, self.num_heads, - mlp_ratio=params.mlp_ratio, + mlp_ratio=mlp_ratio, ) - for i in range(params.depth) + for i in range(depth) ] ) diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/mirage/pipeline_output.py index e5cdb2a40924..dfb55821d142 100644 --- a/src/diffusers/pipelines/mirage/pipeline_output.py +++ b/src/diffusers/pipelines/mirage/pipeline_output.py @@ -13,7 +13,7 @@ # limitations under the License. from dataclasses import dataclass -from typing import List, Optional, Union +from typing import List, Union import numpy as np import PIL.Image diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py index 11accdaecbee..5e7b0bd165a6 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -17,7 +17,7 @@ import torch -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel, MirageParams +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel from ...testing_utils import enable_full_determinism, torch_device from ..test_modeling_common import ModelTesterMixin @@ -88,9 +88,9 @@ def test_forward_signature(self): self.assertIsNotNone(outputs) expected_shape = inputs_dict["image_latent"].shape - self.assertEqual(outputs.shape, expected_shape) + self.assertEqual(outputs.sample.shape, expected_shape) - def test_mirage_params_initialization(self): + def test_model_initialization(self): # Test model initialization model = MirageTransformer2DModel( in_channels=16, @@ -196,7 +196,7 @@ def test_attention_mask(self): self.assertIsNotNone(outputs) expected_shape = inputs_dict["image_latent"].shape - self.assertEqual(outputs.shape, expected_shape) + self.assertEqual(outputs.sample.shape, expected_shape) def test_invalid_config(self): # Test invalid configuration - hidden_size not divisible by num_heads From 97a231e3561e1817d46fb7a0c7840423dc76ec99 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 16:35:56 +0200 Subject: [PATCH 54/69] checkpoint conversion script --- scripts/convert_mirage_to_diffusers.py | 312 +++++++++++++++++++++++++ 1 file changed, 312 insertions(+) create mode 100644 
scripts/convert_mirage_to_diffusers.py diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py new file mode 100644 index 000000000000..85716e69ff92 --- /dev/null +++ b/scripts/convert_mirage_to_diffusers.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +""" +Script to convert Mirage checkpoint from original codebase to diffusers format. +""" + +import argparse +import json +import os +import shutil +import sys +import torch +from safetensors.torch import save_file +from transformers import GemmaTokenizerFast + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from diffusers.pipelines.mirage import MiragePipeline + +def load_reference_config(vae_type: str) -> dict: + """Load transformer config from existing pipeline checkpoint.""" + + if vae_type == "flux": + config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated/transformer/config.json" + elif vae_type == "dc-ae": + config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated/transformer/config.json" + else: + raise ValueError(f"Unsupported VAE type: {vae_type}. Use 'flux' or 'dc-ae'") + + if not os.path.exists(config_path): + raise FileNotFoundError(f"Reference config not found: {config_path}") + + with open(config_path, 'r') as f: + config = json.load(f) + + print(f"✓ Loaded {vae_type} config: in_channels={config['in_channels']}") + return config + +def create_parameter_mapping() -> dict: + """Create mapping from old parameter names to new diffusers names.""" + + # Key mappings for structural changes + mapping = {} + + # RMSNorm: scale -> weight + for i in range(16): # 16 layers + mapping[f"blocks.{i}.qk_norm.query_norm.scale"] = f"blocks.{i}.qk_norm.query_norm.weight" + mapping[f"blocks.{i}.qk_norm.key_norm.scale"] = f"blocks.{i}.qk_norm.key_norm.weight" + mapping[f"blocks.{i}.k_norm.scale"] = f"blocks.{i}.k_norm.weight" + + # Attention: attn_out -> attention.to_out.0 + mapping[f"blocks.{i}.attn_out.weight"] = f"blocks.{i}.attention.to_out.0.weight" + + return mapping + +def convert_checkpoint_parameters(old_state_dict: dict) -> dict: + """Convert old checkpoint parameters to new diffusers format.""" + + print("Converting checkpoint parameters...") + + mapping = create_parameter_mapping() + converted_state_dict = {} + + # First, print available keys to understand structure + print("Available keys in checkpoint:") + for key in sorted(old_state_dict.keys())[:10]: # Show first 10 keys + print(f" {key}") + if len(old_state_dict) > 10: + print(f" ... 
and {len(old_state_dict) - 10} more") + + for key, value in old_state_dict.items(): + new_key = key + + # Apply specific mappings if needed + if key in mapping: + new_key = mapping[key] + print(f" Mapped: {key} -> {new_key}") + + # Handle img_qkv_proj -> split to to_q, to_k, to_v + if "img_qkv_proj.weight" in key: + print(f" Found QKV projection: {key}") + # Split QKV weight into separate Q, K, V projections + qkv_weight = value + hidden_size = qkv_weight.shape[1] + q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0) + + # Extract layer number from key (e.g., blocks.0.img_qkv_proj.weight -> 0) + parts = key.split('.') + layer_idx = None + for i, part in enumerate(parts): + if part == 'blocks' and i + 1 < len(parts) and parts[i+1].isdigit(): + layer_idx = parts[i+1] + break + + if layer_idx is not None: + converted_state_dict[f"blocks.{layer_idx}.attention.to_q.weight"] = q_weight + converted_state_dict[f"blocks.{layer_idx}.attention.to_k.weight"] = k_weight + converted_state_dict[f"blocks.{layer_idx}.attention.to_v.weight"] = v_weight + print(f" Split QKV for layer {layer_idx}") + + # Also keep the original img_qkv_proj for backward compatibility + converted_state_dict[new_key] = value + else: + converted_state_dict[new_key] = value + + print(f"✓ Converted {len(converted_state_dict)} parameters") + return converted_state_dict + + +def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> MirageTransformer2DModel: + """Create and load MirageTransformer2DModel from old checkpoint.""" + + print(f"Loading checkpoint from: {checkpoint_path}") + + # Load old checkpoint + if not os.path.exists(checkpoint_path): + raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}") + + old_checkpoint = torch.load(checkpoint_path, map_location='cpu') + + # Handle different checkpoint formats + if isinstance(old_checkpoint, dict): + if 'model' in old_checkpoint: + state_dict = old_checkpoint['model'] + elif 'state_dict' in old_checkpoint: + state_dict = old_checkpoint['state_dict'] + else: + state_dict = old_checkpoint + else: + state_dict = old_checkpoint + + print(f"✓ Loaded checkpoint with {len(state_dict)} parameters") + + # Convert parameter names if needed + converted_state_dict = convert_checkpoint_parameters(state_dict) + + # Create transformer with config + print("Creating MirageTransformer2DModel...") + transformer = MirageTransformer2DModel(**config) + + # Load state dict + print("Loading converted parameters...") + missing_keys, unexpected_keys = transformer.load_state_dict(converted_state_dict, strict=False) + + if missing_keys: + print(f"⚠ Missing keys: {missing_keys}") + if unexpected_keys: + print(f"⚠ Unexpected keys: {unexpected_keys}") + + if not missing_keys and not unexpected_keys: + print("✓ All parameters loaded successfully!") + + return transformer + +def copy_pipeline_components(vae_type: str, output_path: str): + """Copy VAE, scheduler, text encoder, and tokenizer from reference pipeline.""" + + if vae_type == "flux": + ref_pipeline = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated" + else: # dc-ae + ref_pipeline = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated" + + components = ["vae", "scheduler", "text_encoder", "tokenizer"] + + for component in components: + src_path = os.path.join(ref_pipeline, component) + dst_path = os.path.join(output_path, component) + + if os.path.exists(src_path): + if 
os.path.isdir(src_path): + shutil.copytree(src_path, dst_path, dirs_exist_ok=True) + else: + shutil.copy2(src_path, dst_path) + print(f"✓ Copied {component}") + else: + print(f"⚠ Component not found: {src_path}") + +def create_model_index(vae_type: str, output_path: str): + """Create model_index.json for the pipeline.""" + + if vae_type == "flux": + vae_class = "AutoencoderKL" + else: # dc-ae + vae_class = "AutoencoderDC" + + model_index = { + "_class_name": "MiragePipeline", + "_diffusers_version": "0.31.0.dev0", + "_name_or_path": os.path.basename(output_path), + "scheduler": [ + "diffusers", + "FlowMatchEulerDiscreteScheduler" + ], + "text_encoder": [ + "transformers", + "T5GemmaEncoder" + ], + "tokenizer": [ + "transformers", + "GemmaTokenizerFast" + ], + "transformer": [ + "diffusers", + "MirageTransformer2DModel" + ], + "vae": [ + "diffusers", + vae_class + ] + } + + model_index_path = os.path.join(output_path, "model_index.json") + with open(model_index_path, 'w') as f: + json.dump(model_index, f, indent=2) + + print(f"✓ Created model_index.json") + +def main(args): + # Validate inputs + if not os.path.exists(args.checkpoint_path): + raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint_path}") + + # Load reference config based on VAE type + config = load_reference_config(args.vae_type) + + # Create output directory + os.makedirs(args.output_path, exist_ok=True) + print(f"✓ Output directory: {args.output_path}") + + # Create transformer from checkpoint + transformer = create_transformer_from_checkpoint(args.checkpoint_path, config) + + # Save transformer + transformer_path = os.path.join(args.output_path, "transformer") + os.makedirs(transformer_path, exist_ok=True) + + # Save config + with open(os.path.join(transformer_path, "config.json"), 'w') as f: + json.dump(config, f, indent=2) + + # Save model weights as safetensors + state_dict = transformer.state_dict() + save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) + print(f"✓ Saved transformer to {transformer_path}") + + # Copy other pipeline components + copy_pipeline_components(args.vae_type, args.output_path) + + # Create model index + create_model_index(args.vae_type, args.output_path) + + # Verify the pipeline can be loaded + try: + pipeline = MiragePipeline.from_pretrained(args.output_path) + print(f"Pipeline loaded successfully!") + print(f"Transformer: {type(pipeline.transformer).__name__}") + print(f"VAE: {type(pipeline.vae).__name__}") + print(f"Text Encoder: {type(pipeline.text_encoder).__name__}") + print(f"Scheduler: {type(pipeline.scheduler).__name__}") + + # Display model info + num_params = sum(p.numel() for p in pipeline.transformer.parameters()) + print(f"✓ Transformer parameters: {num_params:,}") + + except Exception as e: + print(f"Pipeline verification failed: {e}") + return False + + print("Conversion completed successfully!") + print(f"Converted pipeline saved to: {args.output_path}") + print(f"VAE type: {args.vae_type}") + + + return True + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert Mirage checkpoint to diffusers format") + + parser.add_argument( + "--checkpoint_path", + type=str, + required=True, + help="Path to the original Mirage checkpoint (.pth file)" + ) + + parser.add_argument( + "--output_path", + type=str, + required=True, + help="Output directory for the converted diffusers pipeline" + ) + + parser.add_argument( + "--vae_type", + type=str, + choices=["flux", "dc-ae"], + required=True, + help="VAE type to 
use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 channels)" + ) + + args = parser.parse_args() + + try: + success = main(args) + if not success: + sys.exit(1) + except Exception as e: + print(f"❌ Conversion failed: {e}") + import traceback + traceback.print_exc() + sys.exit(1) \ No newline at end of file From 35d721f79bbb518ea5c1e209f71c0fd80cba9434 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Fri, 26 Sep 2025 17:00:55 +0200 Subject: [PATCH 55/69] ruff formating --- scripts/convert_mirage_to_diffusers.py | 83 ++++++++----------- .../models/transformers/transformer_mirage.py | 41 ++++----- src/diffusers/pipelines/mirage/__init__.py | 3 +- .../pipelines/mirage/pipeline_mirage.py | 50 +++++++---- .../pipelines/mirage/pipeline_output.py | 2 +- .../test_models_transformer_mirage.py | 30 +++---- 6 files changed, 100 insertions(+), 109 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index 85716e69ff92..5e2a2ff768f4 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -8,16 +8,17 @@ import os import shutil import sys + import torch from safetensors.torch import save_file -from transformers import GemmaTokenizerFast -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel -from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.pipelines.mirage import MiragePipeline + def load_reference_config(vae_type: str) -> dict: """Load transformer config from existing pipeline checkpoint.""" @@ -31,12 +32,13 @@ def load_reference_config(vae_type: str) -> dict: if not os.path.exists(config_path): raise FileNotFoundError(f"Reference config not found: {config_path}") - with open(config_path, 'r') as f: + with open(config_path, "r") as f: config = json.load(f) print(f"✓ Loaded {vae_type} config: in_channels={config['in_channels']}") return config + def create_parameter_mapping() -> dict: """Create mapping from old parameter names to new diffusers names.""" @@ -54,6 +56,7 @@ def create_parameter_mapping() -> dict: return mapping + def convert_checkpoint_parameters(old_state_dict: dict) -> dict: """Convert old checkpoint parameters to new diffusers format.""" @@ -82,15 +85,14 @@ def convert_checkpoint_parameters(old_state_dict: dict) -> dict: print(f" Found QKV projection: {key}") # Split QKV weight into separate Q, K, V projections qkv_weight = value - hidden_size = qkv_weight.shape[1] q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0) # Extract layer number from key (e.g., blocks.0.img_qkv_proj.weight -> 0) - parts = key.split('.') + parts = key.split(".") layer_idx = None for i, part in enumerate(parts): - if part == 'blocks' and i + 1 < len(parts) and parts[i+1].isdigit(): - layer_idx = parts[i+1] + if part == "blocks" and i + 1 < len(parts) and parts[i + 1].isdigit(): + layer_idx = parts[i + 1] break if layer_idx is not None: @@ -117,14 +119,14 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi if not os.path.exists(checkpoint_path): raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}") - old_checkpoint = torch.load(checkpoint_path, map_location='cpu') + old_checkpoint = torch.load(checkpoint_path, map_location="cpu") # Handle different checkpoint formats if isinstance(old_checkpoint, dict): - if 'model' in 
old_checkpoint: - state_dict = old_checkpoint['model'] - elif 'state_dict' in old_checkpoint: - state_dict = old_checkpoint['state_dict'] + if "model" in old_checkpoint: + state_dict = old_checkpoint["model"] + elif "state_dict" in old_checkpoint: + state_dict = old_checkpoint["state_dict"] else: state_dict = old_checkpoint else: @@ -153,6 +155,7 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi return transformer + def copy_pipeline_components(vae_type: str, output_path: str): """Copy VAE, scheduler, text encoder, and tokenizer from reference pipeline.""" @@ -176,6 +179,7 @@ def copy_pipeline_components(vae_type: str, output_path: str): else: print(f"⚠ Component not found: {src_path}") + def create_model_index(vae_type: str, output_path: str): """Create model_index.json for the pipeline.""" @@ -188,33 +192,19 @@ def create_model_index(vae_type: str, output_path: str): "_class_name": "MiragePipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), - "scheduler": [ - "diffusers", - "FlowMatchEulerDiscreteScheduler" - ], - "text_encoder": [ - "transformers", - "T5GemmaEncoder" - ], - "tokenizer": [ - "transformers", - "GemmaTokenizerFast" - ], - "transformer": [ - "diffusers", - "MirageTransformer2DModel" - ], - "vae": [ - "diffusers", - vae_class - ] + "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], + "text_encoder": ["transformers", "T5GemmaEncoder"], + "tokenizer": ["transformers", "GemmaTokenizerFast"], + "transformer": ["diffusers", "MirageTransformer2DModel"], + "vae": ["diffusers", vae_class], } model_index_path = os.path.join(output_path, "model_index.json") - with open(model_index_path, 'w') as f: + with open(model_index_path, "w") as f: json.dump(model_index, f, indent=2) - print(f"✓ Created model_index.json") + print("✓ Created model_index.json") + def main(args): # Validate inputs @@ -236,7 +226,7 @@ def main(args): os.makedirs(transformer_path, exist_ok=True) # Save config - with open(os.path.join(transformer_path, "config.json"), 'w') as f: + with open(os.path.join(transformer_path, "config.json"), "w") as f: json.dump(config, f, indent=2) # Save model weights as safetensors @@ -253,7 +243,7 @@ def main(args): # Verify the pipeline can be loaded try: pipeline = MiragePipeline.from_pretrained(args.output_path) - print(f"Pipeline loaded successfully!") + print("Pipeline loaded successfully!") print(f"Transformer: {type(pipeline.transformer).__name__}") print(f"VAE: {type(pipeline.vae).__name__}") print(f"Text Encoder: {type(pipeline.text_encoder).__name__}") @@ -271,24 +261,18 @@ def main(args): print(f"Converted pipeline saved to: {args.output_path}") print(f"VAE type: {args.vae_type}") - return True + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Convert Mirage checkpoint to diffusers format") parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="Path to the original Mirage checkpoint (.pth file)" + "--checkpoint_path", type=str, required=True, help="Path to the original Mirage checkpoint (.pth file)" ) parser.add_argument( - "--output_path", - type=str, - required=True, - help="Output directory for the converted diffusers pipeline" + "--output_path", type=str, required=True, help="Output directory for the converted diffusers pipeline" ) parser.add_argument( @@ -296,7 +280,7 @@ def main(args): type=str, choices=["flux", "dc-ae"], required=True, - help="VAE type to use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 
channels)" + help="VAE type to use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 channels)", ) args = parser.parse_args() @@ -306,7 +290,8 @@ def main(args): if not success: sys.exit(1) except Exception as e: - print(f"❌ Conversion failed: {e}") + print(f"Conversion failed: {e}") import traceback + traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 396e000524ec..923d44d4f1ec 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -13,21 +13,21 @@ # limitations under the License. from dataclasses import dataclass -from typing import Any, Dict, Optional, Union, Tuple +from typing import Any, Dict, Optional, Tuple, Union + import torch -import math -from torch import Tensor, nn -from torch.nn.functional import fold, unfold from einops import rearrange from einops.layers.torch import Rearrange +from torch import Tensor, nn +from torch.nn.functional import fold, unfold from ...configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin -from ..modeling_outputs import Transformer2DModelOutput -from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers -from ..normalization import RMSNorm +from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 from ..embeddings import get_timestep_embedding +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin +from ..normalization import RMSNorm logger = logging.get_logger(__name__) @@ -72,8 +72,6 @@ def forward(self, ids: Tensor) -> Tensor: return emb.unsqueeze(1) - - class MLPEmbedder(nn.Module): def __init__(self, in_dim: int, hidden_dim: int): super().__init__() @@ -85,8 +83,6 @@ def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) - - class QKNorm(torch.nn.Module): def __init__(self, dim: int): super().__init__() @@ -157,7 +153,6 @@ def __init__( processor=MirageAttnProcessor2_0(), ) - # mlp self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.gate_proj = nn.Linear(hidden_size, self.mlp_hidden_dim, bias=False) @@ -212,9 +207,9 @@ def attn_forward( l_txt = txt_k.shape[2] assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" - assert ( - attention_mask.shape[-1] == l_txt - ), f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + assert attention_mask.shape[-1] == l_txt, ( + f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + ) device = img_q.device @@ -234,8 +229,8 @@ def attn_forward( kv_packed = torch.cat([k, v], dim=-1) attn = self.attention( - hidden_states=img_q, - encoder_hidden_states=kv_packed, + hidden_states=img_q, + encoder_hidden_states=kv_packed, attention_mask=attn_mask, ) @@ -288,8 +283,6 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor: return x - - def img2seq(img: Tensor, patch_size: int) -> Tensor: """Flatten an image into a sequence of patches""" return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2) @@ -327,7 +320,7 @@ def __init__( time_factor: float = 1000.0, time_max_period: int = 10000, conditioning_block_ids: list = None, - 
**kwargs + **kwargs, ): super().__init__() @@ -447,7 +440,7 @@ def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Te embedding_dim=256, max_period=self.time_max_period, scale=self.time_factor, - flip_sin_to_cos=True # Match original cos, sin order + flip_sin_to_cos=True, # Match original cos, sin order ).to(dtype) ) @@ -470,9 +463,7 @@ def forward_transformers( vec = self.compute_timestep_embedding(timestep, dtype=img.dtype) for block in self.blocks: - img = block( - img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs - ) + img = block(img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs) img = self.final_layer(img, vec) return img diff --git a/src/diffusers/pipelines/mirage/__init__.py b/src/diffusers/pipelines/mirage/__init__.py index 4fd8ad191b3f..cba951057370 100644 --- a/src/diffusers/pipelines/mirage/__init__.py +++ b/src/diffusers/pipelines/mirage/__init__.py @@ -1,4 +1,5 @@ from .pipeline_mirage import MiragePipeline from .pipeline_output import MiragePipelineOutput -__all__ = ["MiragePipeline", "MiragePipelineOutput"] \ No newline at end of file + +__all__ = ["MiragePipeline", "MiragePipelineOutput"] diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index 126eab07977c..c4a4783c5f38 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import html import inspect import os -from typing import Any, Callable, Dict, List, Optional, Union - -import html import re import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union import ftfy import torch @@ -31,7 +30,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, AutoencoderDC +from ...models import AutoencoderDC, AutoencoderKL from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -41,6 +40,7 @@ from ..pipeline_utils import DiffusionPipeline from .pipeline_output import MiragePipelineOutput + try: from ...models.transformers.transformer_mirage import MirageTransformer2DModel except ImportError: @@ -55,7 +55,19 @@ class TextPreprocessor: def __init__(self): """Initialize text preprocessor.""" self.bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + r"\\" + r"\/" + r"\*" + r"]{1,}" + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + r"\\" + + r"\/" + + r"\*" + + r"]{1,}" ) def clean_text(self, text: str) -> str: @@ -93,7 +105,7 @@ def clean_text(self, text: str) -> str: ) # кавычки к одному стандарту - text = re.sub(r"[`´«»""¨]", '"', text) + text = re.sub(r"[`´«»" "¨]", '"', text) text = re.sub(r"['']", "'", text) # " and & @@ -243,9 +255,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P """ # Ensure T5GemmaEncoder is available for loading import transformers - if not hasattr(transformers, 'T5GemmaEncoder'): + + if not hasattr(transformers, "T5GemmaEncoder"): try: from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder + transformers.T5GemmaEncoder = T5GemmaEncoder except ImportError: # T5GemmaEncoder not available in this 
transformers version @@ -254,7 +268,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Proceed with standard loading return super().from_pretrained(pretrained_model_name_or_path, **kwargs) - def __init__( self, transformer: MirageTransformer2DModel, @@ -333,7 +346,7 @@ def _enhance_vae_properties(self): if hasattr(self.vae, "spatial_compression_ratio") and self.vae.spatial_compression_ratio == 32: self.vae.latent_channels = 32 # DC-AE default else: - self.vae.latent_channels = 4 # AutoencoderKL default + self.vae.latent_channels = 4 # AutoencoderKL default @property def vae_scale_factor(self): @@ -353,7 +366,10 @@ def prepare_latents( ): """Prepare initial latents for the diffusion process.""" if latents is None: - latent_height, latent_width = height // self.vae.spatial_compression_ratio, width // self.vae.spatial_compression_ratio + latent_height, latent_width = ( + height // self.vae.spatial_compression_ratio, + width // self.vae.spatial_compression_ratio, + ) shape = (batch_size, num_channels_latents, latent_height, latent_width) latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) else: @@ -424,7 +440,9 @@ def check_inputs( ): """Check that all inputs are in correct format.""" if height % self.vae.spatial_compression_ratio != 0 or width % self.vae.spatial_compression_ratio != 0: - raise ValueError(f"`height` and `width` have to be divisible by {self.vae.spatial_compression_ratio} but are {height} and {width}.") + raise ValueError( + f"`height` and `width` have to be divisible by {self.vae.spatial_compression_ratio} but are {height} and {width}." + ) if guidance_scale < 1.0: raise ValueError(f"guidance_scale has to be >= 1.0 but is {guidance_scale}") @@ -584,12 +602,16 @@ def __call__( # Forward through transformer layers img_seq = self.transformer.forward_transformers( - img_seq, txt, time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), - pe=pe, attention_mask=ca_mask + img_seq, + txt, + time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), + pe=pe, + attention_mask=ca_mask, ) # Convert back to image format from ...models.transformers.transformer_mirage import seq2img + noise_both = seq2img(img_seq, self.transformer.patch_size, latents_in.shape) # Apply CFG @@ -626,4 +648,4 @@ def __call__( if not return_dict: return (image,) - return MiragePipelineOutput(images=image) \ No newline at end of file + return MiragePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/mirage/pipeline_output.py index dfb55821d142..e41c8e3bea00 100644 --- a/src/diffusers/pipelines/mirage/pipeline_output.py +++ b/src/diffusers/pipelines/mirage/pipeline_output.py @@ -32,4 +32,4 @@ class MiragePipelineOutput(BaseOutput): num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
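Because the diff above only shows fragments of `__call__`, here is a minimal, illustrative sketch of how the finished pipeline and its `MiragePipelineOutput` would be consumed. The checkpoint directory and the prompt are placeholders, and the keyword names other than `height`, `width`, `guidance_scale` and `return_dict` follow the usual diffusers text-to-image convention rather than anything confirmed by this patch.

# Illustrative usage sketch, not part of the patch. "./mirage-flux-converted" stands in
# for a directory produced by scripts/convert_mirage_to_diffusers.py.
import torch
from diffusers.pipelines.mirage import MiragePipeline

pipe = MiragePipeline.from_pretrained("./mirage-flux-converted")
pipe.to("cuda" if torch.cuda.is_available() else "cpu")

result = pipe("a watercolor fox in a forest", height=512, width=512, guidance_scale=4.0)
result.images[0].save("fox.png")   # MiragePipelineOutput.images holds PIL images (or a numpy array)

# With return_dict=False the call returns a plain tuple instead of the dataclass:
(images,) = pipe("a watercolor fox in a forest", return_dict=False)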
""" - images: Union[List[PIL.Image.Image], np.ndarray] \ No newline at end of file + images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py index 5e7b0bd165a6..0085627aa7e4 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -133,8 +133,7 @@ def test_process_inputs(self): with torch.no_grad(): img_seq, txt, pe = model.process_inputs( - inputs_dict["image_latent"], - inputs_dict["cross_attn_conditioning"] + inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) # Check shapes @@ -144,7 +143,9 @@ def test_process_inputs(self): expected_seq_len = (height // patch_size) * (width // patch_size) self.assertEqual(img_seq.shape, (batch_size, expected_seq_len, init_dict["in_channels"] * patch_size**2)) - self.assertEqual(txt.shape, (batch_size, inputs_dict["cross_attn_conditioning"].shape[1], init_dict["hidden_size"])) + self.assertEqual( + txt.shape, (batch_size, inputs_dict["cross_attn_conditioning"].shape[1], init_dict["hidden_size"]) + ) # Check that pe has the correct batch size, sequence length and some embedding dimension self.assertEqual(pe.shape[0], batch_size) # batch size self.assertEqual(pe.shape[1], 1) # unsqueeze(1) in EmbedND @@ -160,20 +161,14 @@ def test_forward_transformers(self): with torch.no_grad(): # Process inputs first img_seq, txt, pe = model.process_inputs( - inputs_dict["image_latent"], - inputs_dict["cross_attn_conditioning"] + inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) # Test forward_transformers - output_seq = model.forward_transformers( - img_seq, - txt, - timestep=inputs_dict["timestep"], - pe=pe - ) + output_seq = model.forward_transformers(img_seq, txt, timestep=inputs_dict["timestep"], pe=pe) # Check output shape - expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"]**2 + expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"] ** 2 self.assertEqual(output_seq.shape, (img_seq.shape[0], img_seq.shape[1], expected_out_channels)) def test_attention_mask(self): @@ -186,13 +181,10 @@ def test_attention_mask(self): batch_size = inputs_dict["cross_attn_conditioning"].shape[0] seq_len = inputs_dict["cross_attn_conditioning"].shape[1] attention_mask = torch.ones((batch_size, seq_len), dtype=torch.bool).to(torch_device) - attention_mask[:, seq_len//2:] = False # Mask second half + attention_mask[:, seq_len // 2 :] = False # Mask second half with torch.no_grad(): - outputs = model( - **inputs_dict, - cross_attn_mask=attention_mask - ) + outputs = model(**inputs_dict, cross_attn_mask=attention_mask) self.assertIsNotNone(outputs) expected_shape = inputs_dict["image_latent"].shape @@ -237,7 +229,7 @@ def test_gradient_checkpointing_enable(self): # Check that _activation_checkpointing is set for block in model.blocks: - self.assertTrue(hasattr(block, '_activation_checkpointing')) + self.assertTrue(hasattr(block, "_activation_checkpointing")) def test_from_config(self): init_dict, _ = self.prepare_init_args_and_inputs_for_common() @@ -249,4 +241,4 @@ def test_from_config(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 775a115dcc70e9683eb5eb2e07baa4ba3cc941af Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 19:05:51 +0000 Subject: [PATCH 56/69] remove dependencies to old checkpoints --- 
scripts/convert_mirage_to_diffusers.py | 229 +++++++++++++++++++++---- 1 file changed, 192 insertions(+), 37 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index 5e2a2ff768f4..eb6de1a37481 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -6,11 +6,12 @@ import argparse import json import os -import shutil import sys import torch from safetensors.torch import save_file +from dataclasses import dataclass, asdict +from typing import Tuple, Dict sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) @@ -18,35 +19,53 @@ from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel from diffusers.pipelines.mirage import MiragePipeline +@dataclass(frozen=True) +class MirageBase: + context_in_dim: int = 2304 + hidden_size: int = 1792 + mlp_ratio: float = 3.5 + num_heads: int = 28 + depth: int = 16 + axes_dim: Tuple[int, int] = (32, 32) + theta: int = 10_000 + time_factor: float = 1000.0 + time_max_period: int = 10_000 -def load_reference_config(vae_type: str) -> dict: - """Load transformer config from existing pipeline checkpoint.""" +@dataclass(frozen=True) +class MirageFlux(MirageBase): + in_channels: int = 16 + patch_size: int = 2 + + +@dataclass(frozen=True) +class MirageDCAE(MirageBase): + in_channels: int = 32 + patch_size: int = 1 + + +def build_config(vae_type: str) -> dict: if vae_type == "flux": - config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated/transformer/config.json" + cfg = MirageFlux() elif vae_type == "dc-ae": - config_path = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated/transformer/config.json" + cfg = MirageDCAE() else: raise ValueError(f"Unsupported VAE type: {vae_type}. 
Use 'flux' or 'dc-ae'") - if not os.path.exists(config_path): - raise FileNotFoundError(f"Reference config not found: {config_path}") - - with open(config_path, "r") as f: - config = json.load(f) + config_dict = asdict(cfg) + config_dict["axes_dim"] = list(config_dict["axes_dim"]) # type: ignore[index] + return config_dict - print(f"✓ Loaded {vae_type} config: in_channels={config['in_channels']}") - return config -def create_parameter_mapping() -> dict: +def create_parameter_mapping(depth: int) -> dict: """Create mapping from old parameter names to new diffusers names.""" # Key mappings for structural changes mapping = {} # RMSNorm: scale -> weight - for i in range(16): # 16 layers + for i in range(depth): mapping[f"blocks.{i}.qk_norm.query_norm.scale"] = f"blocks.{i}.qk_norm.query_norm.weight" mapping[f"blocks.{i}.qk_norm.key_norm.scale"] = f"blocks.{i}.qk_norm.key_norm.weight" mapping[f"blocks.{i}.k_norm.scale"] = f"blocks.{i}.k_norm.weight" @@ -57,12 +76,12 @@ def create_parameter_mapping() -> dict: return mapping -def convert_checkpoint_parameters(old_state_dict: dict) -> dict: +def convert_checkpoint_parameters(old_state_dict: Dict[str, torch.Tensor], depth: int) -> Dict[str, torch.Tensor]: """Convert old checkpoint parameters to new diffusers format.""" print("Converting checkpoint parameters...") - mapping = create_parameter_mapping() + mapping = create_parameter_mapping(depth) converted_state_dict = {} # First, print available keys to understand structure @@ -135,7 +154,8 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi print(f"✓ Loaded checkpoint with {len(state_dict)} parameters") # Convert parameter names if needed - converted_state_dict = convert_checkpoint_parameters(state_dict) + model_depth = int(config.get("depth", 16)) + converted_state_dict = convert_checkpoint_parameters(state_dict, depth=model_depth) # Create transformer with config print("Creating MirageTransformer2DModel...") @@ -156,28 +176,164 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi return transformer -def copy_pipeline_components(vae_type: str, output_path: str): - """Copy VAE, scheduler, text encoder, and tokenizer from reference pipeline.""" + + +def create_scheduler_config(output_path: str): + """Create FlowMatchEulerDiscreteScheduler config.""" + + scheduler_config = { + "_class_name": "FlowMatchEulerDiscreteScheduler", + "num_train_timesteps": 1000, + "shift": 1.0 + } + + scheduler_path = os.path.join(output_path, "scheduler") + os.makedirs(scheduler_path, exist_ok=True) + + with open(os.path.join(scheduler_path, "scheduler_config.json"), "w") as f: + json.dump(scheduler_config, f, indent=2) + + print("✓ Created scheduler config") + + +def create_vae_config(vae_type: str, output_path: str): + """Create VAE config based on type.""" if vae_type == "flux": - ref_pipeline = "/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_fluxvae_gemmaT5_updated" + vae_config = { + "_class_name": "AutoencoderKL", + "latent_channels": 16, + "block_out_channels": [128, 256, 512, 512], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D" + ], + "up_block_types": [ + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D" + ], + "scaling_factor": 0.3611, + "shift_factor": 0.1159, + "use_post_quant_conv": False, + "use_quant_conv": False + } else: # dc-ae - ref_pipeline = 
"/raid/shared/storage/home/davidb/diffusers/diffusers_pipeline_checkpoints/pipeline_checkpoint_dcae_gemmaT5_updated" + vae_config = { + "_class_name": "AutoencoderDC", + "latent_channels": 32, + "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], + "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], + "encoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock" + ], + "decoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock", + "EfficientViTBlock", + "EfficientViTBlock" + ], + "encoder_layers_per_block": [2, 2, 2, 3, 3, 3], + "decoder_layers_per_block": [3, 3, 3, 3, 3, 3], + "encoder_qkv_multiscales": [[], [], [], [5], [5], [5]], + "decoder_qkv_multiscales": [[], [], [], [5], [5], [5]], + "scaling_factor": 0.41407, + "upsample_block_type": "interpolate" + } + + vae_path = os.path.join(output_path, "vae") + os.makedirs(vae_path, exist_ok=True) + + with open(os.path.join(vae_path, "config.json"), "w") as f: + json.dump(vae_config, f, indent=2) + + print("✓ Created VAE config") + + +def create_text_encoder_config(output_path: str): + """Create T5GemmaEncoder config.""" + + text_encoder_config = { + "model_name": "google/t5gemma-2b-2b-ul2", + "model_max_length": 256, + "use_attn_mask": True, + "use_last_hidden_state": True + } - components = ["vae", "scheduler", "text_encoder", "tokenizer"] + text_encoder_path = os.path.join(output_path, "text_encoder") + os.makedirs(text_encoder_path, exist_ok=True) + + with open(os.path.join(text_encoder_path, "config.json"), "w") as f: + json.dump(text_encoder_config, f, indent=2) + + print("✓ Created text encoder config") + + +def create_tokenizer_config(output_path: str): + """Create GemmaTokenizerFast config and files.""" + + tokenizer_config = { + "add_bos_token": False, + "add_eos_token": False, + "added_tokens_decoder": { + "0": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "1": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "2": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "3": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "106": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, + "107": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True} + }, + "additional_special_tokens": ["", ""], + "bos_token": "", + "clean_up_tokenization_spaces": False, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": False, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": False + } - for component in components: - src_path = os.path.join(ref_pipeline, component) - dst_path = os.path.join(output_path, component) + special_tokens_map = { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "" + } - if os.path.exists(src_path): - if os.path.isdir(src_path): - shutil.copytree(src_path, dst_path, dirs_exist_ok=True) - else: - shutil.copy2(src_path, dst_path) - print(f"✓ Copied {component}") - else: - print(f"⚠ Component not found: {src_path}") + tokenizer_path = os.path.join(output_path, 
"tokenizer") + os.makedirs(tokenizer_path, exist_ok=True) + + with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w") as f: + json.dump(tokenizer_config, f, indent=2) + + with open(os.path.join(tokenizer_path, "special_tokens_map.json"), "w") as f: + json.dump(special_tokens_map, f, indent=2) + + print("✓ Created tokenizer config (Note: tokenizer.json and tokenizer.model files need to be provided separately)") + + +def create_pipeline_components(vae_type: str, output_path: str): + """Create all pipeline components with proper configs.""" + + create_scheduler_config(output_path) + create_vae_config(vae_type, output_path) + create_text_encoder_config(output_path) + create_tokenizer_config(output_path) def create_model_index(vae_type: str, output_path: str): @@ -211,8 +367,7 @@ def main(args): if not os.path.exists(args.checkpoint_path): raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint_path}") - # Load reference config based on VAE type - config = load_reference_config(args.vae_type) + config = build_config(args.vae_type) # Create output directory os.makedirs(args.output_path, exist_ok=True) @@ -234,8 +389,8 @@ def main(args): save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) print(f"✓ Saved transformer to {transformer_path}") - # Copy other pipeline components - copy_pipeline_components(args.vae_type, args.output_path) + # Create other pipeline components + create_pipeline_components(args.vae_type, args.output_path) # Create model index create_model_index(args.vae_type, args.output_path) From 1c6c25cf1d9588c25362a997a9dfb2d886bf389c Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 22:30:58 +0200 Subject: [PATCH 57/69] remove old checkpoints dependency --- scripts/convert_mirage_to_diffusers.py | 170 ++---------------- .../pipelines/mirage/pipeline_mirage.py | 68 +++++-- 2 files changed, 63 insertions(+), 175 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index eb6de1a37481..2ddb708bc704 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -84,13 +84,6 @@ def convert_checkpoint_parameters(old_state_dict: Dict[str, torch.Tensor], depth mapping = create_parameter_mapping(depth) converted_state_dict = {} - # First, print available keys to understand structure - print("Available keys in checkpoint:") - for key in sorted(old_state_dict.keys())[:10]: # Show first 10 keys - print(f" {key}") - if len(old_state_dict) > 10: - print(f" ... 
and {len(old_state_dict) - 10} more") - for key, value in old_state_dict.items(): new_key = key @@ -196,172 +189,37 @@ def create_scheduler_config(output_path: str): print("✓ Created scheduler config") -def create_vae_config(vae_type: str, output_path: str): - """Create VAE config based on type.""" - - if vae_type == "flux": - vae_config = { - "_class_name": "AutoencoderKL", - "latent_channels": 16, - "block_out_channels": [128, 256, 512, 512], - "down_block_types": [ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D" - ], - "up_block_types": [ - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D" - ], - "scaling_factor": 0.3611, - "shift_factor": 0.1159, - "use_post_quant_conv": False, - "use_quant_conv": False - } - else: # dc-ae - vae_config = { - "_class_name": "AutoencoderDC", - "latent_channels": 32, - "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], - "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024], - "encoder_block_types": [ - "ResBlock", - "ResBlock", - "ResBlock", - "EfficientViTBlock", - "EfficientViTBlock", - "EfficientViTBlock" - ], - "decoder_block_types": [ - "ResBlock", - "ResBlock", - "ResBlock", - "EfficientViTBlock", - "EfficientViTBlock", - "EfficientViTBlock" - ], - "encoder_layers_per_block": [2, 2, 2, 3, 3, 3], - "decoder_layers_per_block": [3, 3, 3, 3, 3, 3], - "encoder_qkv_multiscales": [[], [], [], [5], [5], [5]], - "decoder_qkv_multiscales": [[], [], [], [5], [5], [5]], - "scaling_factor": 0.41407, - "upsample_block_type": "interpolate" - } - - vae_path = os.path.join(output_path, "vae") - os.makedirs(vae_path, exist_ok=True) - - with open(os.path.join(vae_path, "config.json"), "w") as f: - json.dump(vae_config, f, indent=2) - - print("✓ Created VAE config") - - -def create_text_encoder_config(output_path: str): - """Create T5GemmaEncoder config.""" - - text_encoder_config = { - "model_name": "google/t5gemma-2b-2b-ul2", - "model_max_length": 256, - "use_attn_mask": True, - "use_last_hidden_state": True - } - - text_encoder_path = os.path.join(output_path, "text_encoder") - os.makedirs(text_encoder_path, exist_ok=True) - - with open(os.path.join(text_encoder_path, "config.json"), "w") as f: - json.dump(text_encoder_config, f, indent=2) - - print("✓ Created text encoder config") - - -def create_tokenizer_config(output_path: str): - """Create GemmaTokenizerFast config and files.""" - - tokenizer_config = { - "add_bos_token": False, - "add_eos_token": False, - "added_tokens_decoder": { - "0": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "1": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "2": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "3": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "106": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True}, - "107": {"content": "", "lstrip": False, "normalized": False, "rstrip": False, "single_word": False, "special": True} - }, - "additional_special_tokens": ["", ""], - "bos_token": "", - "clean_up_tokenization_spaces": False, - "eos_token": "", - "extra_special_tokens": {}, - "model_max_length": 256, - "pad_token": "", - "padding_side": "right", - "sp_model_kwargs": {}, - 
"spaces_between_special_tokens": False, - "tokenizer_class": "GemmaTokenizer", - "unk_token": "", - "use_default_system_prompt": False - } - - special_tokens_map = { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "" - } - - tokenizer_path = os.path.join(output_path, "tokenizer") - os.makedirs(tokenizer_path, exist_ok=True) - - with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w") as f: - json.dump(tokenizer_config, f, indent=2) - - with open(os.path.join(tokenizer_path, "special_tokens_map.json"), "w") as f: - json.dump(special_tokens_map, f, indent=2) - - print("✓ Created tokenizer config (Note: tokenizer.json and tokenizer.model files need to be provided separately)") - - -def create_pipeline_components(vae_type: str, output_path: str): - """Create all pipeline components with proper configs.""" - - create_scheduler_config(output_path) - create_vae_config(vae_type, output_path) - create_text_encoder_config(output_path) - create_tokenizer_config(output_path) def create_model_index(vae_type: str, output_path: str): - """Create model_index.json for the pipeline.""" + """Create model_index.json for the pipeline with HuggingFace model references.""" if vae_type == "flux": - vae_class = "AutoencoderKL" + vae_model_name = "black-forest-labs/FLUX.1-dev" + vae_subfolder = "vae" else: # dc-ae - vae_class = "AutoencoderDC" + vae_model_name = "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers" + vae_subfolder = None + + # Text encoder and tokenizer always use T5Gemma + text_model_name = "google/t5gemma-2b-2b-ul2" model_index = { "_class_name": "MiragePipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], - "text_encoder": ["transformers", "T5GemmaEncoder"], - "tokenizer": ["transformers", "GemmaTokenizerFast"], + "text_encoder": text_model_name, + "tokenizer": text_model_name, "transformer": ["diffusers", "MirageTransformer2DModel"], - "vae": ["diffusers", vae_class], + "vae": vae_model_name, + "vae_subfolder": vae_subfolder, } model_index_path = os.path.join(output_path, "model_index.json") with open(model_index_path, "w") as f: json.dump(model_index, f, indent=2) - print("✓ Created model_index.json") - - def main(args): # Validate inputs if not os.path.exists(args.checkpoint_path): @@ -389,10 +247,8 @@ def main(args): save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) print(f"✓ Saved transformer to {transformer_path}") - # Create other pipeline components - create_pipeline_components(args.vae_type, args.output_path) + create_scheduler_config(args.output_path) - # Create model index create_model_index(args.vae_type, args.output_path) # Verify the pipeline can be loaded diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index c4a4783c5f38..e6a13ff226cd 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -247,26 +247,61 @@ class MiragePipeline( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): """ - Override from_pretrained to ensure T5GemmaEncoder is available for loading. + Override from_pretrained to load VAE and text encoder from HuggingFace models. 
- This ensures that T5GemmaEncoder from transformers is accessible in the module namespace - during component loading, which is required for MiragePipeline checkpoints that use - T5GemmaEncoder as the text encoder. + The MiragePipeline checkpoints only store transformer and scheduler locally. + VAE and text encoder are loaded from external HuggingFace models as specified + in model_index.json. """ - # Ensure T5GemmaEncoder is available for loading - import transformers + import json + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel + + model_index_path = os.path.join(pretrained_model_name_or_path, "model_index.json") + if not os.path.exists(model_index_path): + raise ValueError(f"model_index.json not found in {pretrained_model_name_or_path}") + + with open(model_index_path, "r") as f: + model_index = json.load(f) + + vae_model_name = model_index.get("vae") + vae_subfolder = model_index.get("vae_subfolder") + text_model_name = model_index.get("text_encoder") + tokenizer_model_name = model_index.get("tokenizer") + + logger.info(f"Loading VAE from {vae_model_name}...") + if "FLUX" in vae_model_name or "flux" in vae_model_name: + vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder=vae_subfolder) + else: # DC-AE + vae = AutoencoderDC.from_pretrained(vae_model_name) + + logger.info(f"Loading text encoder from {text_model_name}...") + t5gemma_model = T5GemmaModel.from_pretrained(text_model_name) + text_encoder = t5gemma_model.encoder + + logger.info(f"Loading tokenizer from {tokenizer_model_name}...") + tokenizer = GemmaTokenizerFast.from_pretrained(tokenizer_model_name) + tokenizer.model_max_length = 256 + + # Load transformer and scheduler from local checkpoint + logger.info(f"Loading transformer from {pretrained_model_name_or_path}...") + transformer = MirageTransformer2DModel.from_pretrained( + pretrained_model_name_or_path, subfolder="transformer" + ) - if not hasattr(transformers, "T5GemmaEncoder"): - try: - from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder + logger.info(f"Loading scheduler from {pretrained_model_name_or_path}...") + scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( + pretrained_model_name_or_path, subfolder="scheduler" + ) - transformers.T5GemmaEncoder = T5GemmaEncoder - except ImportError: - # T5GemmaEncoder not available in this transformers version - pass + pipeline = cls( + transformer=transformer, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + vae=vae, + ) - # Proceed with standard loading - return super().from_pretrained(pretrained_model_name_or_path, **kwargs) + return pipeline def __init__( self, @@ -283,11 +318,8 @@ def __init__( "MirageTransformer2DModel is not available. Please ensure the transformer_mirage module is properly installed." 
) - # Store standard components self.text_encoder = text_encoder self.tokenizer = tokenizer - - # Initialize text preprocessor self.text_preprocessor = TextPreprocessor() self.register_modules( From b0d965cc508b447569912f0747dfc5b4746e2d6b Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 20:56:51 +0000 Subject: [PATCH 58/69] move default height and width in checkpoint config --- scripts/convert_mirage_to_diffusers.py | 9 +++++++++ .../pipelines/mirage/pipeline_mirage.py | 16 ++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_mirage_to_diffusers.py index 2ddb708bc704..37de253d1448 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_mirage_to_diffusers.py @@ -19,6 +19,9 @@ from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel from diffusers.pipelines.mirage import MiragePipeline +DEFAULT_HEIGHT = 512 +DEFAULT_WIDTH = 512 + @dataclass(frozen=True) class MirageBase: context_in_dim: int = 2304 @@ -197,9 +200,13 @@ def create_model_index(vae_type: str, output_path: str): if vae_type == "flux": vae_model_name = "black-forest-labs/FLUX.1-dev" vae_subfolder = "vae" + default_height = DEFAULT_HEIGHT + default_width = DEFAULT_WIDTH else: # dc-ae vae_model_name = "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers" vae_subfolder = None + default_height = DEFAULT_HEIGHT + default_width = DEFAULT_WIDTH # Text encoder and tokenizer always use T5Gemma text_model_name = "google/t5gemma-2b-2b-ul2" @@ -214,6 +221,8 @@ def create_model_index(vae_type: str, output_path: str): "transformer": ["diffusers", "MirageTransformer2DModel"], "vae": vae_model_name, "vae_subfolder": vae_subfolder, + "default_height": default_height, + "default_width": default_width, } model_index_path = os.path.join(output_path, "model_index.json") diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index e6a13ff226cd..9d247eecbd7f 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -31,6 +31,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderDC, AutoencoderKL +from ...models.transformers.transformer_mirage import seq2img from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -46,6 +47,9 @@ except ImportError: MirageTransformer2DModel = None +DEFAULT_HEIGHT = 512 +DEFAULT_WIDTH = 512 + logger = logging.get_logger(__name__) @@ -267,6 +271,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P vae_subfolder = model_index.get("vae_subfolder") text_model_name = model_index.get("text_encoder") tokenizer_model_name = model_index.get("tokenizer") + default_height = model_index.get("default_height", DEFAULT_HEIGHT) + default_width = model_index.get("default_width", DEFAULT_WIDTH) logger.info(f"Loading VAE from {vae_model_name}...") if "FLUX" in vae_model_name or "flux" in vae_model_name: @@ -301,6 +307,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P vae=vae, ) + # Store default dimensions as pipeline attributes + pipeline.default_height = default_height + pipeline.default_width = default_width + return pipeline def __init__( @@ -558,8 +568,8 @@ def __call__( """ # 0. 
Default height and width to transformer config - height = height or 256 - width = width or 256 + height = height or getattr(self, 'default_height', DEFAULT_HEIGHT) + width = width or getattr(self, 'default_width', DEFAULT_WIDTH) # 1. Check inputs self.check_inputs( @@ -642,8 +652,6 @@ def __call__( ) # Convert back to image format - from ...models.transformers.transformer_mirage import seq2img - noise_both = seq2img(img_seq, self.transformer.patch_size, latents_in.shape) # Apply CFG From 235fe491fe46dcfca7da299a57b48adbe9ad1c96 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:26:03 +0000 Subject: [PATCH 59/69] add docstrings --- .../models/transformers/transformer_mirage.py | 367 +++++++++++++++++- .../test_models_transformer_mirage.py | 6 +- 2 files changed, 351 insertions(+), 22 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index 923d44d4f1ec..c509f797fb8b 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -33,20 +33,70 @@ logger = logging.get_logger(__name__) -def get_image_ids(bs: int, h: int, w: int, patch_size: int, device: torch.device) -> Tensor: - img_ids = torch.zeros(h // patch_size, w // patch_size, 2, device=device) - img_ids[..., 0] = torch.arange(h // patch_size, device=device)[:, None] - img_ids[..., 1] = torch.arange(w // patch_size, device=device)[None, :] - return img_ids.reshape((h // patch_size) * (w // patch_size), 2).unsqueeze(0).repeat(bs, 1, 1) +def get_image_ids(batch_size: int, height: int, width: int, patch_size: int, device: torch.device) -> Tensor: + r""" + Generates 2D patch coordinate indices for a batch of images. + + Parameters: + batch_size (`int`): + Number of images in the batch. + height (`int`): + Height of the input images (in pixels). + width (`int`): + Width of the input images (in pixels). + patch_size (`int`): + Size of the square patches that the image is divided into. + device (`torch.device`): + The device on which to create the tensor. + + Returns: + `torch.Tensor`: + Tensor of shape `(batch_size, num_patches, 2)` containing the (row, col) + coordinates of each patch in the image grid. + """ + + img_ids = torch.zeros(height // patch_size, width // patch_size, 2, device=device) + img_ids[..., 0] = torch.arange(height // patch_size, device=device)[:, None] + img_ids[..., 1] = torch.arange(width // patch_size, device=device)[None, :] + return img_ids.reshape((height // patch_size) * (width // patch_size), 2).unsqueeze(0).repeat(batch_size, 1, 1) def apply_rope(xq: Tensor, freqs_cis: Tensor) -> Tensor: + r""" + Applies rotary positional embeddings (RoPE) to a query tensor. + + Parameters: + xq (`torch.Tensor`): + Input tensor of shape `(..., dim)` representing the queries. + freqs_cis (`torch.Tensor`): + Precomputed rotary frequency components of shape `(..., dim/2, 2)` + containing cosine and sine pairs. + + Returns: + `torch.Tensor`: + Tensor of the same shape as `xq` with rotary embeddings applied. + """ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] return xq_out.reshape(*xq.shape).type_as(xq) class EmbedND(nn.Module): + r""" + N-dimensional rotary positional embedding. + + This module creates rotary embeddings (RoPE) across multiple axes, where each + axis can have its own embedding dimension. 
The embeddings are combined and + returned as a single tensor + + Parameters: + dim (int): + Base embedding dimension (must be even). + theta (int): + Scaling factor that controls the frequency spectrum of the rotary embeddings. + axes_dim (list[int]): + List of embedding dimensions for each axis (each must be even). + """ def __init__(self, dim: int, theta: int, axes_dim: list[int]): super().__init__() self.dim = dim @@ -73,6 +123,19 @@ def forward(self, ids: Tensor) -> Tensor: class MLPEmbedder(nn.Module): + r""" + A simple 2-layer MLP used for embedding inputs. + + Parameters: + in_dim (`int`): + Dimensionality of the input features. + hidden_dim (`int`): + Dimensionality of the hidden and output embedding space. + + Returns: + `torch.Tensor`: + Tensor of shape `(..., hidden_dim)` containing the embedded representations. + """ def __init__(self, in_dim: int, hidden_dim: int): super().__init__() self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) @@ -84,6 +147,19 @@ def forward(self, x: Tensor) -> Tensor: class QKNorm(torch.nn.Module): + r""" + Applies RMS normalization to query and key tensors separately before attention + which can help stabilize training and improve numerical precision. + + Parameters: + dim (`int`): + Dimensionality of the query and key vectors. + + Returns: + (`torch.Tensor`, `torch.Tensor`): + A tuple `(q, k)` where both are normalized and cast to the same dtype + as the value tensor `v`. + """ def __init__(self, dim: int): super().__init__() self.query_norm = RMSNorm(dim, eps=1e-6) @@ -103,6 +179,22 @@ class ModulationOut: class Modulation(nn.Module): + r""" + Modulation network that generates scale, shift, and gating parameters. + + Given an input vector, the module projects it through a linear layer to + produce six chunks, which are grouped into two `ModulationOut` objects. + + Parameters: + dim (`int`): + Dimensionality of the input vector. The output will have `6 * dim` + features internally. + + Returns: + (`ModulationOut`, `ModulationOut`): + A tuple of two modulation outputs. Each `ModulationOut` contains + three components (e.g., scale, shift, gate). + """ def __init__(self, dim: int): super().__init__() self.lin = nn.Linear(dim, 6 * dim, bias=True) @@ -115,6 +207,68 @@ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut]: class MirageBlock(nn.Module): + r""" + Multimodal transformer block with text–image cross-attention, modulation, and MLP. + + Parameters: + hidden_size (`int`): + Dimension of the hidden representations. + num_heads (`int`): + Number of attention heads. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Expansion ratio for the hidden dimension inside the MLP. + qk_scale (`float`, *optional*): + Scale factor for queries and keys. If not provided, defaults to + ``head_dim**-0.5``. + + Attributes: + img_pre_norm (`nn.LayerNorm`): + Pre-normalization applied to image tokens before QKV projection. + img_qkv_proj (`nn.Linear`): + Linear projection to produce image queries, keys, and values. + qk_norm (`QKNorm`): + RMS normalization applied separately to image queries and keys. + txt_kv_proj (`nn.Linear`): + Linear projection to produce text keys and values. + k_norm (`RMSNorm`): + RMS normalization applied to text keys. + attention (`Attention`): + Multi-head attention module for cross-attention between image, text, + and optional spatial conditioning tokens. + post_attention_layernorm (`nn.LayerNorm`): + Normalization applied after attention. 
+ gate_proj / up_proj / down_proj (`nn.Linear`): + Feedforward layers forming the gated MLP. + mlp_act (`nn.GELU`): + Nonlinear activation used in the MLP. + modulation (`Modulation`): + Produces scale/shift/gating parameters for modulated layers. + spatial_cond_kv_proj (`nn.Linear`, *optional*): + Projection for optional spatial conditioning tokens. + + Methods: + attn_forward(img, txt, pe, modulation, spatial_conditioning=None, attention_mask=None): + Compute cross-attention between image and text tokens, with optional + spatial conditioning and attention masking. + + Parameters: + img (`torch.Tensor`): + Image tokens of shape `(B, L_img, hidden_size)`. + txt (`torch.Tensor`): + Text tokens of shape `(B, L_txt, hidden_size)`. + pe (`torch.Tensor`): + Rotary positional embeddings to apply to queries and keys. + modulation (`ModulationOut`): + Scale and shift parameters for modulating image tokens. + spatial_conditioning (`torch.Tensor`, *optional*): + Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. + attention_mask (`torch.Tensor`, *optional*): + Boolean mask of shape `(B, L_txt)` where 0 marks padding. + + Returns: + `torch.Tensor`: + Attention output of shape `(B, L_img, hidden_size)`. + """ def __init__( self, hidden_size: int, @@ -163,7 +317,7 @@ def __init__( self.modulation = Modulation(hidden_size) self.spatial_cond_kv_proj: None | nn.Linear = None - def attn_forward( + def _attn_forward( self, img: Tensor, txt: Tensor, @@ -236,7 +390,7 @@ def attn_forward( return attn - def ffn_forward(self, x: Tensor, modulation: ModulationOut) -> Tensor: + def _ffn_forward(self, x: Tensor, modulation: ModulationOut) -> Tensor: x = (1 + modulation.scale) * self.post_attention_layernorm(x) + modulation.shift return self.down_proj(self.mlp_act(self.gate_proj(x)) * self.up_proj(x)) @@ -250,9 +404,36 @@ def forward( attention_mask: Tensor | None = None, **_: dict[str, Any], ) -> Tensor: + r""" + Runs modulation-gated cross-attention and MLP, with residual connections. + + Parameters: + img (`torch.Tensor`): + Image tokens of shape `(B, L_img, hidden_size)`. + txt (`torch.Tensor`): + Text tokens of shape `(B, L_txt, hidden_size)`. + vec (`torch.Tensor`): + Conditioning vector used by `Modulation` to produce scale/shift/gates, + shape `(B, hidden_size)` (or broadcastable). + pe (`torch.Tensor`): + Rotary positional embeddings applied inside attention. + spatial_conditioning (`torch.Tensor`, *optional*): + Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. Used only + if spatial conditioning is enabled in the block. + attention_mask (`torch.Tensor`, *optional*): + Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding. + **_: + Ignored additional keyword arguments for API compatibility. + + Returns: + `torch.Tensor`: + Updated image tokens of shape `(B, L_img, hidden_size)`. + """ + + mod_attn, mod_mlp = self.modulation(vec) - img = img + mod_attn.gate * self.attn_forward( + img = img + mod_attn.gate * self._attn_forward( img, txt, pe, @@ -260,12 +441,39 @@ def forward( spatial_conditioning=spatial_conditioning, attention_mask=attention_mask, ) - img = img + mod_mlp.gate * self.ffn_forward(img, mod_mlp) + img = img + mod_mlp.gate * self._ffn_forward(img, mod_mlp) return img class LastLayer(nn.Module): + r""" + Final projection layer with adaptive LayerNorm modulation. + + This layer applies a normalized and modulated transformation to input tokens + and projects them into patch-level outputs. 
+ + Parameters: + hidden_size (`int`): + Dimensionality of the input tokens. + patch_size (`int`): + Size of the square image patches. + out_channels (`int`): + Number of output channels per pixel (e.g. RGB = 3). + + Forward Inputs: + x (`torch.Tensor`): + Input tokens of shape `(B, L, hidden_size)`, where `L` is the number of patches. + vec (`torch.Tensor`): + Conditioning vector of shape `(B, hidden_size)` used to generate + shift and scale parameters for adaptive LayerNorm. + + Returns: + `torch.Tensor`: + Projected patch outputs of shape `(B, L, patch_size * patch_size * out_channels)`. + """ + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) @@ -284,12 +492,41 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor: def img2seq(img: Tensor, patch_size: int) -> Tensor: - """Flatten an image into a sequence of patches""" + r""" + Flattens an image tensor into a sequence of non-overlapping patches. + + Parameters: + img (`torch.Tensor`): + Input image tensor of shape `(B, C, H, W)`. + patch_size (`int`): + Size of each square patch. Must evenly divide both `H` and `W`. + + Returns: + `torch.Tensor`: + Flattened patch sequence of shape `(B, L, C * patch_size * patch_size)`, + where `L = (H // patch_size) * (W // patch_size)` is the number of patches. + """ return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2) def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: - """Revert img2seq""" + r""" + Reconstructs an image tensor from a sequence of patches (inverse of `img2seq`). + + Parameters: + seq (`torch.Tensor`): + Patch sequence of shape `(B, L, C * patch_size * patch_size)`, + where `L = (H // patch_size) * (W // patch_size)`. + patch_size (`int`): + Size of each square patch. + shape (`tuple` or `torch.Tensor`): + The original image spatial shape `(H, W)`. If a tensor is provided, + the first two values are interpreted as height and width. + + Returns: + `torch.Tensor`: + Reconstructed image tensor of shape `(B, C, H, W)`. + """ if isinstance(shape, tuple): shape = shape[-2:] elif isinstance(shape, torch.Tensor): @@ -300,7 +537,70 @@ def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: class MirageTransformer2DModel(ModelMixin, ConfigMixin): - """Mirage Transformer model with IP-Adapter support.""" + r""" + Transformer-based 2D model for text to image generation. + It supports attention processor injection and LoRA scaling. + + Parameters: + in_channels (`int`, *optional*, defaults to 16): + Number of input channels in the latent image. + patch_size (`int`, *optional*, defaults to 2): + Size of the square patches used to flatten the input image. + context_in_dim (`int`, *optional*, defaults to 2304): + Dimensionality of the text conditioning input. + hidden_size (`int`, *optional*, defaults to 1792): + Dimension of the hidden representation. + mlp_ratio (`float`, *optional*, defaults to 3.5): + Expansion ratio for the hidden dimension inside MLP blocks. + num_heads (`int`, *optional*, defaults to 28): + Number of attention heads. + depth (`int`, *optional*, defaults to 16): + Number of transformer blocks. + axes_dim (`list[int]`, *optional*): + List of dimensions for each positional embedding axis. Defaults to `[32, 32]`. + theta (`int`, *optional*, defaults to 10000): + Frequency scaling factor for rotary embeddings. 
+ time_factor (`float`, *optional*, defaults to 1000.0): + Scaling factor applied in timestep embeddings. + time_max_period (`int`, *optional*, defaults to 10000): + Maximum frequency period for timestep embeddings. + conditioning_block_ids (`list[int]`, *optional*): + Indices of blocks that receive conditioning. Defaults to all blocks. + **kwargs: + Additional keyword arguments forwarded to the config. + + Attributes: + pe_embedder (`EmbedND`): + Multi-axis rotary embedding generator for positional encodings. + img_in (`nn.Linear`): + Projection layer for image patch tokens. + time_in (`MLPEmbedder`): + Embedding layer for timestep embeddings. + txt_in (`nn.Linear`): + Projection layer for text conditioning. + blocks (`nn.ModuleList`): + Stack of transformer blocks (`MirageBlock`). + final_layer (`LastLayer`): + Projection layer mapping hidden tokens back to patch outputs. + + Methods: + attn_processors: + Returns a dictionary of all attention processors in the model. + set_attn_processor(processor): + Replaces attention processors across all attention layers. + process_inputs(image_latent, txt): + Converts inputs into patch tokens, encodes text, and produces positional encodings. + compute_timestep_embedding(timestep, dtype): + Creates a timestep embedding of dimension 256, scaled and projected. + forward_transformers(image_latent, cross_attn_conditioning, timestep, time_embedding, attention_mask, **block_kwargs): + Runs the sequence of transformer blocks over image and text tokens. + forward(image_latent, timestep, cross_attn_conditioning, micro_conditioning, cross_attn_mask=None, attention_kwargs=None, return_dict=True): + Full forward pass from latent input to reconstructed output image. + + Returns: + `Transformer2DModelOutput` if `return_dict=True` (default), otherwise a tuple containing: + - `sample` (`torch.Tensor`): Reconstructed image of shape `(B, C, H, W)`. 
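To make the documented interface concrete, here is a small smoke-test sketch that exercises the forward pass. It is not taken from the patch: the tiny config values, the dummy `micro_conditioning` tensor, and the assumption that the constructor accepts these keyword overrides exactly as the docstring above describes are all illustrative; real checkpoints use the much larger defaults (hidden_size=1792, depth=16, and so on).

# Minimal forward-pass sketch with a deliberately tiny config so it runs on CPU.
import torch
from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel

model = MirageTransformer2DModel(
    in_channels=16, patch_size=2,              # flux-VAE latent layout
    context_in_dim=64, hidden_size=128,
    num_heads=4, depth=2, axes_dim=[16, 16],   # sum(axes_dim) == hidden_size // num_heads
)
model.eval()

latents = torch.randn(1, 16, 32, 32)           # (B, C, H, W), H and W divisible by patch_size
timestep = torch.tensor([500.0])
text_ctx = torch.randn(1, 8, 64)               # (B, L_txt, context_in_dim)
mask = torch.ones(1, 8, dtype=torch.bool)      # 0 marks padded text positions

with torch.no_grad():
    out = model(
        image_latent=latents,
        timestep=timestep,
        cross_attn_conditioning=text_ctx,
        micro_conditioning=torch.zeros(1),     # accepted but currently unused per the docstring
        cross_attn_mask=mask,
    )
print(out.sample.shape)                        # expected: torch.Size([1, 16, 32, 32])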
+ """ config_name = "config.json" _supports_gradient_checkpointing = True @@ -424,8 +724,8 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) - def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: - """Timestep independent stuff""" + def _process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: + txt = self.txt_in(txt) img = img2seq(image_latent, self.patch_size) bs, _, h, w = image_latent.shape @@ -433,7 +733,7 @@ def process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[T pe = self.pe_embedder(img_ids) return img, txt, pe - def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: + def _compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor: return self.time_in( get_timestep_embedding( timesteps=timestep, @@ -444,7 +744,7 @@ def compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Te ).to(dtype) ) - def forward_transformers( + def _forward_transformers( self, image_latent: Tensor, cross_attn_conditioning: Tensor, @@ -460,7 +760,7 @@ def forward_transformers( else: if timestep is None: raise ValueError("Please provide either a timestep or a timestep_embedding") - vec = self.compute_timestep_embedding(timestep, dtype=img.dtype) + vec = self._compute_timestep_embedding(timestep, dtype=img.dtype) for block in self.blocks: img = block(img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs) @@ -478,6 +778,35 @@ def forward( attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: + r""" + Forward pass of the MirageTransformer2DModel. + + The latent image is split into patch tokens, combined with text conditioning, + and processed through a stack of transformer blocks modulated by the timestep. + The output is reconstructed into the latent image space. + + Parameters: + image_latent (`torch.Tensor`): + Input latent image tensor of shape `(B, C, H, W)`. + timestep (`torch.Tensor`): + Timestep tensor of shape `(B,)` or `(1,)`, used for temporal conditioning. + cross_attn_conditioning (`torch.Tensor`): + Text conditioning tensor of shape `(B, L_txt, context_in_dim)`. + micro_conditioning (`torch.Tensor`): + Extra conditioning vector (currently unused, reserved for future use). + cross_attn_mask (`torch.Tensor`, *optional*): + Boolean mask of shape `(B, L_txt)`, where `0` marks padding in the text sequence. + attention_kwargs (`dict`, *optional*): + Additional arguments passed to attention layers. If using the PEFT backend, + the key `"scale"` controls LoRA scaling (default: 1.0). + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a `Transformer2DModelOutput` or a tuple. + + Returns: + `Transformer2DModelOutput` if `return_dict=True`, otherwise a tuple: + + - `sample` (`torch.Tensor`): Output latent image of shape `(B, C, H, W)`. + """ if attention_kwargs is not None: attention_kwargs = attention_kwargs.copy() lora_scale = attention_kwargs.pop("scale", 1.0) @@ -491,8 +820,8 @@ def forward( logger.warning( "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective." 
) - img_seq, txt, pe = self.process_inputs(image_latent, cross_attn_conditioning) - img_seq = self.forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask) + img_seq, txt, pe = self._process_inputs(image_latent, cross_attn_conditioning) + img_seq = self._forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask) output = seq2img(img_seq, self.patch_size, image_latent.shape) if USE_PEFT_BACKEND: # remove `lora_scale` from each PEFT layer diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_mirage.py index 0085627aa7e4..fe7436debc4c 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_mirage.py @@ -132,7 +132,7 @@ def test_process_inputs(self): model.eval() with torch.no_grad(): - img_seq, txt, pe = model.process_inputs( + img_seq, txt, pe = model._process_inputs( inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) @@ -160,12 +160,12 @@ def test_forward_transformers(self): with torch.no_grad(): # Process inputs first - img_seq, txt, pe = model.process_inputs( + img_seq, txt, pe = model._process_inputs( inputs_dict["image_latent"], inputs_dict["cross_attn_conditioning"] ) # Test forward_transformers - output_seq = model.forward_transformers(img_seq, txt, timestep=inputs_dict["timestep"], pe=pe) + output_seq = model._forward_transformers(img_seq, txt, timestep=inputs_dict["timestep"], pe=pe) # Check output shape expected_out_channels = init_dict["in_channels"] * init_dict["patch_size"] ** 2 From a6ff5799588b0ec3181a1f133e99bde4e31077cb Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:28:43 +0000 Subject: [PATCH 60/69] if conditions and raised as ValueError instead of asserts --- .../models/transformers/transformer_mirage.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_mirage.py index c509f797fb8b..90ba11fb2d24 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_mirage.py @@ -360,10 +360,12 @@ def _attn_forward( bs, _, l_img, _ = img_q.shape l_txt = txt_k.shape[2] - assert attention_mask.dim() == 2, f"Unsupported attention_mask shape: {attention_mask.shape}" - assert attention_mask.shape[-1] == l_txt, ( - f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" - ) + if attention_mask.dim() != 2: + raise ValueError(f"Unsupported attention_mask shape: {attention_mask.shape}") + if attention_mask.shape[-1] != l_txt: + raise ValueError( + f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" + ) device = img_q.device From 3a915039a14ae793f2debaff06d9ab21b5e8a23d Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:33:28 +0000 Subject: [PATCH 61/69] small fix --- src/diffusers/pipelines/mirage/pipeline_mirage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index 9d247eecbd7f..50304ae1a3ad 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -640,13 +640,13 @@ def __call__( t_cont = (t.float() / self.scheduler.config.num_train_timesteps).view(1).repeat(2).to(device) # Process inputs for 
transformer - img_seq, txt, pe = self.transformer.process_inputs(latents_in, ca_embed) + img_seq, txt, pe = self.transformer._process_inputs(latents_in, ca_embed) # Forward through transformer layers - img_seq = self.transformer.forward_transformers( + img_seq = self.transformer._forward_transformers( img_seq, txt, - time_embedding=self.transformer.compute_timestep_embedding(t_cont, img_seq.dtype), + time_embedding=self.transformer._compute_timestep_embedding(t_cont, img_seq.dtype), pe=pe, attention_mask=ca_mask, ) From e200cf64f4bc91dbe03d61149c7fd0d87b3c6659 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:34:47 +0000 Subject: [PATCH 62/69] nit remove try block at import --- src/diffusers/pipelines/mirage/pipeline_mirage.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/mirage/pipeline_mirage.py index 50304ae1a3ad..ced78adec786 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/mirage/pipeline_mirage.py @@ -31,7 +31,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderDC, AutoencoderKL -from ...models.transformers.transformer_mirage import seq2img +from ...models.transformers.transformer_mirage import MirageTransformer2DModel, seq2img from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -42,11 +42,6 @@ from .pipeline_output import MiragePipelineOutput -try: - from ...models.transformers.transformer_mirage import MirageTransformer2DModel -except ImportError: - MirageTransformer2DModel = None - DEFAULT_HEIGHT = 512 DEFAULT_WIDTH = 512 From 2ea8976a82bf25b0bd4ac4d5b8a20b3bc2a24b41 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 30 Sep 2025 21:35:16 +0000 Subject: [PATCH 63/69] mirage pipeline doc --- docs/source/en/api/pipelines/mirage.md | 158 +++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 docs/source/en/api/pipelines/mirage.md diff --git a/docs/source/en/api/pipelines/mirage.md b/docs/source/en/api/pipelines/mirage.md new file mode 100644 index 000000000000..3383bbecae2a --- /dev/null +++ b/docs/source/en/api/pipelines/mirage.md @@ -0,0 +1,158 @@ + + +# MiragePipeline + +
+ LoRA +
+ +Mirage is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. + +Key features: + +- **Transformer Architecture**: Uses a modern transformer-based denoising model with attention mechanisms optimized for image generation +- **Flow Matching**: Employs flow matching with Euler discrete scheduling for efficient sampling +- **Flexible VAE Support**: Compatible with both Flux VAE (8x compression, 16 latent channels) and DC-AE (32x compression, 32 latent channels) +- **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding with strong text-image alignment +- **Efficient Architecture**: ~1.3B parameters in the transformer, enabling fast inference while maintaining quality +- **Modular Design**: Text encoder and VAE weights are loaded from HuggingFace, keeping checkpoint sizes small + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. + + + +## Loading the Pipeline + +Mirage checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization: + +```py +from diffusers import MiragePipeline + +# Load pipeline - VAE and text encoder will be loaded from HuggingFace +pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") +pipe.to("cuda") + +prompt = "A digital painting of a rusty, vintage tram on a sandy beach" +image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] +image.save("mirage_output.png") +``` + +### Manual Component Loading + +You can also load components individually: + +```py +import torch +from diffusers import MiragePipeline +from diffusers.models import AutoencoderKL, AutoencoderDC +from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from transformers import T5GemmaModel, GemmaTokenizerFast + +# Load transformer +transformer = MirageTransformer2DModel.from_pretrained( + "path/to/checkpoint", subfolder="transformer" +) + +# Load scheduler +scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( + "path/to/checkpoint", subfolder="scheduler" +) + +# Load T5Gemma text encoder +t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") +text_encoder = t5gemma_model.encoder +tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") + +# Load VAE - choose either Flux VAE or DC-AE +# Flux VAE (16 latent channels): +vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") +# Or DC-AE (32 latent channels): +# vae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers") + +pipe = MiragePipeline( + transformer=transformer, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + vae=vae +) +pipe.to("cuda") +``` + +## VAE Variants + +Mirage supports two VAE configurations: + +### Flux VAE (AutoencoderKL) +- **Compression**: 8x spatial compression +- **Latent channels**: 16 +- **Model**: `black-forest-labs/FLUX.1-dev` (subfolder: "vae") +- 
**Use case**: Balanced quality and speed + +### DC-AE (AutoencoderDC) +- **Compression**: 32x spatial compression +- **Latent channels**: 32 +- **Model**: `mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers` +- **Use case**: Higher compression for faster processing + +The VAE type is automatically determined from the checkpoint's `model_index.json` configuration. + +## Generation Parameters + +Key parameters for image generation: + +- **num_inference_steps**: Number of denoising steps (default: 28). More steps generally improve quality at the cost of speed. +- **guidance_scale**: Classifier-free guidance strength (default: 4.0). Higher values produce images more closely aligned with the prompt. +- **height/width**: Output image dimensions (default: 512x512). Can be customized in the checkpoint configuration. + +```py +# Example with custom parameters +image = pipe( + prompt="A serene mountain landscape at sunset", + num_inference_steps=28, + guidance_scale=4.0, + height=1024, + width=1024, + generator=torch.Generator("cuda").manual_seed(42) +).images[0] +``` + +## Memory Optimization + +For memory-constrained environments: + +```py +import torch +from diffusers import MiragePipeline + +pipe = MiragePipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() # Offload components to CPU when not in use + +# Or use sequential CPU offload for even lower memory +pipe.enable_sequential_cpu_offload() +``` + +## MiragePipeline + +[[autodoc]] MiragePipeline + - all + - __call__ + +## MiragePipelineOutput + +[[autodoc]] pipelines.mirage.pipeline_output.MiragePipelineOutput From 26429a370a34384562d772e9011cafc13bab8c07 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 7 Oct 2025 14:20:17 +0000 Subject: [PATCH 64/69] update doc --- docs/source/en/api/pipelines/mirage.md | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/docs/source/en/api/pipelines/mirage.md b/docs/source/en/api/pipelines/mirage.md index 3383bbecae2a..f0117795a989 100644 --- a/docs/source/en/api/pipelines/mirage.md +++ b/docs/source/en/api/pipelines/mirage.md @@ -22,18 +22,12 @@ Mirage is a text-to-image diffusion model using a transformer-based architecture Key features: -- **Transformer Architecture**: Uses a modern transformer-based denoising model with attention mechanisms optimized for image generation -- **Flow Matching**: Employs flow matching with Euler discrete scheduling for efficient sampling +- **Simplified MMDIT architecture**: Uses a simplified MMDIT architecture for image generation where text tokens are not updated through the transformer blocks +- **Flow Matching**: Employs flow matching with discrete scheduling for efficient sampling - **Flexible VAE Support**: Compatible with both Flux VAE (8x compression, 16 latent channels) and DC-AE (32x compression, 32 latent channels) -- **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding with strong text-image alignment +- **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding offering multiple language support - **Efficient Architecture**: ~1.3B parameters in the transformer, enabling fast inference while maintaining quality -- **Modular Design**: Text encoder and VAE weights are loaded from HuggingFace, keeping checkpoint sizes small - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across 
pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. - - ## Loading the Pipeline @@ -46,7 +40,7 @@ from diffusers import MiragePipeline pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") pipe.to("cuda") -prompt = "A digital painting of a rusty, vintage tram on a sandy beach" +prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light" image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] image.save("mirage_output.png") ``` @@ -123,11 +117,11 @@ Key parameters for image generation: ```py # Example with custom parameters image = pipe( - prompt="A serene mountain landscape at sunset", + prompt="A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light", num_inference_steps=28, guidance_scale=4.0, - height=1024, - width=1024, + height=512, + width=512, generator=torch.Generator("cuda").manual_seed(42) ).images[0] ``` From 0abe136648c938b690ec49eb2aa09fec5a75f318 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Tue, 7 Oct 2025 14:25:27 +0000 Subject: [PATCH 65/69] rename model to photon --- .../en/api/pipelines/{mirage.md => photon.md} | 34 ++++++++-------- ...sers.py => convert_photon_to_diffusers.py} | 34 ++++++++-------- src/diffusers/__init__.py | 2 +- src/diffusers/models/__init__.py | 2 +- src/diffusers/models/attention_processor.py | 20 +++++----- src/diffusers/models/transformers/__init__.py | 2 +- ...former_mirage.py => transformer_photon.py} | 14 +++---- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/mirage/__init__.py | 5 --- src/diffusers/pipelines/photon/__init__.py | 5 +++ .../{mirage => photon}/pipeline_output.py | 4 +- .../pipeline_photon.py} | 40 +++++++++---------- ...e.py => test_models_transformer_photon.py} | 14 +++---- 13 files changed, 89 insertions(+), 89 deletions(-) rename docs/source/en/api/pipelines/{mirage.md => photon.md} (86%) rename scripts/{convert_mirage_to_diffusers.py => convert_photon_to_diffusers.py} (92%) rename src/diffusers/models/transformers/{transformer_mirage.py => transformer_photon.py} (99%) delete mode 100644 src/diffusers/pipelines/mirage/__init__.py create mode 100644 src/diffusers/pipelines/photon/__init__.py rename src/diffusers/pipelines/{mirage => photon}/pipeline_output.py (93%) rename src/diffusers/pipelines/{mirage/pipeline_mirage.py => photon/pipeline_photon.py} (95%) rename tests/models/transformers/{test_models_transformer_mirage.py => test_models_transformer_photon.py} (95%) diff --git a/docs/source/en/api/pipelines/mirage.md b/docs/source/en/api/pipelines/photon.md similarity index 86% rename from docs/source/en/api/pipelines/mirage.md rename to docs/source/en/api/pipelines/photon.md index f0117795a989..f8f7098545f8 100644 --- a/docs/source/en/api/pipelines/mirage.md +++ b/docs/source/en/api/pipelines/photon.md @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. --> -# MiragePipeline +# PhotonPipeline
LoRA
-Mirage is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. +Photon is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. Key features: @@ -31,18 +31,18 @@ Key features: ## Loading the Pipeline -Mirage checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization: +Photon checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization: ```py -from diffusers import MiragePipeline +from diffusers import PhotonPipeline # Load pipeline - VAE and text encoder will be loaded from HuggingFace -pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") +pipe = PhotonPipeline.from_pretrained("path/to/photon_checkpoint") pipe.to("cuda") prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light" image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] -image.save("mirage_output.png") +image.save("photon_output.png") ``` ### Manual Component Loading @@ -51,14 +51,14 @@ You can also load components individually: ```py import torch -from diffusers import MiragePipeline +from diffusers import PhotonPipeline from diffusers.models import AutoencoderKL, AutoencoderDC -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from transformers import T5GemmaModel, GemmaTokenizerFast # Load transformer -transformer = MirageTransformer2DModel.from_pretrained( +transformer = PhotonTransformer2DModel.from_pretrained( "path/to/checkpoint", subfolder="transformer" ) @@ -78,7 +78,7 @@ vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="v # Or DC-AE (32 latent channels): # vae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers") -pipe = MiragePipeline( +pipe = PhotonPipeline( transformer=transformer, scheduler=scheduler, text_encoder=text_encoder, @@ -90,7 +90,7 @@ pipe.to("cuda") ## VAE Variants -Mirage supports two VAE configurations: +Photon supports two VAE configurations: ### Flux VAE (AutoencoderKL) - **Compression**: 8x spatial compression @@ -132,21 +132,21 @@ For memory-constrained environments: ```py import torch -from diffusers import MiragePipeline +from diffusers import PhotonPipeline -pipe = MiragePipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) +pipe = PhotonPipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) pipe.enable_model_cpu_offload() # Offload components to CPU when not in use # Or use sequential CPU offload for even lower memory pipe.enable_sequential_cpu_offload() ``` -## MiragePipeline +## PhotonPipeline -[[autodoc]] MiragePipeline +[[autodoc]] PhotonPipeline - all - __call__ -## MiragePipelineOutput +## PhotonPipelineOutput -[[autodoc]] 
pipelines.mirage.pipeline_output.MiragePipelineOutput +[[autodoc]] pipelines.photon.pipeline_output.PhotonPipelineOutput diff --git a/scripts/convert_mirage_to_diffusers.py b/scripts/convert_photon_to_diffusers.py similarity index 92% rename from scripts/convert_mirage_to_diffusers.py rename to scripts/convert_photon_to_diffusers.py index 37de253d1448..ad04463e019f 100644 --- a/scripts/convert_mirage_to_diffusers.py +++ b/scripts/convert_photon_to_diffusers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Script to convert Mirage checkpoint from original codebase to diffusers format. +Script to convert Photon checkpoint from original codebase to diffusers format. """ import argparse @@ -16,14 +16,14 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel -from diffusers.pipelines.mirage import MiragePipeline +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel +from diffusers.pipelines.photon import PhotonPipeline DEFAULT_HEIGHT = 512 DEFAULT_WIDTH = 512 @dataclass(frozen=True) -class MirageBase: +class PhotonBase: context_in_dim: int = 2304 hidden_size: int = 1792 mlp_ratio: float = 3.5 @@ -36,22 +36,22 @@ class MirageBase: @dataclass(frozen=True) -class MirageFlux(MirageBase): +class PhotonFlux(PhotonBase): in_channels: int = 16 patch_size: int = 2 @dataclass(frozen=True) -class MirageDCAE(MirageBase): +class PhotonDCAE(PhotonBase): in_channels: int = 32 patch_size: int = 1 def build_config(vae_type: str) -> dict: if vae_type == "flux": - cfg = MirageFlux() + cfg = PhotonFlux() elif vae_type == "dc-ae": - cfg = MirageDCAE() + cfg = PhotonDCAE() else: raise ValueError(f"Unsupported VAE type: {vae_type}. Use 'flux' or 'dc-ae'") @@ -125,8 +125,8 @@ def convert_checkpoint_parameters(old_state_dict: Dict[str, torch.Tensor], depth return converted_state_dict -def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> MirageTransformer2DModel: - """Create and load MirageTransformer2DModel from old checkpoint.""" +def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> PhotonTransformer2DModel: + """Create and load PhotonTransformer2DModel from old checkpoint.""" print(f"Loading checkpoint from: {checkpoint_path}") @@ -154,8 +154,8 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Mi converted_state_dict = convert_checkpoint_parameters(state_dict, depth=model_depth) # Create transformer with config - print("Creating MirageTransformer2DModel...") - transformer = MirageTransformer2DModel(**config) + print("Creating PhotonTransformer2DModel...") + transformer = PhotonTransformer2DModel(**config) # Load state dict print("Loading converted parameters...") @@ -212,13 +212,13 @@ def create_model_index(vae_type: str, output_path: str): text_model_name = "google/t5gemma-2b-2b-ul2" model_index = { - "_class_name": "MiragePipeline", + "_class_name": "PhotonPipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], "text_encoder": text_model_name, "tokenizer": text_model_name, - "transformer": ["diffusers", "MirageTransformer2DModel"], + "transformer": ["diffusers", "PhotonTransformer2DModel"], "vae": vae_model_name, "vae_subfolder": vae_subfolder, "default_height": default_height, @@ -262,7 +262,7 @@ def main(args): # Verify the pipeline can be loaded try: - pipeline = 
MiragePipeline.from_pretrained(args.output_path) + pipeline = PhotonPipeline.from_pretrained(args.output_path) print("Pipeline loaded successfully!") print(f"Transformer: {type(pipeline.transformer).__name__}") print(f"VAE: {type(pipeline.vae).__name__}") @@ -285,10 +285,10 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Convert Mirage checkpoint to diffusers format") + parser = argparse.ArgumentParser(description="Convert Photon checkpoint to diffusers format") parser.add_argument( - "--checkpoint_path", type=str, required=True, help="Path to the original Mirage checkpoint (.pth file)" + "--checkpoint_path", type=str, required=True, help="Path to the original Photon checkpoint (.pth file)" ) parser.add_argument( diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 6c419b6e7ad1..4eff8a27ff40 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -224,7 +224,7 @@ "LTXVideoTransformer3DModel", "Lumina2Transformer2DModel", "LuminaNextDiT2DModel", - "MirageTransformer2DModel", + "PhotonTransformer2DModel", "MochiTransformer3DModel", "ModelMixin", "MotionAdapter", diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 279e69216b1b..86e32c1eec3e 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -93,7 +93,7 @@ _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"] _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"] _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"] - _import_structure["transformers.transformer_mirage"] = ["MirageTransformer2DModel"] + _import_structure["transformers.transformer_photon"] = ["PhotonTransformer2DModel"] _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"] _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"] _import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"] diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index e4ab33be9784..47cf4fab4a5e 100755 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -5605,15 +5605,15 @@ def __new__(cls, *args, **kwargs): return processor -class MirageAttnProcessor2_0: +class PhotonAttnProcessor2_0: r""" - Processor for implementing Mirage-style attention with multi-source tokens and RoPE. - Properly integrates with diffusers Attention module while handling Mirage-specific logic. + Processor for implementing Photon-style attention with multi-source tokens and RoPE. + Properly integrates with diffusers Attention module while handling Photon-specific logic. """ def __init__(self): if not hasattr(torch.nn.functional, "scaled_dot_product_attention"): - raise ImportError("MirageAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.") + raise ImportError("PhotonAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.") def __call__( self, @@ -5625,9 +5625,9 @@ def __call__( **kwargs, ) -> torch.Tensor: """ - Apply Mirage attention using standard diffusers interface. + Apply Photon attention using standard diffusers interface. 
- Expected tensor formats from MirageBlock.attn_forward(): + Expected tensor formats from PhotonBlock.attn_forward(): - hidden_states: Image queries with RoPE applied [B, H, L_img, D] - encoder_hidden_states: Packed key+value tensors [B, H, L_all, 2*D] (concatenated keys and values from text + image + spatial conditioning) @@ -5636,15 +5636,15 @@ def __call__( if encoder_hidden_states is None: raise ValueError( - "MirageAttnProcessor2_0 requires 'encoder_hidden_states' containing packed key+value tensors. " - "This should be provided by MirageBlock.attn_forward()." + "PhotonAttnProcessor2_0 requires 'encoder_hidden_states' containing packed key+value tensors. " + "This should be provided by PhotonBlock.attn_forward()." ) # Unpack the combined key+value tensor # encoder_hidden_states is [B, H, L_all, 2*D] containing [keys, values] key, value = encoder_hidden_states.chunk(2, dim=-1) # Each [B, H, L_all, D] - # Apply scaled dot-product attention with Mirage's processed tensors + # Apply scaled dot-product attention with Photon's processed tensors # hidden_states is image queries [B, H, L_img, D] attn_output = torch.nn.functional.scaled_dot_product_attention( hidden_states.contiguous(), key.contiguous(), value.contiguous(), attn_mask=attention_mask @@ -5710,7 +5710,7 @@ def __call__( PAGHunyuanAttnProcessor2_0, PAGCFGHunyuanAttnProcessor2_0, LuminaAttnProcessor2_0, - MirageAttnProcessor2_0, + PhotonAttnProcessor2_0, FusedAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0, diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index ebe0d0c9b8e1..652f6d811393 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -29,7 +29,7 @@ from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel from .transformer_ltx import LTXVideoTransformer3DModel from .transformer_lumina2 import Lumina2Transformer2DModel - from .transformer_mirage import MirageTransformer2DModel + from .transformer_photon import PhotonTransformer2DModel from .transformer_mochi import MochiTransformer3DModel from .transformer_omnigen import OmniGenTransformer2DModel from .transformer_qwenimage import QwenImageTransformer2DModel diff --git a/src/diffusers/models/transformers/transformer_mirage.py b/src/diffusers/models/transformers/transformer_photon.py similarity index 99% rename from src/diffusers/models/transformers/transformer_mirage.py rename to src/diffusers/models/transformers/transformer_photon.py index 90ba11fb2d24..9ec6e9756c20 100644 --- a/src/diffusers/models/transformers/transformer_mirage.py +++ b/src/diffusers/models/transformers/transformer_photon.py @@ -23,7 +23,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers -from ..attention_processor import Attention, AttentionProcessor, MirageAttnProcessor2_0 +from ..attention_processor import Attention, AttentionProcessor, PhotonAttnProcessor2_0 from ..embeddings import get_timestep_embedding from ..modeling_outputs import Transformer2DModelOutput from ..modeling_utils import ModelMixin @@ -206,7 +206,7 @@ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut]: return ModulationOut(*out[:3]), ModulationOut(*out[3:]) -class MirageBlock(nn.Module): +class PhotonBlock(nn.Module): r""" Multimodal transformer block with text–image cross-attention, modulation, and MLP. 
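The packed key/value contract described for `PhotonAttnProcessor2_0` above can be exercised in isolation. The sketch below uses made-up shapes and plain `scaled_dot_product_attention`; it is not the library's processor, only an illustration of the unpack-and-attend step:

```py
import torch
import torch.nn.functional as F

B, H, D = 1, 4, 32          # batch, attention heads, head dim (illustrative values)
L_img, L_all = 16, 24       # image tokens, and text + image (+ conditioning) tokens

img_q = torch.randn(B, H, L_img, D)           # image queries, RoPE assumed already applied
packed_kv = torch.randn(B, H, L_all, 2 * D)   # [keys | values] concatenated on the last dim

key, value = packed_kv.chunk(2, dim=-1)       # each (B, H, L_all, D)
out = F.scaled_dot_product_attention(img_q, key, value)
print(out.shape)                              # torch.Size([1, 4, 16, 32])
```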
@@ -304,7 +304,7 @@ def __init__( dim_head=self.head_dim, bias=False, out_bias=False, - processor=MirageAttnProcessor2_0(), + processor=PhotonAttnProcessor2_0(), ) # mlp @@ -538,7 +538,7 @@ def seq2img(seq: Tensor, patch_size: int, shape: Tensor) -> Tensor: return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size) -class MirageTransformer2DModel(ModelMixin, ConfigMixin): +class PhotonTransformer2DModel(ModelMixin, ConfigMixin): r""" Transformer-based 2D model for text to image generation. It supports attention processor injection and LoRA scaling. @@ -581,7 +581,7 @@ class MirageTransformer2DModel(ModelMixin, ConfigMixin): txt_in (`nn.Linear`): Projection layer for text conditioning. blocks (`nn.ModuleList`): - Stack of transformer blocks (`MirageBlock`). + Stack of transformer blocks (`PhotonBlock`). final_layer (`LastLayer`): Projection layer mapping hidden tokens back to patch outputs. @@ -656,7 +656,7 @@ def __init__( self.blocks = nn.ModuleList( [ - MirageBlock( + PhotonBlock( self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, @@ -781,7 +781,7 @@ def forward( return_dict: bool = True, ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]: r""" - Forward pass of the MirageTransformer2DModel. + Forward pass of the PhotonTransformer2DModel. The latent image is split into patch tokens, combined with text conditioning, and processed through a stack of transformer blocks modulated by the timestep. diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 7b7ebb633c3b..ae0d90c48c63 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -144,7 +144,7 @@ "FluxKontextPipeline", "FluxKontextInpaintPipeline", ] - _import_structure["mirage"] = ["MiragePipeline"] + _import_structure["photon"] = ["PhotonPipeline"] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ "AudioLDM2Pipeline", diff --git a/src/diffusers/pipelines/mirage/__init__.py b/src/diffusers/pipelines/mirage/__init__.py deleted file mode 100644 index cba951057370..000000000000 --- a/src/diffusers/pipelines/mirage/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .pipeline_mirage import MiragePipeline -from .pipeline_output import MiragePipelineOutput - - -__all__ = ["MiragePipeline", "MiragePipelineOutput"] diff --git a/src/diffusers/pipelines/photon/__init__.py b/src/diffusers/pipelines/photon/__init__.py new file mode 100644 index 000000000000..d1dd5b2cbf53 --- /dev/null +++ b/src/diffusers/pipelines/photon/__init__.py @@ -0,0 +1,5 @@ +from .pipeline_photon import PhotonPipeline +from .pipeline_output import PhotonPipelineOutput + + +__all__ = ["PhotonPipeline", "PhotonPipelineOutput"] diff --git a/src/diffusers/pipelines/mirage/pipeline_output.py b/src/diffusers/pipelines/photon/pipeline_output.py similarity index 93% rename from src/diffusers/pipelines/mirage/pipeline_output.py rename to src/diffusers/pipelines/photon/pipeline_output.py index e41c8e3bea00..ca0674d94b6c 100644 --- a/src/diffusers/pipelines/mirage/pipeline_output.py +++ b/src/diffusers/pipelines/photon/pipeline_output.py @@ -22,9 +22,9 @@ @dataclass -class MiragePipelineOutput(BaseOutput): +class PhotonPipelineOutput(BaseOutput): """ - Output class for Mirage pipelines. + Output class for Photon pipelines. 
Args: images (`List[PIL.Image.Image]` or `np.ndarray`) diff --git a/src/diffusers/pipelines/mirage/pipeline_mirage.py b/src/diffusers/pipelines/photon/pipeline_photon.py similarity index 95% rename from src/diffusers/pipelines/mirage/pipeline_mirage.py rename to src/diffusers/pipelines/photon/pipeline_photon.py index ced78adec786..ce3479fedcdd 100644 --- a/src/diffusers/pipelines/mirage/pipeline_mirage.py +++ b/src/diffusers/pipelines/photon/pipeline_photon.py @@ -31,7 +31,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderDC, AutoencoderKL -from ...models.transformers.transformer_mirage import MirageTransformer2DModel, seq2img +from ...models.transformers.transformer_photon import PhotonTransformer2DModel, seq2img from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( logging, @@ -39,7 +39,7 @@ ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline -from .pipeline_output import MiragePipelineOutput +from .pipeline_output import PhotonPipelineOutput DEFAULT_HEIGHT = 512 @@ -49,7 +49,7 @@ class TextPreprocessor: - """Text preprocessing utility for MiragePipeline.""" + """Text preprocessing utility for PhotonPipeline.""" def __init__(self): """Initialize text preprocessor.""" @@ -179,15 +179,15 @@ def clean_text(self, text: str) -> str: Examples: ```py >>> import torch - >>> from diffusers import MiragePipeline + >>> from diffusers import PhotonPipeline >>> from diffusers.models import AutoencoderKL, AutoencoderDC >>> from transformers import T5GemmaModel, GemmaTokenizerFast >>> # Load pipeline directly with from_pretrained - >>> pipe = MiragePipeline.from_pretrained("path/to/mirage_checkpoint") + >>> pipe = PhotonPipeline.from_pretrained("path/to/photon_checkpoint") >>> # Or initialize pipeline components manually - >>> transformer = MirageTransformer2DModel.from_pretrained("path/to/transformer") + >>> transformer = PhotonTransformer2DModel.from_pretrained("path/to/transformer") >>> scheduler = FlowMatchEulerDiscreteScheduler() >>> # Load T5Gemma encoder >>> t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") @@ -195,7 +195,7 @@ def clean_text(self, text: str) -> str: >>> tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") >>> vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") - >>> pipe = MiragePipeline( + >>> pipe = PhotonPipeline( ... transformer=transformer, ... scheduler=scheduler, ... text_encoder=text_encoder, @@ -205,26 +205,26 @@ def clean_text(self, text: str) -> str: >>> pipe.to("cuda") >>> prompt = "A digital painting of a rusty, vintage tram on a sandy beach" >>> image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] - >>> image.save("mirage_output.png") + >>> image.save("photon_output.png") ``` """ -class MiragePipeline( +class PhotonPipeline( DiffusionPipeline, LoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin, ): r""" - Pipeline for text-to-image generation using Mirage Transformer. + Pipeline for text-to-image generation using Photon Transformer. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
Args: - transformer ([`MirageTransformer2DModel`]): - The Mirage transformer model to denoise the encoded image latents. + transformer ([`PhotonTransformer2DModel`]): + The Photon transformer model to denoise the encoded image latents. scheduler ([`FlowMatchEulerDiscreteScheduler`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. text_encoder ([`T5EncoderModel`]): @@ -248,7 +248,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P """ Override from_pretrained to load VAE and text encoder from HuggingFace models. - The MiragePipeline checkpoints only store transformer and scheduler locally. + The PhotonPipeline checkpoints only store transformer and scheduler locally. VAE and text encoder are loaded from external HuggingFace models as specified in model_index.json. """ @@ -285,7 +285,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Load transformer and scheduler from local checkpoint logger.info(f"Loading transformer from {pretrained_model_name_or_path}...") - transformer = MirageTransformer2DModel.from_pretrained( + transformer = PhotonTransformer2DModel.from_pretrained( pretrained_model_name_or_path, subfolder="transformer" ) @@ -310,7 +310,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P def __init__( self, - transformer: MirageTransformer2DModel, + transformer: PhotonTransformer2DModel, scheduler: FlowMatchEulerDiscreteScheduler, text_encoder: Union[T5EncoderModel, Any], tokenizer: Union[T5TokenizerFast, GemmaTokenizerFast, AutoTokenizer], @@ -318,9 +318,9 @@ def __init__( ): super().__init__() - if MirageTransformer2DModel is None: + if PhotonTransformer2DModel is None: raise ImportError( - "MirageTransformer2DModel is not available. Please ensure the transformer_mirage module is properly installed." + "PhotonTransformer2DModel is not available. Please ensure the transformer_photon module is properly installed." ) self.text_encoder = text_encoder @@ -544,7 +544,7 @@ def __call__( The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.mirage.MiragePipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.photon.PhotonPipelineOutput`] instead of a plain tuple. callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self, step, timestep, callback_kwargs)`. @@ -557,7 +557,7 @@ def __call__( Examples: Returns: - [`~pipelines.mirage.MiragePipelineOutput`] or `tuple`: [`~pipelines.mirage.MiragePipelineOutput`] if + [`~pipelines.photon.PhotonPipelineOutput`] or `tuple`: [`~pipelines.photon.PhotonPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. 
""" @@ -683,4 +683,4 @@ def __call__( if not return_dict: return (image,) - return MiragePipelineOutput(images=image) + return PhotonPipelineOutput(images=image) diff --git a/tests/models/transformers/test_models_transformer_mirage.py b/tests/models/transformers/test_models_transformer_photon.py similarity index 95% rename from tests/models/transformers/test_models_transformer_mirage.py rename to tests/models/transformers/test_models_transformer_photon.py index fe7436debc4c..2f08484d230c 100644 --- a/tests/models/transformers/test_models_transformer_mirage.py +++ b/tests/models/transformers/test_models_transformer_photon.py @@ -17,7 +17,7 @@ import torch -from diffusers.models.transformers.transformer_mirage import MirageTransformer2DModel +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from ...testing_utils import enable_full_determinism, torch_device from ..test_modeling_common import ModelTesterMixin @@ -26,8 +26,8 @@ enable_full_determinism() -class MirageTransformerTests(ModelTesterMixin, unittest.TestCase): - model_class = MirageTransformer2DModel +class PhotonTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = PhotonTransformer2DModel main_input_name = "image_latent" @property @@ -92,7 +92,7 @@ def test_forward_signature(self): def test_model_initialization(self): # Test model initialization - model = MirageTransformer2DModel( + model = PhotonTransformer2DModel( in_channels=16, patch_size=2, context_in_dim=1792, @@ -121,7 +121,7 @@ def test_model_with_dict_config(self): "theta": 10_000, } - model = MirageTransformer2DModel.from_config(config_dict) + model = PhotonTransformer2DModel.from_config(config_dict) self.assertEqual(model.config.in_channels, 16) self.assertEqual(model.config.hidden_size, 1792) @@ -193,7 +193,7 @@ def test_attention_mask(self): def test_invalid_config(self): # Test invalid configuration - hidden_size not divisible by num_heads with self.assertRaises(ValueError): - MirageTransformer2DModel( + PhotonTransformer2DModel( in_channels=16, patch_size=2, context_in_dim=1792, @@ -207,7 +207,7 @@ def test_invalid_config(self): # Test invalid axes_dim that doesn't sum to pe_dim with self.assertRaises(ValueError): - MirageTransformer2DModel( + PhotonTransformer2DModel( in_channels=16, patch_size=2, context_in_dim=1792, From fe0e3d5e6502f172b46761f396d47214452ef59b Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Wed, 8 Oct 2025 13:30:53 +0000 Subject: [PATCH 66/69] add text tower and vae in checkpoint --- scripts/convert_photon_to_diffusers.py | 88 +++++++++--- .../pipelines/photon/pipeline_photon.py | 127 ++++-------------- 2 files changed, 93 insertions(+), 122 deletions(-) diff --git a/scripts/convert_photon_to_diffusers.py b/scripts/convert_photon_to_diffusers.py index ad04463e019f..8e182bf182d0 100644 --- a/scripts/convert_photon_to_diffusers.py +++ b/scripts/convert_photon_to_diffusers.py @@ -19,8 +19,7 @@ from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from diffusers.pipelines.photon import PhotonPipeline -DEFAULT_HEIGHT = 512 -DEFAULT_WIDTH = 512 +DEFAULT_RESOLUTION = 512 @dataclass(frozen=True) class PhotonBase: @@ -47,16 +46,19 @@ class PhotonDCAE(PhotonBase): patch_size: int = 1 -def build_config(vae_type: str) -> dict: +def build_config(vae_type: str, resolution: int = DEFAULT_RESOLUTION) -> dict: if vae_type == "flux": cfg = PhotonFlux() + sample_size = resolution // 8 elif vae_type == "dc-ae": cfg = PhotonDCAE() + sample_size = resolution // 32 else: raise 
ValueError(f"Unsupported VAE type: {vae_type}. Use 'flux' or 'dc-ae'") config_dict = asdict(cfg) config_dict["axes_dim"] = list(config_dict["axes_dim"]) # type: ignore[index] + config_dict["sample_size"] = sample_size return config_dict @@ -194,35 +196,64 @@ def create_scheduler_config(output_path: str): -def create_model_index(vae_type: str, output_path: str): - """Create model_index.json for the pipeline with HuggingFace model references.""" +def download_and_save_vae(vae_type: str, output_path: str): + """Download and save VAE to local directory.""" + from diffusers import AutoencoderKL, AutoencoderDC + + vae_path = os.path.join(output_path, "vae") + os.makedirs(vae_path, exist_ok=True) if vae_type == "flux": - vae_model_name = "black-forest-labs/FLUX.1-dev" - vae_subfolder = "vae" - default_height = DEFAULT_HEIGHT - default_width = DEFAULT_WIDTH + print("Downloading FLUX VAE from black-forest-labs/FLUX.1-dev...") + vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") else: # dc-ae - vae_model_name = "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers" - vae_subfolder = None - default_height = DEFAULT_HEIGHT - default_width = DEFAULT_WIDTH + print("Downloading DC-AE VAE from mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers...") + vae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers") + + vae.save_pretrained(vae_path) + print(f"✓ Saved VAE to {vae_path}") + + +def download_and_save_text_encoder(output_path: str): + """Download and save T5Gemma text encoder and tokenizer.""" + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel + from transformers import GemmaTokenizerFast + + text_encoder_path = os.path.join(output_path, "text_encoder") + tokenizer_path = os.path.join(output_path, "tokenizer") + os.makedirs(text_encoder_path, exist_ok=True) + os.makedirs(tokenizer_path, exist_ok=True) + + print("Downloading T5Gemma model from google/t5gemma-2b-2b-ul2...") + t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") + + t5gemma_model.save_pretrained(text_encoder_path) + print(f"✓ Saved T5Gemma model to {text_encoder_path}") + + print("Downloading tokenizer from google/t5gemma-2b-2b-ul2...") + tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") + tokenizer.model_max_length = 256 + tokenizer.save_pretrained(tokenizer_path) + print(f"✓ Saved tokenizer to {tokenizer_path}") - # Text encoder and tokenizer always use T5Gemma - text_model_name = "google/t5gemma-2b-2b-ul2" + +def create_model_index(vae_type: str, output_path: str): + """Create model_index.json for the pipeline.""" + + if vae_type == "flux": + vae_class = "AutoencoderKL" + else: # dc-ae + vae_class = "AutoencoderDC" model_index = { "_class_name": "PhotonPipeline", "_diffusers_version": "0.31.0.dev0", "_name_or_path": os.path.basename(output_path), "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"], - "text_encoder": text_model_name, - "tokenizer": text_model_name, + "text_encoder": ["transformers", "T5GemmaModel"], + "tokenizer": ["transformers", "GemmaTokenizerFast"], "transformer": ["diffusers", "PhotonTransformer2DModel"], - "vae": vae_model_name, - "vae_subfolder": vae_subfolder, - "default_height": default_height, - "default_width": default_width, + "vae": ["diffusers", vae_class], } model_index_path = os.path.join(output_path, "model_index.json") @@ -234,7 +265,7 @@ def main(args): if not os.path.exists(args.checkpoint_path): raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint_path}") - config = 
build_config(args.vae_type) + config = build_config(args.vae_type, args.resolution) # Create output directory os.makedirs(args.output_path, exist_ok=True) @@ -256,8 +287,13 @@ def main(args): save_file(state_dict, os.path.join(transformer_path, "diffusion_pytorch_model.safetensors")) print(f"✓ Saved transformer to {transformer_path}") + # Create scheduler config create_scheduler_config(args.output_path) + download_and_save_vae(args.vae_type, args.output_path) + download_and_save_text_encoder(args.output_path) + + # Create model_index.json create_model_index(args.vae_type, args.output_path) # Verify the pipeline can be loaded @@ -303,6 +339,14 @@ def main(args): help="VAE type to use: 'flux' for AutoencoderKL (16 channels) or 'dc-ae' for AutoencoderDC (32 channels)", ) + parser.add_argument( + "--resolution", + type=int, + choices=[256, 512, 1024], + default=DEFAULT_RESOLUTION, + help="Target resolution for the model (256, 512, or 1024). Affects the transformer's sample_size.", + ) + args = parser.parse_args() try: diff --git a/src/diffusers/pipelines/photon/pipeline_photon.py b/src/diffusers/pipelines/photon/pipeline_photon.py index ce3479fedcdd..6272d7d8ae77 100644 --- a/src/diffusers/pipelines/photon/pipeline_photon.py +++ b/src/diffusers/pipelines/photon/pipeline_photon.py @@ -28,18 +28,18 @@ T5TokenizerFast, ) -from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderDC, AutoencoderKL -from ...models.transformers.transformer_photon import PhotonTransformer2DModel, seq2img -from ...schedulers import FlowMatchEulerDiscreteScheduler -from ...utils import ( +from diffusers.image_processor import VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderDC, AutoencoderKL +from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel, seq2img +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from diffusers.utils import ( logging, replace_example_docstring, ) -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from .pipeline_output import PhotonPipelineOutput +from diffusers.utils.torch_utils import randn_tensor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.photon.pipeline_output import PhotonPipelineOutput DEFAULT_HEIGHT = 512 @@ -180,29 +180,11 @@ def clean_text(self, text: str) -> str: ```py >>> import torch >>> from diffusers import PhotonPipeline - >>> from diffusers.models import AutoencoderKL, AutoencoderDC - >>> from transformers import T5GemmaModel, GemmaTokenizerFast - >>> # Load pipeline directly with from_pretrained + >>> # Load pipeline with from_pretrained >>> pipe = PhotonPipeline.from_pretrained("path/to/photon_checkpoint") - - >>> # Or initialize pipeline components manually - >>> transformer = PhotonTransformer2DModel.from_pretrained("path/to/transformer") - >>> scheduler = FlowMatchEulerDiscreteScheduler() - >>> # Load T5Gemma encoder - >>> t5gemma_model = T5GemmaModel.from_pretrained("google/t5gemma-2b-2b-ul2") - >>> text_encoder = t5gemma_model.encoder - >>> tokenizer = GemmaTokenizerFast.from_pretrained("google/t5gemma-2b-2b-ul2") - >>> vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae") - - >>> pipe = PhotonPipeline( - ... transformer=transformer, - ... scheduler=scheduler, - ... 
text_encoder=text_encoder, - ... tokenizer=tokenizer, - ... vae=vae - ... ) >>> pipe.to("cuda") + >>> prompt = "A digital painting of a rusty, vintage tram on a sandy beach" >>> image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] >>> image.save("photon_output.png") @@ -240,74 +222,6 @@ class PhotonPipeline( _callback_tensor_inputs = ["latents"] _optional_components = [] - # Component configurations for automatic loading - config_name = "model_index.json" - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - """ - Override from_pretrained to load VAE and text encoder from HuggingFace models. - - The PhotonPipeline checkpoints only store transformer and scheduler locally. - VAE and text encoder are loaded from external HuggingFace models as specified - in model_index.json. - """ - import json - from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel - - model_index_path = os.path.join(pretrained_model_name_or_path, "model_index.json") - if not os.path.exists(model_index_path): - raise ValueError(f"model_index.json not found in {pretrained_model_name_or_path}") - - with open(model_index_path, "r") as f: - model_index = json.load(f) - - vae_model_name = model_index.get("vae") - vae_subfolder = model_index.get("vae_subfolder") - text_model_name = model_index.get("text_encoder") - tokenizer_model_name = model_index.get("tokenizer") - default_height = model_index.get("default_height", DEFAULT_HEIGHT) - default_width = model_index.get("default_width", DEFAULT_WIDTH) - - logger.info(f"Loading VAE from {vae_model_name}...") - if "FLUX" in vae_model_name or "flux" in vae_model_name: - vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder=vae_subfolder) - else: # DC-AE - vae = AutoencoderDC.from_pretrained(vae_model_name) - - logger.info(f"Loading text encoder from {text_model_name}...") - t5gemma_model = T5GemmaModel.from_pretrained(text_model_name) - text_encoder = t5gemma_model.encoder - - logger.info(f"Loading tokenizer from {tokenizer_model_name}...") - tokenizer = GemmaTokenizerFast.from_pretrained(tokenizer_model_name) - tokenizer.model_max_length = 256 - - # Load transformer and scheduler from local checkpoint - logger.info(f"Loading transformer from {pretrained_model_name_or_path}...") - transformer = PhotonTransformer2DModel.from_pretrained( - pretrained_model_name_or_path, subfolder="transformer" - ) - - logger.info(f"Loading scheduler from {pretrained_model_name_or_path}...") - scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( - pretrained_model_name_or_path, subfolder="scheduler" - ) - - pipeline = cls( - transformer=transformer, - scheduler=scheduler, - text_encoder=text_encoder, - tokenizer=tokenizer, - vae=vae, - ) - - # Store default dimensions as pipeline attributes - pipeline.default_height = default_height - pipeline.default_width = default_width - - return pipeline - def __init__( self, transformer: PhotonTransformer2DModel, @@ -323,6 +237,10 @@ def __init__( "PhotonTransformer2DModel is not available. Please ensure the transformer_photon module is properly installed." 
) + # Extract encoder if text_encoder is T5GemmaModel + if hasattr(text_encoder, 'encoder'): + text_encoder = text_encoder.encoder + self.text_encoder = text_encoder self.tokenizer = tokenizer self.text_preprocessor = TextPreprocessor() @@ -337,7 +255,16 @@ def __init__( # Enhance VAE with universal properties for both AutoencoderKL and AutoencoderDC self._enhance_vae_properties() - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae.spatial_compression_ratio) + + # Set image processor using vae_scale_factor property + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Set default dimensions from transformer config + self.default_sample_size = ( + self.transformer.config.sample_size + if hasattr(self, "transformer") and self.transformer is not None and hasattr(self.transformer.config, "sample_size") + else 64 + ) def _enhance_vae_properties(self): """Add universal properties to VAE for consistent interface across AutoencoderKL and AutoencoderDC.""" @@ -563,8 +490,8 @@ def __call__( """ # 0. Default height and width to transformer config - height = height or getattr(self, 'default_height', DEFAULT_HEIGHT) - width = width or getattr(self, 'default_width', DEFAULT_WIDTH) + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor # 1. Check inputs self.check_inputs( From 855b068997f965003071411b005fa981ae2a6d49 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Wed, 8 Oct 2025 13:31:15 +0000 Subject: [PATCH 67/69] update doc --- docs/source/en/api/pipelines/photon.md | 41 +++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/docs/source/en/api/pipelines/photon.md b/docs/source/en/api/pipelines/photon.md index f8f7098545f8..e0f963b148d2 100644 --- a/docs/source/en/api/pipelines/photon.md +++ b/docs/source/en/api/pipelines/photon.md @@ -18,7 +18,7 @@ LoRA -Photon is a text-to-image diffusion model using a transformer-based architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports both Flux VAE (AutoencoderKL) and DC-AE (AutoencoderDC) for latent compression. +Photon is a text-to-image diffusion model using simplified MMDIT architecture with flow matching for efficient high-quality image generation. The model uses T5Gemma as the text encoder and supports either Flux VAE (AutoencoderKL) or DC-AE (AutoencoderDC) for latent compression. Key features: @@ -28,19 +28,37 @@ Key features: - **T5Gemma Text Encoder**: Uses Google's T5Gemma-2B-2B-UL2 model for text encoding offering multiple language support - **Efficient Architecture**: ~1.3B parameters in the transformer, enabling fast inference while maintaining quality +## Available models: +We offer a range of **Photon models** featuring different **VAE configurations**, each optimized for generating images at various resolutions. +Both **fine-tuned** and **non-fine-tuned** versions are available: + +- **Non-fine-tuned models** perform best with **highly detailed prompts**, capturing fine nuances and complex compositions. +- **Fine-tuned models**, trained on the [Alchemist dataset](https://huggingface.co/datasets/yandex/alchemist), enhance the **aesthetic quality** of the base models—especially when prompts are **less detailed**. 
+
+
+| Model | Recommended dtype | Resolution | Fine-tuned |
+|:-----:|:-----------------:|:----------:|:----------:|
+| [`Photoroom/photon-256-t2i`](https://huggingface.co/Photoroom/photon-256-t2i) | `torch.bfloat16` | 256x256 | No |
+| [`Photoroom/photon-256-t2i-sft`](https://huggingface.co/Photoroom/photon-256-t2i-sft) | `torch.bfloat16` | 256x256 | Yes |
+| [`Photoroom/photon-512-t2i`](https://huggingface.co/Photoroom/photon-512-t2i) | `torch.bfloat16` | 512x512 | No |
+| [`Photoroom/photon-512-t2i-sft`](https://huggingface.co/Photoroom/photon-512-t2i-sft) | `torch.bfloat16` | 512x512 | Yes |
+| [`Photoroom/photon-512-t2i-dc-ae`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae) | `torch.bfloat16` | 512x512 | No |
+| [`Photoroom/photon-512-t2i-dc-ae-sft`](https://huggingface.co/Photoroom/photon-512-t2i-dc-ae-sft) | `torch.bfloat16` | 512x512 | Yes |
+
+Refer to [this](https://huggingface.co/collections/Photoroom/photon-models-68e66254c202ebfab99ad38e) collection for more information.
 ## Loading the Pipeline
 
 Photon checkpoints only store the transformer and scheduler weights locally. The VAE and text encoder are automatically loaded from HuggingFace during pipeline initialization:
 
 ```py
-from diffusers import PhotonPipeline
+from diffusers.pipelines.photon import PhotonPipeline
 
 # Load pipeline - VAE and text encoder will be loaded from HuggingFace
-pipe = PhotonPipeline.from_pretrained("path/to/photon_checkpoint")
+pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i")
 pipe.to("cuda")
 
-prompt = "A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light"
+prompt = "A highly detailed 3D animated scene of a cute, intelligent duck scientist in a futuristic laboratory. The duck stands on a shiny metallic floor surrounded by glowing glass tubes filled with colorful liquids—blue, green, and purple—connected by translucent hoses emitting soft light. The duck wears a tiny white lab coat, safety goggles, and has a curious, determined expression while conducting an experiment. Sparks of energy and soft particle effects fill the air as scientific instruments hum with power. In the background, holographic screens display molecular diagrams and equations. Above the duck’s head, the word “PHOTON” glows vividly in midair as if made of pure light, illuminating the scene with a warm golden glow. The lighting is cinematic, with rich reflections and subtle depth of field, emphasizing a Pixar-like, ultra-polished 3D animation style. Rendered in ultra high resolution, realistic subsurface scattering on the duck’s feathers, and vibrant color grading that gives a sense of wonder and scientific discovery."
 image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0]
 image.save("photon_output.png")
 ```
@@ -59,12 +77,12 @@ from transformers import T5GemmaModel, GemmaTokenizerFast
 
 # Load transformer
 transformer = PhotonTransformer2DModel.from_pretrained(
-    "path/to/checkpoint", subfolder="transformer"
+    "Photoroom/photon-512-t2i", subfolder="transformer"
 )
 
 # Load scheduler
 scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
-    "path/to/checkpoint", subfolder="scheduler"
+    "Photoroom/photon-512-t2i", subfolder="scheduler"
 )
 
 # Load T5Gemma text encoder
@@ -116,8 +134,11 @@ Key parameters for image generation:
 
 ```py
 # Example with custom parameters
-image = pipe(
-    prompt="A vibrant night sky filled with colorful fireworks, with one large firework burst forming the glowing text “Photon” in bright, sparkling light",
+import torch
+from diffusers.pipelines.photon import PhotonPipeline
+
+image = pipe(
+    prompt="A highly detailed 3D animated scene of a cute, intelligent duck scientist in a futuristic laboratory. The duck stands on a shiny metallic floor surrounded by glowing glass tubes filled with colorful liquids—blue, green, and purple—connected by translucent hoses emitting soft light. The duck wears a tiny white lab coat, safety goggles, and has a curious, determined expression while conducting an experiment. Sparks of energy and soft particle effects fill the air as scientific instruments hum with power. In the background, holographic screens display molecular diagrams and equations. Above the duck’s head, the word “PHOTON” glows vividly in midair as if made of pure light, illuminating the scene with a warm golden glow. The lighting is cinematic, with rich reflections and subtle depth of field, emphasizing a Pixar-like, ultra-polished 3D animation style.
Rendered in ultra high resolution, realistic subsurface scattering on the duck’s feathers, and vibrant color grading that gives a sense of wonder and scientific discovery.", num_inference_steps=28, guidance_scale=4.0, height=512, @@ -132,9 +153,9 @@ For memory-constrained environments: ```py import torch -from diffusers import PhotonPipeline +from diffusers.pipelines.photon import PhotonPipeline -pipe = PhotonPipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16) +pipe = PhotonPipeline.from_pretrained("Photoroom/photon-512-t2i", torch_dtype=torch.float16) pipe.enable_model_cpu_offload() # Offload components to CPU when not in use # Or use sequential CPU offload for even lower memory From 89beae8286774a257859d26bb316bcb1db4c9cf4 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Wed, 8 Oct 2025 16:15:52 +0200 Subject: [PATCH 68/69] update photon doc --- docs/source/en/api/pipelines/photon.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/photon.md b/docs/source/en/api/pipelines/photon.md index e0f963b148d2..b78219f8b214 100644 --- a/docs/source/en/api/pipelines/photon.md +++ b/docs/source/en/api/pipelines/photon.md @@ -68,9 +68,8 @@ image.save("photon_output.png") You can also load components individually: ```py -import torch -from diffusers import PhotonPipeline -from diffusers.models import AutoencoderKL, AutoencoderDC +from diffusers.pipelines.photon import PhotonPipeline +from diffusers.models import AutoencoderKL from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from transformers import T5GemmaModel, GemmaTokenizerFast From 2df0e2f74db7c0afd5df9bf57ec1ca9601635d52 Mon Sep 17 00:00:00 2001 From: David Bertoin Date: Wed, 8 Oct 2025 14:21:06 +0000 Subject: [PATCH 69/69] ruff fixes --- scripts/convert_photon_to_diffusers.py | 22 +++---- src/diffusers/models/transformers/__init__.py | 2 +- .../models/transformers/transformer_photon.py | 62 +++++++++---------- src/diffusers/pipelines/photon/__init__.py | 2 +- .../pipelines/photon/pipeline_photon.py | 11 ++-- 5 files changed, 47 insertions(+), 52 deletions(-) diff --git a/scripts/convert_photon_to_diffusers.py b/scripts/convert_photon_to_diffusers.py index 8e182bf182d0..0dd114a68997 100644 --- a/scripts/convert_photon_to_diffusers.py +++ b/scripts/convert_photon_to_diffusers.py @@ -7,11 +7,11 @@ import json import os import sys +from dataclasses import asdict, dataclass +from typing import Dict, Tuple import torch from safetensors.torch import save_file -from dataclasses import dataclass, asdict -from typing import Tuple, Dict sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) @@ -19,8 +19,10 @@ from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel from diffusers.pipelines.photon import PhotonPipeline + DEFAULT_RESOLUTION = 512 + @dataclass(frozen=True) class PhotonBase: context_in_dim: int = 2304 @@ -62,7 +64,6 @@ def build_config(vae_type: str, resolution: int = DEFAULT_RESOLUTION) -> dict: return config_dict - def create_parameter_mapping(depth: int) -> dict: """Create mapping from old parameter names to new diffusers names.""" @@ -174,16 +175,10 @@ def create_transformer_from_checkpoint(checkpoint_path: str, config: dict) -> Ph return transformer - - def create_scheduler_config(output_path: str): """Create FlowMatchEulerDiscreteScheduler config.""" - scheduler_config = { - "_class_name": 
"FlowMatchEulerDiscreteScheduler", - "num_train_timesteps": 1000, - "shift": 1.0 - } + scheduler_config = {"_class_name": "FlowMatchEulerDiscreteScheduler", "num_train_timesteps": 1000, "shift": 1.0} scheduler_path = os.path.join(output_path, "scheduler") os.makedirs(scheduler_path, exist_ok=True) @@ -194,11 +189,9 @@ def create_scheduler_config(output_path: str): print("✓ Created scheduler config") - - def download_and_save_vae(vae_type: str, output_path: str): """Download and save VAE to local directory.""" - from diffusers import AutoencoderKL, AutoencoderDC + from diffusers import AutoencoderDC, AutoencoderKL vae_path = os.path.join(output_path, "vae") os.makedirs(vae_path, exist_ok=True) @@ -216,8 +209,8 @@ def download_and_save_vae(vae_type: str, output_path: str): def download_and_save_text_encoder(output_path: str): """Download and save T5Gemma text encoder and tokenizer.""" - from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel from transformers import GemmaTokenizerFast + from transformers.models.t5gemma.modeling_t5gemma import T5GemmaModel text_encoder_path = os.path.join(output_path, "text_encoder") tokenizer_path = os.path.join(output_path, "tokenizer") @@ -260,6 +253,7 @@ def create_model_index(vae_type: str, output_path: str): with open(model_index_path, "w") as f: json.dump(model_index, f, indent=2) + def main(args): # Validate inputs if not os.path.exists(args.checkpoint_path): diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index 652f6d811393..7fdab560a702 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -29,9 +29,9 @@ from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel from .transformer_ltx import LTXVideoTransformer3DModel from .transformer_lumina2 import Lumina2Transformer2DModel - from .transformer_photon import PhotonTransformer2DModel from .transformer_mochi import MochiTransformer3DModel from .transformer_omnigen import OmniGenTransformer2DModel + from .transformer_photon import PhotonTransformer2DModel from .transformer_qwenimage import QwenImageTransformer2DModel from .transformer_sd3 import SD3Transformer2DModel from .transformer_skyreels_v2 import SkyReelsV2Transformer3DModel diff --git a/src/diffusers/models/transformers/transformer_photon.py b/src/diffusers/models/transformers/transformer_photon.py index 9ec6e9756c20..452be9d2e6df 100644 --- a/src/diffusers/models/transformers/transformer_photon.py +++ b/src/diffusers/models/transformers/transformer_photon.py @@ -54,7 +54,7 @@ def get_image_ids(batch_size: int, height: int, width: int, patch_size: int, dev Tensor of shape `(batch_size, num_patches, 2)` containing the (row, col) coordinates of each patch in the image grid. """ - + img_ids = torch.zeros(height // patch_size, width // patch_size, 2, device=device) img_ids[..., 0] = torch.arange(height // patch_size, device=device)[:, None] img_ids[..., 1] = torch.arange(width // patch_size, device=device)[None, :] @@ -69,7 +69,7 @@ def apply_rope(xq: Tensor, freqs_cis: Tensor) -> Tensor: xq (`torch.Tensor`): Input tensor of shape `(..., dim)` representing the queries. freqs_cis (`torch.Tensor`): - Precomputed rotary frequency components of shape `(..., dim/2, 2)` + Precomputed rotary frequency components of shape `(..., dim/2, 2)` containing cosine and sine pairs. 
Returns: @@ -88,7 +88,7 @@ class EmbedND(nn.Module): This module creates rotary embeddings (RoPE) across multiple axes, where each axis can have its own embedding dimension. The embeddings are combined and returned as a single tensor - + Parameters: dim (int): Base embedding dimension (must be even). @@ -97,6 +97,7 @@ class EmbedND(nn.Module): axes_dim (list[int]): List of embedding dimensions for each axis (each must be even). """ + def __init__(self, dim: int, theta: int, axes_dim: list[int]): super().__init__() self.dim = dim @@ -136,6 +137,7 @@ class MLPEmbedder(nn.Module): `torch.Tensor`: Tensor of shape `(..., hidden_dim)` containing the embedded representations. """ + def __init__(self, in_dim: int, hidden_dim: int): super().__init__() self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) @@ -160,6 +162,7 @@ class QKNorm(torch.nn.Module): A tuple `(q, k)` where both are normalized and cast to the same dtype as the value tensor `v`. """ + def __init__(self, dim: int): super().__init__() self.query_norm = RMSNorm(dim, eps=1e-6) @@ -195,6 +198,7 @@ class Modulation(nn.Module): A tuple of two modulation outputs. Each `ModulationOut` contains three components (e.g., scale, shift, gate). """ + def __init__(self, dim: int): super().__init__() self.lin = nn.Linear(dim, 6 * dim, bias=True) @@ -269,6 +273,7 @@ class PhotonBlock(nn.Module): `torch.Tensor`: Attention output of shape `(B, L_img, hidden_size)`. """ + def __init__( self, hidden_size: int, @@ -363,9 +368,7 @@ def _attn_forward( if attention_mask.dim() != 2: raise ValueError(f"Unsupported attention_mask shape: {attention_mask.shape}") if attention_mask.shape[-1] != l_txt: - raise ValueError( - f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}" - ) + raise ValueError(f"attention_mask last dim {attention_mask.shape[-1]} must equal text length {l_txt}") device = img_q.device @@ -407,31 +410,30 @@ def forward( **_: dict[str, Any], ) -> Tensor: r""" - Runs modulation-gated cross-attention and MLP, with residual connections. - - Parameters: - img (`torch.Tensor`): - Image tokens of shape `(B, L_img, hidden_size)`. - txt (`torch.Tensor`): - Text tokens of shape `(B, L_txt, hidden_size)`. - vec (`torch.Tensor`): - Conditioning vector used by `Modulation` to produce scale/shift/gates, - shape `(B, hidden_size)` (or broadcastable). - pe (`torch.Tensor`): - Rotary positional embeddings applied inside attention. - spatial_conditioning (`torch.Tensor`, *optional*): - Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. Used only - if spatial conditioning is enabled in the block. - attention_mask (`torch.Tensor`, *optional*): - Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding. - **_: - Ignored additional keyword arguments for API compatibility. + Runs modulation-gated cross-attention and MLP, with residual connections. - Returns: - `torch.Tensor`: - Updated image tokens of shape `(B, L_img, hidden_size)`. - """ + Parameters: + img (`torch.Tensor`): + Image tokens of shape `(B, L_img, hidden_size)`. + txt (`torch.Tensor`): + Text tokens of shape `(B, L_txt, hidden_size)`. + vec (`torch.Tensor`): + Conditioning vector used by `Modulation` to produce scale/shift/gates, + shape `(B, hidden_size)` (or broadcastable). + pe (`torch.Tensor`): + Rotary positional embeddings applied inside attention. + spatial_conditioning (`torch.Tensor`, *optional*): + Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. Used only + if spatial conditioning is enabled in the block. 
+ attention_mask (`torch.Tensor`, *optional*): + Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding. + **_: + Ignored additional keyword arguments for API compatibility. + Returns: + `torch.Tensor`: + Updated image tokens of shape `(B, L_img, hidden_size)`. + """ mod_attn, mod_mlp = self.modulation(vec) @@ -475,7 +477,6 @@ class LastLayer(nn.Module): """ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): - super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) @@ -727,7 +728,6 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): fn_recursive_attn_processor(name, module, processor) def _process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]: - txt = self.txt_in(txt) img = img2seq(image_latent, self.patch_size) bs, _, h, w = image_latent.shape diff --git a/src/diffusers/pipelines/photon/__init__.py b/src/diffusers/pipelines/photon/__init__.py index d1dd5b2cbf53..559c9d0b1d2d 100644 --- a/src/diffusers/pipelines/photon/__init__.py +++ b/src/diffusers/pipelines/photon/__init__.py @@ -1,5 +1,5 @@ -from .pipeline_photon import PhotonPipeline from .pipeline_output import PhotonPipelineOutput +from .pipeline_photon import PhotonPipeline __all__ = ["PhotonPipeline", "PhotonPipelineOutput"] diff --git a/src/diffusers/pipelines/photon/pipeline_photon.py b/src/diffusers/pipelines/photon/pipeline_photon.py index 6272d7d8ae77..0fc926261517 100644 --- a/src/diffusers/pipelines/photon/pipeline_photon.py +++ b/src/diffusers/pipelines/photon/pipeline_photon.py @@ -14,7 +14,6 @@ import html import inspect -import os import re import urllib.parse as ul from typing import Any, Callable, Dict, List, Optional, Union @@ -32,14 +31,14 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderDC, AutoencoderKL from diffusers.models.transformers.transformer_photon import PhotonTransformer2DModel, seq2img +from diffusers.pipelines.photon.pipeline_output import PhotonPipelineOutput +from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.utils import ( logging, replace_example_docstring, ) from diffusers.utils.torch_utils import randn_tensor -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.pipelines.photon.pipeline_output import PhotonPipelineOutput DEFAULT_HEIGHT = 512 @@ -238,7 +237,7 @@ def __init__( ) # Extract encoder if text_encoder is T5GemmaModel - if hasattr(text_encoder, 'encoder'): + if hasattr(text_encoder, "encoder"): text_encoder = text_encoder.encoder self.text_encoder = text_encoder @@ -262,7 +261,9 @@ def __init__( # Set default dimensions from transformer config self.default_sample_size = ( self.transformer.config.sample_size - if hasattr(self, "transformer") and self.transformer is not None and hasattr(self.transformer.config, "sample_size") + if hasattr(self, "transformer") + and self.transformer is not None + and hasattr(self.transformer.config, "sample_size") else 64 )
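A minimal sketch of how the resolution defaults introduced above fit together: `PhotonPipeline.__init__` falls back to a `default_sample_size` of 64 when the transformer config has no `sample_size`, and `__call__` multiplies it by `vae_scale_factor` to choose `height` and `width`. The concrete values below (sample_size 64 for a 512px checkpoint, spatial compression 8 for the Flux AutoencoderKL, 32 for DC-AE) are assumptions for illustration, not numbers taken from this diff.

```py
# Standalone sketch of the default-resolution logic; values are assumed,
# not read from a real checkpoint.
sample_size = 64        # assumed transformer.config.sample_size for a 512px Flux-VAE model
vae_scale_factor = 8    # assumed spatial compression of the Flux VAE (a DC-AE model would use 32)

default_sample_size = sample_size if sample_size is not None else 64  # mirrors the __init__ fallback
height = None           # caller omitted height/width
width = None

height = height or default_sample_size * vae_scale_factor
width = width or default_sample_size * vae_scale_factor
print(height, width)    # 512 512
```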