# Copyright Philip Brown, ppbrown@github
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

###########################################################################
# This pipeline attempts to use a model that combines an SDXL VAE, a T5
# text encoder, and an SDXL UNet.
# At present (2025/06/10) there are no pretrained models that give
# pleasing output, so this pipeline is somewhat of a tech demo proving
# that the pieces can at least be put together.
# Hopefully, it will encourage someone with the hardware available to
# throw enough resources into training one up.


from typing import Optional

import torch.nn as nn
from transformers import (
    CLIPImageProcessor,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
    T5EncoderModel,
)

from diffusers import DiffusionPipeline, StableDiffusionXLPipeline
from diffusers.image_processor import VaeImageProcessor
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.schedulers import KarrasDiffusionSchedulers


# Note: at this time, the intent is to use the T5 encoder named below
# with zero changes. The model therefore deliberately does not store its
# own copy of the T5 encoder weights (since they are not unique!) and
# instead takes advantage of Hugging Face Hub cache loading.

T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly"

# Caller is expected to load this, or an equivalent, as the model name for now,
#   e.g.: pipe = StableDiffusionXL_T5Pipeline.from_pretrained(SDXL_NAME)
SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0"

class LinearWithDtype(nn.Linear):
    """nn.Linear that exposes a .dtype attribute.

    The diffusers pipeline machinery expects registered modules to report
    a dtype, which a bare nn.Linear does not, so derive it from the weight.
    """

    @property
    def dtype(self):
        return self.weight.dtype


class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline):
    _expected_modules = [
        "vae",
        "unet",
        "scheduler",
        "tokenizer",
        "image_encoder",
        "feature_extractor",
        "t5_encoder",
        "t5_projection",
        "t5_pooled_projection",
    ]

    _optional_components = [
        "image_encoder",
        "feature_extractor",
        "t5_encoder",
        "t5_projection",
        "t5_pooled_projection",
    ]

    def __init__(
        self,
        vae: AutoencoderKL,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        tokenizer: CLIPTokenizer,
        t5_encoder=None,
        t5_projection=None,
        t5_pooled_projection=None,
        image_encoder: CLIPVisionModelWithProjection = None,
        feature_extractor: CLIPImageProcessor = None,
        force_zeros_for_empty_prompt: bool = True,
        add_watermarker: Optional[bool] = None,
    ):
        DiffusionPipeline.__init__(self)

        if t5_encoder is None:
            self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, torch_dtype=unet.dtype)
        else:
            self.t5_encoder = t5_encoder

        # ----- build T5 4096 => 2048 dim projection -----
        if t5_projection is None:
            self.t5_projection = LinearWithDtype(4096, 2048)  # trainable
        else:
            self.t5_projection = t5_projection
        self.t5_projection.to(dtype=unet.dtype)
        # ----- build T5 4096 => 1280 dim projection -----
        if t5_pooled_projection is None:
            self.t5_pooled_projection = LinearWithDtype(4096, 1280)  # trainable
        else:
            self.t5_pooled_projection = t5_pooled_projection
        self.t5_pooled_projection.to(dtype=unet.dtype)
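        # Note: 2048 matches the SDXL UNet's cross-attention width, and
        # 1280 matches the pooled "text_embeds" vector consumed by its
        # add_embedding conditioning path.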

        print("dtype of Linear is ", self.t5_projection.dtype)

        self.register_modules(
            vae=vae,
            unet=unet,
            scheduler=scheduler,
            tokenizer=tokenizer,
            t5_encoder=self.t5_encoder,
            t5_projection=self.t5_projection,
            t5_pooled_projection=self.t5_pooled_projection,
            image_encoder=image_encoder,
            feature_extractor=feature_extractor,
        )
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

        self.default_sample_size = (
            self.unet.config.sample_size
            if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
            else 128
        )

        self.watermark = None

        # Parts of the original SDXL class complain if these attributes
        # are not at least present.
        self.text_encoder = self.text_encoder_2 = None

    # ------------------------------------------------------------------
    #  Encode a text prompt (T5-XXL + 4096 => 2048 projection)
    #  Returns exactly four tensors in the order SDXL's __call__ expects.
    # ------------------------------------------------------------------
    def encode_prompt(
        self,
        prompt,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: str | None = None,
        **_,
    ):
        """
        Returns
        -------
        prompt_embeds                : Tensor [B, T, 2048]
        negative_prompt_embeds       : Tensor [B, T, 2048] | None
        pooled_prompt_embeds         : Tensor [B, 1280]
        negative_pooled_prompt_embeds: Tensor [B, 1280]    | None

        where B = batch * num_images_per_prompt
        """

        # --- helper to tokenize on the pipeline's device ----------------
        def _tok(text: str):
            tok_out = self.tokenizer(
                text,
                return_tensors="pt",
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
            ).to(self.device)
            return tok_out.input_ids, tok_out.attention_mask

        # ---------- positive stream -------------------------------------
        ids, mask = _tok(prompt)
        h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state  # [b, T, 4096]
        tok_pos = self.t5_projection(h_pos)  # [b, T, 2048]
        pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1))  # [b, 1280]; mean over all T positions, padding included

        # expand for multiple images per prompt
        tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0)
        pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0)

        # ---------- negative / CFG stream --------------------------------
        if do_classifier_free_guidance:
            neg_text = "" if negative_prompt is None else negative_prompt
            ids_n, mask_n = _tok(neg_text)
            h_neg = self.t5_encoder(ids_n, attention_mask=mask_n).last_hidden_state
            tok_neg = self.t5_projection(h_neg)
            pool_neg = self.t5_pooled_projection(h_neg.mean(dim=1))

            tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0)
            pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0)
        else:
            tok_neg = pool_neg = None

        # ----------------- final ordered return --------------------------
        # 1) positive token embeddings
        # 2) negative token embeddings (or None)
        # 3) positive pooled embeddings
        # 4) negative pooled embeddings (or None)
        return tok_pos, tok_neg, pool_pos, pool_neg
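
# A sketch of calling encode_prompt directly (assumes `pipe` was built as
# in the example near SDXL_NAME and moved to the desired device):
#
#   pe, neg_pe, pooled, neg_pooled = pipe.encode_prompt(
#       "a watercolor fox", num_images_per_prompt=2
#   )
#   # pe:     [2, 77, 2048]   neg_pe:     [2, 77, 2048]
#   # pooled: [2, 1280]       neg_pooled: [2, 1280]
#
# These four tensors line up with the prompt_embeds, negative_prompt_embeds,
# pooled_prompt_embeds, and negative_pooled_prompt_embeds arguments of the
# inherited SDXL __call__.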