Merge remote-tracking branch '11698/chroma' into chroma-final

DN6 · DN6 · commit f35ec17a837e · 2025-06-13T11:28:57.000+05:30
diff --git a/.github/workflows/pr_style_bot.yml b/.github/workflows/pr_style_bot.yml
@@ -14,4 +14,4 @@ jobs:
     with:
       python_quality_dependencies: "[quality]"
     secrets:
-      bot_token: ${{ secrets.GITHUB_TOKEN }}
+      bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }}
diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md
@@ -65,6 +65,9 @@ transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
 
 For speed and memory benchmarks on Flux and CogVideoX, please refer to the table [here](https://github.com/huggingface/diffusers/pull/10009#issue-2688781450). You can also find some torchao [benchmarks](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks) numbers for various hardware.
 
+> [!TIP]
+> The FP8 post-training quantization schemes in torchao are effective for GPUs with compute capability of at least 8.9 (RTX 4090, Hopper, etc.). FP8 often provides the best speed, memory, and quality trade-off when generating images and videos. We recommend combining FP8 and `torch.compile` if your GPU is compatible.
+
 torchao also supports an automatic quantization API through [autoquant](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#autoquantization). Autoquantization determines the best quantization strategy applicable to a model by comparing the performance of each technique on chosen input types and shapes. Currently, this can be used directly on the underlying modeling components. Diffusers will also expose an autoquant configuration option in the future.
 
 The `TorchAoConfig` class accepts three parameters:
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
@@ -3323,8 +3323,8 @@ def convert_chroma_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
     num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "double_blocks." in k))[-1] + 1  # noqa: C401
     num_single_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "single_blocks." in k))[-1] + 1  # noqa: C401
     num_guidance_layers = (
-        list({int(k.split(".", 3)[2]) for k in checkpoint if "distilled_guidance_layer.layers." in k})[-1] + 1
-    )  # noqa: C401
+        list(set(int(k.split(".", 3)[2]) for k in checkpoint if "distilled_guidance_layer.layers." in k))[-1] + 1  # noqa: C401
+    )
     mlp_ratio = 4.0
     inner_dim = 3072
 
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
@@ -532,6 +532,7 @@
         )
         from .aura_flow import AuraFlowPipeline
         from .blip_diffusion import BlipDiffusionPipeline
+        from .chroma import ChromaPipeline
         from .cogvideo import (
             CogVideoXFunControlPipeline,
             CogVideoXImageToVideoPipeline,
diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py
@@ -182,7 +182,6 @@ def __init__(
         transformer: ChromaTransformer2DModel,
         image_encoder: CLIPVisionModelWithProjection = None,
         feature_extractor: CLIPImageProcessor = None,
-        variant: str = "flux",
     ):
         super().__init__()
 
@@ -220,24 +219,20 @@ def _get_t5_prompt_embeds(
 
         text_inputs = self.tokenizer(
             prompt,
-            padding="max_length",
+            padding=False,
             max_length=max_sequence_length,
             truncation=True,
             return_length=False,
             return_overflowing_tokens=False,
             return_tensors="pt",
         )
-        text_input_ids = text_inputs.input_ids
+        text_input_ids = text_inputs.input_ids + self.tokenizer.pad_token_id
 
         prompt_embeds = self.text_encoder(
             text_input_ids.to(device),
             output_hidden_states=False,
-            attention_mask=text_inputs.attention_mask.to(device),
         )[0]
 
-        max_len = min(text_inputs.attention_mask.sum() + 1, max_sequence_length)
-        prompt_embeds = prompt_embeds[:, :max_len]
-
         dtype = self.text_encoder.dtype
         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
 
@@ -397,7 +392,6 @@ def check_inputs(
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latent_image_ids
     @staticmethod
     def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
         latent_image_ids = torch.zeros(height, width, 3)
@@ -412,7 +406,6 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
 
         return latent_image_ids.to(device=device, dtype=dtype)
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
     @staticmethod
     def _pack_latents(latents, batch_size, num_channels_latents, height, width):
         latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
@@ -421,7 +414,6 @@ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
 
         return latents
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
     @staticmethod
     def _unpack_latents(latents, height, width, vae_scale_factor):
         batch_size, num_patches, channels = latents.shape
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -272,6 +272,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class ChromaPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class CLIPImageProjection(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 

Original file line number	Diff line number	Diff line change
`@@ -532,6 +532,7 @@`
`532`	`532`	`)`
`533`	`533`	`from .aura_flow import AuraFlowPipeline`
`534`	`534`	`from .blip_diffusion import BlipDiffusionPipeline`
	`535`	`+ from .chroma import ChromaPipeline`
`535`	`536`	`from .cogvideo import (`
`536`	`537`	`CogVideoXFunControlPipeline,`
`537`	`538`	`CogVideoXImageToVideoPipeline,`