
Commit 1245393

Merge branch 'main' into xpu-precision-ut

2 parents: b35ba59 + 42077e6

File tree

7 files changed: +144 −34 lines changed


docs/source/en/api/pipelines/chroma.md

Lines changed: 49 additions & 17 deletions

````diff
@@ -27,9 +27,36 @@ Chroma can use all the same optimizations as Flux.
 
 </Tip>
 
-## Inference (Single File)
+## Inference
 
-The `ChromaTransformer2DModel` supports loading checkpoints in the original format. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community.
+The Diffusers version of Chroma is based on the [`unlocked-v37`](https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v37.safetensors) version of the original model, which is available in the [Chroma repository](https://huggingface.co/lodestones/Chroma).
+
+```python
+import torch
+from diffusers import ChromaPipeline
+
+pipe = ChromaPipeline.from_pretrained("lodestones/Chroma", torch_dtype=torch.bfloat16)
+pipe.enable_model_cpu_offload()
+
+prompt = [
+    "A high-fashion close-up portrait of a blonde woman in clear sunglasses. The image uses a bold teal and red color split for dramatic lighting. The background is a simple teal-green. The photo is sharp and well-composed, and is designed for viewing with anaglyph 3D glasses for optimal effect. It looks professionally done."
+]
+negative_prompt = ["low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"]
+
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    generator=torch.Generator("cpu").manual_seed(433),
+    num_inference_steps=40,
+    guidance_scale=3.0,
+    num_images_per_prompt=1,
+).images[0]
+image.save("chroma.png")
+```
+
+## Loading from a single file
+
+To use updated model checkpoints that are not in the Diffusers format, you can use the `ChromaTransformer2DModel` class to load the model from a single file in the original format. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community.
 
 The following example demonstrates how to run Chroma from a single file.
 
@@ -38,34 +65,39 @@ Then run the following example
 ```python
 import torch
 from diffusers import ChromaTransformer2DModel, ChromaPipeline
-from transformers import T5EncoderModel
 
-bfl_repo = "black-forest-labs/FLUX.1-dev"
+model_id = "lodestones/Chroma"
 dtype = torch.bfloat16
 
-transformer = ChromaTransformer2DModel.from_single_file("https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v35.safetensors", torch_dtype=dtype)
-
-text_encoder = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
-tokenizer = T5Tokenizer.from_pretrained(bfl_repo, subfolder="tokenizer_2", torch_dtype=dtype)
-
-pipe = ChromaPipeline.from_pretrained(bfl_repo, transformer=transformer, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=dtype)
+transformer = ChromaTransformer2DModel.from_single_file("https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v37.safetensors", torch_dtype=dtype)
 
+pipe = ChromaPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=dtype)
 pipe.enable_model_cpu_offload()
 
-prompt = "A cat holding a sign that says hello world"
+prompt = [
+    "A high-fashion close-up portrait of a blonde woman in clear sunglasses. The image uses a bold teal and red color split for dramatic lighting. The background is a simple teal-green. The photo is sharp and well-composed, and is designed for viewing with anaglyph 3D glasses for optimal effect. It looks professionally done."
+]
+negative_prompt = ["low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"]
+
 image = pipe(
-    prompt,
-    guidance_scale=4.0,
-    output_type="pil",
-    num_inference_steps=26,
-    generator=torch.Generator("cpu").manual_seed(0)
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    generator=torch.Generator("cpu").manual_seed(433),
+    num_inference_steps=40,
+    guidance_scale=3.0,
 ).images[0]
 
-image.save("image.png")
+image.save("chroma-single-file.png")
 ```
 
 ## ChromaPipeline
 
 [[autodoc]] ChromaPipeline
 - all
 - __call__
+
+## ChromaImg2ImgPipeline
+
+[[autodoc]] ChromaImg2ImgPipeline
+- all
+- __call__
````
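The new doc section mentions community finetunes and quantized checkpoints. As a companion to the diff above, here is a minimal sketch of how a quantized single-file checkpoint could be loaded, assuming a community GGUF export of Chroma exists at the hypothetical URL below; the `GGUFQuantizationConfig` route mirrors the documented Flux single-file workflow and is not itself part of this commit.

```python
import torch
from diffusers import ChromaPipeline, ChromaTransformer2DModel, GGUFQuantizationConfig

# Hypothetical community GGUF export; substitute a real checkpoint URL.
ckpt_path = "https://huggingface.co/<user>/<repo>/blob/main/chroma-unlocked-v37-Q8_0.gguf"

transformer = ChromaTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = ChromaPipeline.from_pretrained(
    "lodestones/Chroma", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()
```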

src/diffusers/pipelines/chroma/pipeline_chroma.py

Lines changed: 9 additions & 8 deletions

````diff
@@ -52,20 +52,21 @@
 >>> import torch
 >>> from diffusers import ChromaPipeline
 
+>>> model_id = "lodestones/Chroma"
 >>> ckpt_path = "https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v37.safetensors"
 >>> transformer = ChromaTransformer2DModel.from_single_file(ckpt_path, torch_dtype=torch.bfloat16)
->>> text_encoder = AutoModel.from_pretrained("black-forest-labs/FLUX.1-schnell", subfolder="text_encoder_2")
->>> tokenizer = AutoTokenizer.from_pretrained("black-forest-labs/FLUX.1-schnell", subfolder="tokenizer_2")
->>> pipe = ChromaImg2ImgPipeline.from_pretrained(
-...     "black-forest-labs/FLUX.1-schnell",
+>>> pipe = ChromaPipeline.from_pretrained(
+...     model_id,
 ...     transformer=transformer,
-...     text_encoder=text_encoder,
-...     tokenizer=tokenizer,
 ...     torch_dtype=torch.bfloat16,
 ... )
 >>> pipe.enable_model_cpu_offload()
->>> prompt = "A cat holding a sign that says hello world"
->>> negative_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
+>>> prompt = [
+...     "A high-fashion close-up portrait of a blonde woman in clear sunglasses. The image uses a bold teal and red color split for dramatic lighting. The background is a simple teal-green. The photo is sharp and well-composed, and is designed for viewing with anaglyph 3D glasses for optimal effect. It looks professionally done."
+... ]
+>>> negative_prompt = [
+...     "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
+... ]
 >>> image = pipe(prompt, negative_prompt=negative_prompt).images[0]
 >>> image.save("chroma.png")
 ```
````

src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py

Lines changed: 4 additions & 9 deletions

````diff
@@ -51,26 +51,21 @@
 ```py
 >>> import torch
 >>> from diffusers import ChromaTransformer2DModel, ChromaImg2ImgPipeline
->>> from transformers import AutoModel, Autotokenizer
 
+>>> model_id = "lodestones/Chroma"
 >>> ckpt_path = "https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v37.safetensors"
->>> transformer = ChromaTransformer2DModel.from_single_file(ckpt_path, torch_dtype=torch.bfloat16)
->>> text_encoder = AutoModel.from_pretrained("black-forest-labs/FLUX.1-schnell", subfolder="text_encoder_2")
->>> tokenizer = AutoTokenizer.from_pretrained("black-forest-labs/FLUX.1-schnell", subfolder="tokenizer_2")
 >>> pipe = ChromaImg2ImgPipeline.from_pretrained(
-...     "black-forest-labs/FLUX.1-schnell",
+...     model_id,
 ...     transformer=transformer,
-...     text_encoder=text_encoder,
-...     tokenizer=tokenizer,
 ...     torch_dtype=torch.bfloat16,
 ... )
 >>> pipe.enable_model_cpu_offload()
->>> image = load_image(
+>>> init_image = load_image(
 ...     "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
 ... )
 >>> prompt = "a scenic fantasy landscape with a river and mountains in the background, vibrant colors, detailed, high resolution"
 >>> negative_prompt = "low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors"
->>> image = pipe(prompt, image=image, negative_prompt=negative_prompt).images[0]
+>>> image = pipe(prompt, image=init_image, negative_prompt=negative_prompt).images[0]
 >>> image.save("chroma-img2img.png")
 ```
 """
````

src/diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -44,6 +44,8 @@ def retrieve_latents(
 
 
 class LTXLatentUpsamplePipeline(DiffusionPipeline):
+    model_cpu_offload_seq = ""
+
     def __init__(
         self,
         vae: AutoencoderKLLTXVideo,
```
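For context on the one-line change above: `DiffusionPipeline` subclasses declare the order in which `enable_model_cpu_offload()` moves components between the accelerator and CPU as a `"->"`-separated class attribute. A minimal sketch of the convention follows, with an illustrative class and component names not taken from this commit; the empty string here appears to let the upsampler pipeline, which has no multi-model ordering, expose the offload API without declaring a sequence.

```python
from diffusers import DiffusionPipeline

class MyTextToImagePipeline(DiffusionPipeline):
    # Components are placed on the accelerator in this order during
    # __call__, each offloaded back to CPU before the next one runs.
    model_cpu_offload_seq = "text_encoder->transformer->vae"
```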

src/diffusers/pipelines/pipeline_loading_utils.py

Lines changed: 23 additions & 0 deletions

```diff
@@ -1131,3 +1131,26 @@ def _maybe_raise_error_for_incorrect_transformers(config_dict):
             break
     if has_transformers_component and not is_transformers_version(">", "4.47.1"):
         raise ValueError("Please upgrade your `transformers` installation to the latest version to use DDUF.")
+
+
+def _maybe_warn_for_wrong_component_in_quant_config(pipe_init_dict, quant_config):
+    if quant_config is None:
+        return
+
+    actual_pipe_components = set(pipe_init_dict.keys())
+    missing = ""
+    quant_components = None
+    if getattr(quant_config, "components_to_quantize", None) is not None:
+        quant_components = set(quant_config.components_to_quantize)
+    elif getattr(quant_config, "quant_mapping", None) is not None and isinstance(quant_config.quant_mapping, dict):
+        quant_components = set(quant_config.quant_mapping.keys())
+
+    if quant_components and not quant_components.issubset(actual_pipe_components):
+        missing = quant_components - actual_pipe_components
+
+    if missing:
+        logger.warning(
+            f"The following components in the quantization config {missing} will be ignored "
+            "as they do not belong to the underlying pipeline. Acceptable values for the pipeline "
+            f"components are: {', '.join(actual_pipe_components)}."
+        )
```
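The helper above is wired into `DiffusionPipeline.from_pretrained` in the next file. A minimal sketch of the behavior it adds, grounded in the tests at the end of this commit; the model id is illustrative and any pipeline repository would do.

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

# "foo" is not a component of the pipeline, so loading logs the new
# warning and ignores it; "transformer" is still quantized as requested.
quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_8bit",
    quant_kwargs={"load_in_8bit": True},
    components_to_quantize=["transformer", "foo"],
)
pipe = DiffusionPipeline.from_pretrained(
    "lodestones/Chroma",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```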

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -88,6 +88,7 @@
     _identify_model_variants,
     _maybe_raise_error_for_incorrect_transformers,
     _maybe_raise_warning_for_inpainting,
+    _maybe_warn_for_wrong_component_in_quant_config,
     _resolve_custom_pipeline_and_cls,
     _unwrap_model,
     _update_init_kwargs_with_connected_pipeline,
@@ -984,6 +985,7 @@ def load_module(name, value):
 
         # 7. Load each module in the pipeline
         current_device_map = None
+        _maybe_warn_for_wrong_component_in_quant_config(init_dict, quantization_config)
         for name, (library_name, class_name) in logging.tqdm(init_dict.items(), desc="Loading pipeline components..."):
             # 7.1 device_map shenanigans
             if final_device_map is not None and len(final_device_map) > 0:
```

tests/quantization/test_pipeline_level_quantization.py

Lines changed: 55 additions & 0 deletions

```diff
@@ -16,10 +16,13 @@
 import unittest
 
 import torch
+from parameterized import parameterized
 
 from diffusers import DiffusionPipeline, QuantoConfig
 from diffusers.quantizers import PipelineQuantizationConfig
+from diffusers.utils import logging
 from diffusers.utils.testing_utils import (
+    CaptureLogger,
     is_transformers_available,
     require_accelerate,
     require_bitsandbytes_version_greater,
@@ -188,3 +191,55 @@ def test_saving_loading(self):
         output_2 = loaded_pipe(**pipe_inputs, generator=torch.manual_seed(self.seed)).images
 
         self.assertTrue(torch.allclose(output_1, output_2))
+
+    @parameterized.expand(["quant_kwargs", "quant_mapping"])
+    def test_warn_invalid_component(self, method):
+        invalid_component = "foo"
+        if method == "quant_kwargs":
+            components_to_quantize = ["transformer", invalid_component]
+            quant_config = PipelineQuantizationConfig(
+                quant_backend="bitsandbytes_8bit",
+                quant_kwargs={"load_in_8bit": True},
+                components_to_quantize=components_to_quantize,
+            )
+        else:
+            quant_config = PipelineQuantizationConfig(
+                quant_mapping={
+                    "transformer": QuantoConfig("int8"),
+                    invalid_component: TranBitsAndBytesConfig(load_in_8bit=True),
+                }
+            )
+
+        logger = logging.get_logger("diffusers.pipelines.pipeline_loading_utils")
+        logger.setLevel(logging.WARNING)
+        with CaptureLogger(logger) as cap_logger:
+            _ = DiffusionPipeline.from_pretrained(
+                self.model_name,
+                quantization_config=quant_config,
+                torch_dtype=torch.bfloat16,
+            )
+        self.assertTrue(invalid_component in cap_logger.out)
+
+    @parameterized.expand(["quant_kwargs", "quant_mapping"])
+    def test_no_quantization_for_all_invalid_components(self, method):
+        invalid_component = "foo"
+        if method == "quant_kwargs":
+            components_to_quantize = [invalid_component]
+            quant_config = PipelineQuantizationConfig(
+                quant_backend="bitsandbytes_8bit",
+                quant_kwargs={"load_in_8bit": True},
+                components_to_quantize=components_to_quantize,
+            )
+        else:
+            quant_config = PipelineQuantizationConfig(
+                quant_mapping={invalid_component: TranBitsAndBytesConfig(load_in_8bit=True)}
+            )
+
+        pipe = DiffusionPipeline.from_pretrained(
+            self.model_name,
+            quantization_config=quant_config,
+            torch_dtype=torch.bfloat16,
+        )
+        for name, component in pipe.components.items():
+            if isinstance(component, torch.nn.Module):
+                self.assertTrue(not hasattr(component.config, "quantization_config"))
```
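A condensed sketch of the `CaptureLogger` idiom the new tests rely on, with the logger name taken from the diff above; the warning text is a stand-in for the real call made during pipeline loading.

```python
from diffusers.utils import logging
from diffusers.utils.testing_utils import CaptureLogger

# Capture warnings emitted by the module that hosts the new helper.
logger = logging.get_logger("diffusers.pipelines.pipeline_loading_utils")
logger.setLevel(logging.WARNING)

with CaptureLogger(logger) as cap_logger:
    logger.warning("components {'foo'} will be ignored")  # stand-in warning

assert "foo" in cap_logger.out
```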
