`docs/source/en/api/image_processor.md`: 6 additions & 0 deletions

[[autodoc]] image_processor.VaeImageProcessor

## InpaintProcessor

The [`InpaintProcessor`] accepts `mask` and `image` inputs and processes them together. Optionally, it can accept `padding_mask_crop` and apply a mask overlay.

[[autodoc]] image_processor.InpaintProcessor
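
A minimal usage sketch is shown below. The method names (`preprocess`, `postprocess`) and argument names are assumptions modeled on [`VaeImageProcessor`]; check the autodoc above for the actual signature.

```py
from PIL import Image
from diffusers.image_processor import InpaintProcessor

processor = InpaintProcessor()

init_image = Image.open("input.png").convert("RGB")
mask_image = Image.open("mask.png").convert("L")

# Assumed API: preprocess the image and mask together, optionally cropping
# around the masked region with padding_mask_crop.
processed = processor.preprocess(init_image, mask_image, padding_mask_crop=32)

# Assumed API: postprocess can paste the generated region back onto the
# original image (mask overlay).
# result = processor.postprocess(generated_image)
```
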

## VaeImageProcessorLDM3D

The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs.

`docs/source/en/optimization/memory.md`: 46 additions & 3 deletions

> [!WARNING]
> Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.

Enable group offloading by configuring the `offload_type` parameter to `block_level` or `leaf_level`.

- `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
- `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading), but it can be made faster with streams without giving up inference speed.

Group offloading is supported for entire pipelines or individual models. Applying group offloading to the entire pipeline is the easiest option, while selectively applying it to individual models gives users more flexibility to use different offloading techniques for different models.

<hfoptions id="group-offloading">
<hfoption id="pipeline">

Call [`~DiffusionPipeline.enable_group_offload`] on a pipeline.

```py
import torch
from diffusers import CogVideoXPipeline
from diffusers.hooks import apply_group_offloading

# Illustrative completion of the truncated snippet: the checkpoint and
# arguments are assumptions that mirror ModelMixin.enable_group_offload.
pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipeline.enable_group_offload(onload_device=torch.device("cuda"), offload_device=torch.device("cpu"), offload_type="leaf_level")
```

</hfoption>
</hfoptions>

Call [`~ModelMixin.enable_group_offload`] on standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
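
A sketch of the model-level call is shown below; the checkpoint, subfolder, and offload arguments are illustrative assumptions rather than values taken from this diff.

```py
import torch
from diffusers import AutoModel

# Illustrative: load a single ModelMixin component and offload it in blocks of 2 layers.
transformer = AutoModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16)
transformer.enable_group_offload(
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="block_level",
    num_blocks_per_group=2,
)
```
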

The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching: the next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory usage significantly, so ensure you have at least 2x the model size in available CPU memory.
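
As a sketch, streams are enabled through the `use_stream` flag on the same calls; the toy module below stands in for a real model component and the arguments are illustrative.

```py
import torch
from diffusers.hooks import apply_group_offloading

# A plain torch.nn.Module is offloaded with apply_group_offloading instead of enable_group_offload.
module = torch.nn.Sequential(*[torch.nn.Linear(4096, 4096) for _ in range(8)])
apply_group_offloading(
    module,
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="leaf_level",
    use_stream=True,  # prefetch the next layer on a separate CUDA stream while the current one runs
)
```
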

`docs/source/en/quantization/overview.md`: 4 additions & 1 deletion

> [!TIP]
> These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.

- `components_to_quantize` specifies which component(s) of the pipeline to quantize. Typically, you should quantize the most compute-intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one, such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.

  `components_to_quantize` accepts either a list for multiple models or a string for a single model.

The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`]: `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.
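
Such a configuration might look like the following sketch; the Flux checkpoint and the exact values are illustrative.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    # quantize the transformer and the T5 text encoder, keep CLIP (text_encoder) intact
    components_to_quantize=["transformer", "text_encoder_2"],
)

pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
)
```
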
The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
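
One possible sketch of `quant_mapping`, combining a Diffusers and a Transformers bitsandbytes config; the component names and values are illustrative.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig

pipeline_quant_config = PipelineQuantizationConfig(
    quant_mapping={
        # a diffusers config for the transformer and a transformers config for the T5 text encoder
        "transformer": DiffusersBitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16),
        "text_encoder_2": TransformersBitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16),
    }
)

pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
)
```
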

`docs/source/en/using-diffusers/image_quality.md`: 2 additions & 8 deletions

# FreeU

[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.
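
Enabling it might look like the following sketch, assuming a Stable Diffusion pipeline; the checkpoint and the scaling factors `s1`, `s2`, `b1`, `b2` are illustrative starting points.

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# s1/s2 scale the skip connections, b1/b2 scale the backbone features
pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
image = pipeline("an astronaut riding a horse on mars").images[0]
```
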
The `device_map` argument determines individual model or pipeline placement on an accelerator like a GPU. It is especially helpful when there are multiple GPUs.

A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Refer to the table below to compare the placement strategies.

| parameter | description |
|---|---|
| `"cuda"` | places the pipeline on a supported accelerator device like CUDA |
| `"balanced"` | evenly distributes the pipeline across all GPUs |

Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.

```py
import torch
from diffusers import DiffusionPipeline

max_memory = {0: "16GB", 1: "16GB"}
pipeline = DiffusionPipeline.from_pretrained(
    "Qwen/Qwen-Image",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    max_memory=max_memory,  # cap the memory used on each device
)
```

The `hf_device_map` attribute allows you to access and view the `device_map`.
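
Continuing from the example above, the mapping can be inspected directly; the output shown in the comment is illustrative and depends on the model and available GPUs.

```py
# Inspect how components were placed across devices.
print(pipeline.hf_device_map)
# {"transformer": 0, "text_encoder": 0, "vae": 0}  (illustrative output)
```
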
[`DiffusionPipeline`] is flexible and accommodates loading different models or schedulers. You can experiment with different schedulers to optimize for generation speed or quality, and you can replace models with more performant ones.

The example below uses a more stable VAE version.

```py
import torch
from diffusers import DiffusionPipeline, AutoModel

# Illustrative completion of the truncated snippet: the VAE and pipeline checkpoints are assumptions.
vae = AutoModel.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", vae=vae, torch_dtype=torch.float16)
```