
Commit 53fc1d0

Merge branch 'main' into fixes-issue-11005

2 parents 536b185 + a4f9c3c

357 files changed: +17,271 -6,487 lines


.github/workflows/nightly_tests.yml

Lines changed: 1 addition & 1 deletion
````diff
@@ -417,7 +417,7 @@ jobs:
           additional_deps: ["peft"]
         - backend: "gguf"
           test_location: "gguf"
-          additional_deps: []
+          additional_deps: ["peft"]
         - backend: "torchao"
           test_location: "torchao"
           additional_deps: []
````
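
For context, the gguf job now installs peft because the GGUF tests exercise loading LoRA adapters on top of GGUF-quantized checkpoints. A minimal sketch of that combination, assuming current diffusers GGUF support (the checkpoint URL and LoRA repo below are illustrative, not taken from this commit):

```python
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

# Load a GGUF-quantized transformer; GGUFQuantizationConfig sets the compute dtype
# used to dequantize weights on the fly.
transformer = FluxTransformer2DModel.from_single_file(
    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
)

# Loading LoRA weights on top of the quantized transformer is the step that needs peft.
pipe.load_lora_weights("ByteDance/Hyper-SD", weight_name="Hyper-FLUX.1-dev-8steps-lora.safetensors")
```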

docker/diffusers-onnxruntime-cpu/Dockerfile

Lines changed: 3 additions & 3 deletions
````diff
@@ -28,9 +28,9 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
     python3 -m uv pip install --no-cache-dir \
-        torch==2.1.2 \
-        torchvision==0.16.2 \
-        torchaudio==2.1.2 \
+        torch \
+        torchvision \
+        torchaudio \
         onnxruntime \
         --extra-index-url https://download.pytorch.org/whl/cpu && \
     python3 -m uv pip install --no-cache-dir \
````

docs/source/en/_toctree.yml

Lines changed: 43 additions & 33 deletions
````diff
@@ -175,7 +175,7 @@
     title: gguf
   - local: quantization/torchao
     title: torchao
-  - local: quantization/quanto
+  - local: quantization/quanto
     title: quanto
   title: Quantization Methods
 - sections:
@@ -265,19 +265,23 @@
     sections:
     - local: api/models/overview
      title: Overview
+    - local: api/models/auto_model
+      title: AutoModel
     - sections:
       - local: api/models/controlnet
         title: ControlNetModel
+      - local: api/models/controlnet_union
+        title: ControlNetUnionModel
       - local: api/models/controlnet_flux
         title: FluxControlNetModel
       - local: api/models/controlnet_hunyuandit
         title: HunyuanDiT2DControlNetModel
+      - local: api/models/controlnet_sana
+        title: SanaControlNetModel
       - local: api/models/controlnet_sd3
         title: SD3ControlNetModel
       - local: api/models/controlnet_sparsectrl
         title: SparseControlNetModel
-      - local: api/models/controlnet_union
-        title: ControlNetUnionModel
       title: ControlNets
     - sections:
       - local: api/models/allegro_transformer3d
@@ -286,30 +290,32 @@
         title: AuraFlowTransformer2DModel
       - local: api/models/cogvideox_transformer3d
         title: CogVideoXTransformer3DModel
-      - local: api/models/consisid_transformer3d
-        title: ConsisIDTransformer3DModel
       - local: api/models/cogview3plus_transformer2d
         title: CogView3PlusTransformer2DModel
       - local: api/models/cogview4_transformer2d
         title: CogView4Transformer2DModel
+      - local: api/models/consisid_transformer3d
+        title: ConsisIDTransformer3DModel
       - local: api/models/dit_transformer2d
         title: DiTTransformer2DModel
       - local: api/models/easyanimate_transformer3d
         title: EasyAnimateTransformer3DModel
       - local: api/models/flux_transformer
         title: FluxTransformer2DModel
+      - local: api/models/hidream_image_transformer
+        title: HiDreamImageTransformer2DModel
       - local: api/models/hunyuan_transformer2d
         title: HunyuanDiT2DModel
       - local: api/models/hunyuan_video_transformer_3d
         title: HunyuanVideoTransformer3DModel
       - local: api/models/latte_transformer3d
         title: LatteTransformer3DModel
-      - local: api/models/lumina_nextdit2d
-        title: LuminaNextDiT2DModel
-      - local: api/models/lumina2_transformer2d
-        title: Lumina2Transformer2DModel
       - local: api/models/ltx_video_transformer3d
         title: LTXVideoTransformer3DModel
+      - local: api/models/lumina2_transformer2d
+        title: Lumina2Transformer2DModel
+      - local: api/models/lumina_nextdit2d
+        title: LuminaNextDiT2DModel
       - local: api/models/mochi_transformer3d
         title: MochiTransformer3DModel
       - local: api/models/omnigen_transformer
@@ -318,10 +324,10 @@
         title: PixArtTransformer2DModel
       - local: api/models/prior_transformer
         title: PriorTransformer
-      - local: api/models/sd3_transformer2d
-        title: SD3Transformer2DModel
       - local: api/models/sana_transformer2d
         title: SanaTransformer2DModel
+      - local: api/models/sd3_transformer2d
+        title: SD3Transformer2DModel
       - local: api/models/stable_audio_transformer
         title: StableAudioDiTModel
       - local: api/models/transformer2d
@@ -336,10 +342,10 @@
         title: StableCascadeUNet
       - local: api/models/unet
         title: UNet1DModel
-      - local: api/models/unet2d
-        title: UNet2DModel
       - local: api/models/unet2d-cond
         title: UNet2DConditionModel
+      - local: api/models/unet2d
+        title: UNet2DModel
       - local: api/models/unet3d-cond
         title: UNet3DConditionModel
       - local: api/models/unet-motion
@@ -348,6 +354,10 @@
         title: UViT2DModel
       title: UNets
     - sections:
+      - local: api/models/asymmetricautoencoderkl
+        title: AsymmetricAutoencoderKL
+      - local: api/models/autoencoder_dc
+        title: AutoencoderDC
       - local: api/models/autoencoderkl
         title: AutoencoderKL
       - local: api/models/autoencoderkl_allegro
@@ -364,10 +374,6 @@
         title: AutoencoderKLMochi
       - local: api/models/autoencoder_kl_wan
         title: AutoencoderKLWan
-      - local: api/models/asymmetricautoencoderkl
-        title: AsymmetricAutoencoderKL
-      - local: api/models/autoencoder_dc
-        title: AutoencoderDC
       - local: api/models/consistency_decoder_vae
         title: ConsistencyDecoderVAE
       - local: api/models/autoencoder_oobleck
@@ -420,6 +426,8 @@
         title: ControlNet with Stable Diffusion 3
       - local: api/pipelines/controlnet_sdxl
         title: ControlNet with Stable Diffusion XL
+      - local: api/pipelines/controlnet_sana
+        title: ControlNet-Sana
       - local: api/pipelines/controlnetxs
         title: ControlNet-XS
       - local: api/pipelines/controlnetxs_sdxl
@@ -444,6 +452,8 @@
         title: Flux
       - local: api/pipelines/control_flux_inpaint
         title: FluxControlInpaint
+      - local: api/pipelines/hidream
+        title: HiDream-I1
       - local: api/pipelines/hunyuandit
         title: Hunyuan-DiT
       - local: api/pipelines/hunyuan_video
@@ -511,40 +521,40 @@
     - sections:
       - local: api/pipelines/stable_diffusion/overview
         title: Overview
-      - local: api/pipelines/stable_diffusion/text2img
-        title: Text-to-image
+      - local: api/pipelines/stable_diffusion/depth2img
+        title: Depth-to-image
+      - local: api/pipelines/stable_diffusion/gligen
+        title: GLIGEN (Grounded Language-to-Image Generation)
+      - local: api/pipelines/stable_diffusion/image_variation
+        title: Image variation
       - local: api/pipelines/stable_diffusion/img2img
         title: Image-to-image
       - local: api/pipelines/stable_diffusion/svd
         title: Image-to-video
       - local: api/pipelines/stable_diffusion/inpaint
         title: Inpainting
-      - local: api/pipelines/stable_diffusion/depth2img
-        title: Depth-to-image
-      - local: api/pipelines/stable_diffusion/image_variation
-        title: Image variation
+      - local: api/pipelines/stable_diffusion/k_diffusion
+        title: K-Diffusion
+      - local: api/pipelines/stable_diffusion/latent_upscale
+        title: Latent upscaler
+      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
+        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
       - local: api/pipelines/stable_diffusion/stable_diffusion_safe
         title: Safe Stable Diffusion
+      - local: api/pipelines/stable_diffusion/sdxl_turbo
+        title: SDXL Turbo
       - local: api/pipelines/stable_diffusion/stable_diffusion_2
         title: Stable Diffusion 2
       - local: api/pipelines/stable_diffusion/stable_diffusion_3
         title: Stable Diffusion 3
       - local: api/pipelines/stable_diffusion/stable_diffusion_xl
         title: Stable Diffusion XL
-      - local: api/pipelines/stable_diffusion/sdxl_turbo
-        title: SDXL Turbo
-      - local: api/pipelines/stable_diffusion/latent_upscale
-        title: Latent upscaler
       - local: api/pipelines/stable_diffusion/upscale
         title: Super-resolution
-      - local: api/pipelines/stable_diffusion/k_diffusion
-        title: K-Diffusion
-      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
-        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
       - local: api/pipelines/stable_diffusion/adapter
         title: T2I-Adapter
-      - local: api/pipelines/stable_diffusion/gligen
-        title: GLIGEN (Grounded Language-to-Image Generation)
+      - local: api/pipelines/stable_diffusion/text2img
+        title: Text-to-image
       title: Stable Diffusion
     - local: api/pipelines/stable_unclip
       title: Stable unCLIP
````

docs/source/en/api/loaders/lora.md

Lines changed: 19 additions & 0 deletions
````diff
@@ -20,11 +20,15 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`FluxLoraLoaderMixin`] provides similar functions for [Flux](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux).
 - [`CogVideoXLoraLoaderMixin`] provides similar functions for [CogVideoX](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox).
 - [`Mochi1LoraLoaderMixin`] provides similar functions for [Mochi](https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi).
+- [`AuraFlowLoraLoaderMixin`] provides similar functions for [AuraFlow](https://huggingface.co/fal/AuraFlow).
 - [`LTXVideoLoraLoaderMixin`] provides similar functions for [LTX-Video](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).
 - [`SanaLoraLoaderMixin`] provides similar functions for [Sana](https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana).
 - [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
 - [`Lumina2LoraLoaderMixin`] provides similar functions for [Lumina2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/lumina2).
+- [`WanLoraLoaderMixin`] provides similar functions for [Wan](https://huggingface.co/docs/diffusers/main/en/api/pipelines/wan).
+- [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
+- [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream)
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
 
 <Tip>
@@ -56,6 +60,9 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 ## Mochi1LoraLoaderMixin
 
 [[autodoc]] loaders.lora_pipeline.Mochi1LoraLoaderMixin
+## AuraFlowLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.AuraFlowLoraLoaderMixin
 
 ## LTXVideoLoraLoaderMixin
 
@@ -73,10 +80,22 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 
 [[autodoc]] loaders.lora_pipeline.Lumina2LoraLoaderMixin
 
+## CogView4LoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.CogView4LoraLoaderMixin
+
+## WanLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
+
 ## AmusedLoraLoaderMixin
 
 [[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
 
+## HiDreamImageLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.HiDreamImageLoraLoaderMixin
+
 ## LoraBaseMixin
 
 [[autodoc]] loaders.lora_base.LoraBaseMixin
````
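
All of these mixins surface the same user-facing entry point on their pipelines, `load_lora_weights`, plus the `LoraBaseMixin` utilities. A minimal sketch with one of the newly listed loaders (the adapter repo id is a placeholder, not from this commit):

```python
import torch
from diffusers import CogView4Pipeline

pipe = CogView4Pipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16).to("cuda")

# CogView4Pipeline inherits CogView4LoraLoaderMixin, so the usual LoRA workflow applies.
pipe.load_lora_weights("your-username/your-cogview4-lora", adapter_name="style")  # placeholder repo id
pipe.fuse_lora()  # LoraBaseMixin utility: merges the adapter into the base weights
```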

docs/source/en/api/models/auto_model.md

Lines changed: 29 additions & 0 deletions

````diff
@@ -0,0 +1,29 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AutoModel
+
+`AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. It automatically retrieves the correct model class from the checkpoint's `config.json` file.
+
+```python
+from diffusers import AutoModel, AutoPipelineForText2Image
+
+unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
+pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
+```
+
+## AutoModel
+
+[[autodoc]] AutoModel
+  - all
+  - from_pretrained
````
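
As a further illustration (not part of this commit), dispatch works for any subfolder that carries a `config.json`; for this repo's `vae` subfolder it should resolve to `AutoencoderKL`:

```python
from diffusers import AutoModel

# AutoModel reads the `_class_name` entry in the subfolder's config.json
# and instantiates that class.
vae = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="vae")
print(type(vae).__name__)  # expected: AutoencoderKL
```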

docs/source/en/api/models/autoencoderkl_allegro.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
 ```python
 from diffusers import AutoencoderKLAllegro
 
-vae = AutoencoderKLCogVideoX.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
+vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
 ```
 
 ## AutoencoderKLAllegro
````
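
The corrected snippet still relies on `torch` being imported; a self-contained version of the fixed example would be:

```python
import torch
from diffusers import AutoencoderKLAllegro

vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
```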

docs/source/en/api/models/controlnet_sana.md

Lines changed: 29 additions & 0 deletions

````diff
@@ -0,0 +1,29 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SanaControlNetModel
+
+The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection.
+
+The abstract from the paper is:
+
+*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
+
+This model was contributed by [ishan24](https://huggingface.co/ishan24). ❤️
+The original codebase can be found at [NVlabs/Sana](https://github.com/NVlabs/Sana), and you can find official ControlNet checkpoints on [Efficient-Large-Model's](https://huggingface.co/Efficient-Large-Model) Hub profile.
+
+## SanaControlNetModel
+[[autodoc]] SanaControlNetModel
+
+## SanaControlNetOutput
+[[autodoc]] models.controlnets.controlnet_sana.SanaControlNetOutput
````
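
Loading follows the usual `from_pretrained` pattern; a hedged sketch (the checkpoint id below is a placeholder; pick an official one from Efficient-Large-Model's Hub profile):

```python
import torch
from diffusers import SanaControlNetModel

# Placeholder repo id; substitute an official Sana ControlNet checkpoint.
controlnet = SanaControlNetModel.from_pretrained(
    "Efficient-Large-Model/sana-controlnet-checkpoint", torch_dtype=torch.float16
)
```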

docs/source/en/api/models/hidream_image_transformer.md

Lines changed: 30 additions & 0 deletions

````diff
@@ -0,0 +1,30 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# HiDreamImageTransformer2DModel
+
+A Transformer model for image-like data from [HiDream-I1](https://huggingface.co/HiDream-ai).
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import HiDreamImageTransformer2DModel
+
+transformer = HiDreamImageTransformer2DModel.from_pretrained("HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## HiDreamImageTransformer2DModel
+
+[[autodoc]] HiDreamImageTransformer2DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
````
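
Note the snippet in the new file uses `torch.bfloat16` without importing `torch`; a self-contained version:

```python
import torch
from diffusers import HiDreamImageTransformer2DModel

transformer = HiDreamImageTransformer2DModel.from_pretrained(
    "HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16
)
```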

docs/source/en/api/pipelines/aura_flow.md

Lines changed: 17 additions & 0 deletions
````diff
@@ -89,6 +89,23 @@ image = pipeline(prompt).images[0]
 image.save("auraflow.png")
 ```
 
+## Support for `torch.compile()`
+
+AuraFlow can be compiled with `torch.compile()` to speed up inference latency even for different resolutions. First, install PyTorch nightly following the instructions from [here](https://pytorch.org/). The snippet below shows the changes needed to enable this:
+
+```diff
++ torch.fx.experimental._config.use_duck_shape = False
++ pipeline.transformer = torch.compile(
+      pipeline.transformer, fullgraph=True, dynamic=True
+  )
+```
+
+Setting `use_duck_shape` to `False` instructs the compiler not to reuse the same symbolic variable for input sizes that happen to be equal, so a single compiled graph can serve multiple resolutions. For more details, check out [this comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
+
+This yields speed improvements ranging from 100% at low resolutions to 30% at 1536x1536 resolution.
+
+Thanks to [AstraliteHeart](https://github.com/huggingface/diffusers/pull/11297/), who helped us rewrite the [`AuraFlowTransformer2DModel`] class so that the above works for different resolutions ([PR](https://github.com/huggingface/diffusers/pull/11297/)).
+
 ## AuraFlowPipeline
 
 [[autodoc]] AuraFlowPipeline
````
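
Put together, a self-contained version of the compile setup might look like the sketch below (the model id `fal/AuraFlow` matches the pipeline docs; the prompt is illustrative):

```python
import torch
from diffusers import AuraFlowPipeline

pipeline = AuraFlowPipeline.from_pretrained("fal/AuraFlow", torch_dtype=torch.bfloat16).to("cuda")

# Give equal-sized dimensions distinct symbolic variables so one compiled
# graph can be reused across resolutions.
torch.fx.experimental._config.use_duck_shape = False
pipeline.transformer = torch.compile(pipeline.transformer, fullgraph=True, dynamic=True)

image = pipeline("a watercolor fox in a snowy forest").images[0]
image.save("auraflow_compiled.png")
```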
