Commit a2aa752 ("fixes")
Merge of 2 parents: eb5a8b2 + 7667cfc

64 files changed: 5405 additions & 83 deletions

.github/workflows/push_tests.yml

Lines changed: 2 additions & 1 deletion
@@ -165,7 +165,8 @@ jobs:
 group: gcp-ct5lp-hightpu-8t
 container:
 image: diffusers/diffusers-flax-tpu
-options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache defaults:
+options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache
+defaults:
 run:
 shell: bash
 steps:

docs/source/en/_toctree.yml

Lines changed: 8 additions & 0 deletions
@@ -157,6 +157,8 @@
 title: Getting Started
 - local: quantization/bitsandbytes
 title: bitsandbytes
+- local: quantization/torchao
+  title: torchao
 title: Quantization Methods
 - sections:
 - local: optimization/fp16
@@ -270,6 +272,8 @@
 title: FluxTransformer2DModel
 - local: api/models/hunyuan_transformer2d
 title: HunyuanDiT2DModel
+- local: api/models/hunyuan_video_transformer_3d
+  title: HunyuanVideoTransformer3DModel
 - local: api/models/latte_transformer3d
 title: LatteTransformer3DModel
 - local: api/models/lumina_nextdit2d
@@ -316,6 +320,8 @@
 title: AutoencoderKLAllegro
 - local: api/models/autoencoderkl_cogvideox
 title: AutoencoderKLCogVideoX
+- local: api/models/autoencoder_kl_hunyuan_video
+  title: AutoencoderKLHunyuanVideo
 - local: api/models/autoencoderkl_ltx_video
 title: AutoencoderKLLTXVideo
 - local: api/models/autoencoderkl_mochi
@@ -394,6 +400,8 @@
 title: Flux
 - local: api/pipelines/hunyuandit
 title: Hunyuan-DiT
+- local: api/pipelines/hunyuan_video
+  title: HunyuanVideo
 - local: api/pipelines/i2vgenxl
 title: I2VGen-XL
 - local: api/pipelines/pix2pix

docs/source/en/api/attnprocessor.md

Lines changed: 115 additions & 8 deletions
@@ -15,45 +15,143 @@ specific language governing permissions and limitations under the License.
 An attention processor is a class for applying different types of attention mechanisms.
 
 ## AttnProcessor
+
 [[autodoc]] models.attention_processor.AttnProcessor
 
-## AttnProcessor2_0
 [[autodoc]] models.attention_processor.AttnProcessor2_0
 
+<<<<<<< HEAD
 ## FusedAttnProcessor2_0
 [[autodoc]] models.attention_processor.FusedAttnProcessor2_0
 
 ## XFormersAttnProcessor
 [[autodoc]] models.attention_processor.XFormersAttnProcessor
 
 ## AttnAddedKVProcessor
+=======
+>>>>>>> main
 [[autodoc]] models.attention_processor.AttnAddedKVProcessor
 
-## AttnAddedKVProcessor2_0
 [[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
 
+<<<<<<< HEAD
 ## XFormersAttnAddedKVProcessor
 [[autodoc]] models.attention_processor.XFormersAttnAddedKVProcessor
 
 ## CrossFrameAttnProcessor
 [[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor
+=======
+[[autodoc]] models.attention_processor.AttnProcessorNPU
+>>>>>>> main
+
+[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
+
+## Allegro
+
+[[autodoc]] models.attention_processor.AllegroAttnProcessor2_0
+
+## AuraFlow
+
+[[autodoc]] models.attention_processor.AuraFlowAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.FusedAuraFlowAttnProcessor2_0
+
+## CogVideoX
+
+[[autodoc]] models.attention_processor.CogVideoXAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.FusedCogVideoXAttnProcessor2_0
+
+## CrossFrameAttnProcessor
+
+[[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor
+
+## Custom Diffusion
 
-## CustomDiffusionAttnProcessor
 [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
 
-## CustomDiffusionAttnProcessor2_0
 [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0
 
-## CustomDiffusionXFormersAttnProcessor
 [[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
 
-## FusedAttnProcessor2_0
-[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
+## Flux
+
+[[autodoc]] models.attention_processor.FluxAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.FusedFluxAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.FluxSingleAttnProcessor2_0
+
+## Hunyuan
+
+[[autodoc]] models.attention_processor.HunyuanAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.FusedHunyuanAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGHunyuanAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGCFGHunyuanAttnProcessor2_0
+
+## IdentitySelfAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGIdentitySelfAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGCFGIdentitySelfAttnProcessor2_0
+
+## IP-Adapter
+
+[[autodoc]] models.attention_processor.IPAdapterAttnProcessor
+
+[[autodoc]] models.attention_processor.IPAdapterAttnProcessor2_0
+
+## JointAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.JointAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGJointAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGCFGJointAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.FusedJointAttnProcessor2_0
+
+## LoRA
+
+[[autodoc]] models.attention_processor.LoRAAttnProcessor
+
+[[autodoc]] models.attention_processor.LoRAAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor
+
+[[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor
+
+## Lumina-T2X
+
+[[autodoc]] models.attention_processor.LuminaAttnProcessor2_0
+
+## Mochi
+
+[[autodoc]] models.attention_processor.MochiAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.MochiVaeAttnProcessor2_0
+
+## Sana
+
+[[autodoc]] models.attention_processor.SanaLinearAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.SanaMultiscaleAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGCFGSanaLinearAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.PAGIdentitySanaLinearAttnProcessor2_0
+
+## Stable Audio
+
+[[autodoc]] models.attention_processor.StableAudioAttnProcessor2_0
 
 ## SlicedAttnProcessor
+
 [[autodoc]] models.attention_processor.SlicedAttnProcessor
 
-## SlicedAttnAddedKVProcessor
 [[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor
 
 ## IPAdapterAttnProcessor
@@ -140,3 +238,12 @@ An attention processor is a class for applying different types of attention mech
 
 ## CogVideoXAttnProcessor2_0
 [[autodoc]] models.attention_processor.CogVideoXAttnProcessor2_0
+## XFormersAttnProcessor
+
+[[autodoc]] models.attention_processor.XFormersAttnProcessor
+
+[[autodoc]] models.attention_processor.XFormersAttnAddedKVProcessor
+
+## XLAFlashAttnProcessor2_0
+
+[[autodoc]] models.attention_processor.XLAFlashAttnProcessor2_0
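
The entries above are API reference stubs; as context, here is a minimal sketch (not part of this commit) of how a documented processor is swapped onto a loaded model via `attn_processors`/`set_attn_processor`. The model id and dtype are illustrative.

```python
import torch
from diffusers import UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor2_0

# Load a UNet and inspect which attention processor classes it currently uses.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)
print({proc.__class__.__name__ for proc in unet.attn_processors.values()})

# Swap every attention layer over to the PyTorch 2.0 scaled-dot-product processor.
unet.set_attn_processor(AttnProcessor2_0())
```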

docs/source/en/api/loaders/lora.md

Lines changed: 15 additions & 0 deletions
@@ -17,6 +17,9 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`StableDiffusionLoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model.
 - [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`StableDiffusionLoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model.
 - [`SD3LoraLoaderMixin`] provides similar functions for [Stable Diffusion 3](https://huggingface.co/blog/sd3).
+- [`FluxLoraLoaderMixin`] provides similar functions for [Flux](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux).
+- [`CogVideoXLoraLoaderMixin`] provides similar functions for [CogVideoX](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox).
+- [`Mochi1LoraLoaderMixin`] provides similar functions for [Mochi](https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
 
@@ -38,6 +41,18 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 
 [[autodoc]] loaders.lora_pipeline.SD3LoraLoaderMixin
 
+## FluxLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.FluxLoraLoaderMixin
+
+## CogVideoXLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.CogVideoXLoraLoaderMixin
+
+## Mochi1LoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.Mochi1LoraLoaderMixin
+
 ## AmusedLoraLoaderMixin
 
 [[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
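
For context, a short sketch of one of the newly documented mixins in use: `FluxPipeline` inherits `FluxLoraLoaderMixin`, so LoRA weights can be loaded and unloaded directly on the pipeline. The base model id is an example and the LoRA repository below is hypothetical.

```python
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

# load_lora_weights/unload_lora_weights are provided by FluxLoraLoaderMixin.
pipe.load_lora_weights("some-user/some-flux-lora", adapter_name="example")  # hypothetical LoRA repo
pipe.unload_lora_weights()
```
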
docs/source/en/api/models/autoencoder_kl_hunyuan_video.md

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLHunyuanVideo
+
+The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLHunyuanVideo
+
+vae = AutoencoderKLHunyuanVideo.from_pretrained("tencent/HunyuanVideo", torch_dtype=torch.float16)
+```
+
+## AutoencoderKLHunyuanVideo
+
+[[autodoc]] AutoencoderKLHunyuanVideo
+  - decode
+  - all
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
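
Note that the committed snippet uses `torch.float16` without importing `torch`. A self-contained sketch is below; the `subfolder="vae"` argument is an assumption about the checkpoint layout and may need adjusting.

```python
import torch
from diffusers import AutoencoderKLHunyuanVideo

vae = AutoencoderKLHunyuanVideo.from_pretrained(
    "tencent/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16
)
vae.enable_tiling()  # optional: decode high-resolution video latents tile by tile
```
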
docs/source/en/api/models/hunyuan_video_transformer_3d.md

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# HunyuanVideoTransformer3DModel
+
+A Diffusion Transformer model for 3D video-like data was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import HunyuanVideoTransformer3DModel
+
+transformer = HunyuanVideoTransformer3DModel.from_pretrained("tencent/HunyuanVideo", torch_dtype=torch.bfloat16)
+```
+
+## HunyuanVideoTransformer3DModel
+
+[[autodoc]] HunyuanVideoTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
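
This snippet likewise omits `import torch`. A self-contained sketch that also reports the parameter count (the paper quotes over 13 billion parameters) is below; `subfolder="transformer"` is an assumption about the checkpoint layout.

```python
import torch
from diffusers import HunyuanVideoTransformer3DModel

transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    "tencent/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16
)
print(f"{sum(p.numel() for p in transformer.parameters()) / 1e9:.1f}B parameters")
```
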
docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# HunyuanVideo
+
+[HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent.
+
+*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/Tencent/HunyuanVideo).*
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+Recommendations for inference:
+- Both text encoders should be in `torch.float16`.
+- Transformer should be in `torch.bfloat16`.
+- VAE should be in `torch.float16`.
+- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
+- For smaller resolution images, try lower values of `shift` (between `2.0` to `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution images, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
+- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/Tencent/HunyuanVideo/).
+
+## HunyuanVideoPipeline
+
+[[autodoc]] HunyuanVideoPipeline
+  - all
+  - __call__
+
+## HunyuanVideoPipelineOutput
+
+[[autodoc]] pipelines.hunyuan_video.pipeline_output.HunyuanVideoPipelineOutput
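
Putting the dtype and `num_frames` recommendations above together, a hedged end-to-end sketch might look like the following. The repository id, the `subfolder` layout, and the prompt are illustrative rather than taken from this commit; `export_to_video` is the standard diffusers utility for writing frames.

```python
import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

model_id = "tencent/HunyuanVideo"  # assumed checkpoint location, taken from the model docs above

# Transformer in bfloat16, remaining components in float16, per the recommendations.
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
pipe.vae.enable_tiling()         # keep VAE memory in check for video latents
pipe.enable_model_cpu_offload()  # offload idle components to the CPU

frames = pipe(
    prompt="A cat walks on the grass, realistic style.",
    num_frames=61,               # 4 * k + 1, as recommended
    num_inference_steps=30,
).frames[0]
export_to_video(frames, "output.mp4", fps=15)
```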

docs/source/en/api/pipelines/ltx_video.md

Lines changed: 18 additions & 6 deletions
@@ -31,26 +31,38 @@ import torch
 from diffusers import AutoencoderKLLTXVideo, LTXImageToVideoPipeline, LTXVideoTransformer3DModel
 
 single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
-transformer = LTXVideoTransformer3DModel.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
+transformer = LTXVideoTransformer3DModel.from_single_file(
+    single_file_url, torch_dtype=torch.bfloat16
+)
 vae = AutoencoderKLLTXVideo.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
-pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16)
+pipe = LTXImageToVideoPipeline.from_pretrained(
+    "Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16
+)
 
 # ... inference code ...
 ```
 
-Alternatively, the pipeline can be used to load the weights with [~FromSingleFileMixin.from_single_file`].
+Alternatively, the pipeline can be used to load the weights with [`~FromSingleFileMixin.from_single_file`].
 
 ```python
 import torch
 from diffusers import LTXImageToVideoPipeline
 from transformers import T5EncoderModel, T5Tokenizer
 
 single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
-text_encoder = T5EncoderModel.from_pretrained("Lightricks/LTX-Video", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-tokenizer = T5Tokenizer.from_pretrained("Lightricks/LTX-Video", subfolder="tokenizer", torch_dtype=torch.bfloat16)
-pipe = LTXImageToVideoPipeline.from_single_file(single_file_url, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=torch.bfloat16)
+text_encoder = T5EncoderModel.from_pretrained(
+    "Lightricks/LTX-Video", subfolder="text_encoder", torch_dtype=torch.bfloat16
+)
+tokenizer = T5Tokenizer.from_pretrained(
+    "Lightricks/LTX-Video", subfolder="tokenizer", torch_dtype=torch.bfloat16
+)
+pipe = LTXImageToVideoPipeline.from_single_file(
+    single_file_url, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=torch.bfloat16
+)
 ```
 
+Refer to [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox#memory-optimization) to learn more about optimizing memory consumption.
+
 ## LTXPipeline
 
 [[autodoc]] LTXPipeline
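
The memory-optimization pointer added above maps onto the usual diffusers toggles; a hedged sketch is below. Whether each call is needed depends on available VRAM, and the tiling/slicing helpers are assumed to be available on the LTX VAE as on other video VAEs.

```python
import torch
from diffusers import LTXImageToVideoPipeline

pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)

pipe.enable_model_cpu_offload()  # keep only the active sub-model on the GPU
pipe.vae.enable_tiling()         # decode latents tile by tile to cap peak memory
pipe.vae.enable_slicing()        # decode one batch element at a time
```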
