Commit 34d0821

Merge branch 'fixes-issue-11002' of https://github.com/ishan-modi/diffusers into fixes-issue-11002
2 parents 8336e44 + 1a81b20

File tree

693 files changed: +21923 −5612 lines changed

.github/workflows/nightly_tests.yml

Lines changed: 55 additions & 0 deletions
@@ -142,6 +142,7 @@ jobs:
           HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
           # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
           CUBLAS_WORKSPACE_CONFIG: :16:8
+          RUN_COMPILE: yes
         run: |
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
@@ -525,6 +526,60 @@ jobs:
           pip install slack_sdk tabulate
           python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

+  run_nightly_pipeline_level_quantization_tests:
+    name: Torch quantization nightly tests
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+    runs-on:
+      group: aws-g6e-xlarge-plus
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "20gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install -U bitsandbytes optimum_quanto
+          python -m uv pip install pytest-reportlog
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Pipeline-level quantization tests on GPU
+        env:
+          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+          BIG_GPU_MEMORY: 40
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            --make-reports=tests_pipeline_level_quant_torch_cuda \
+            --report-log=tests_pipeline_level_quant_torch_cuda.log \
+            tests/quantization/test_pipeline_level_quantization.py
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: |
+          cat reports/tests_pipeline_level_quant_torch_cuda_stats.txt
+          cat reports/tests_pipeline_level_quant_torch_cuda_failures_short.txt
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch_cuda_pipeline_level_quant_reports
+          path: reports
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
   # M1 runner currently not well supported
   # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
   # run_nightly_tests_apple_m1:
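
For context, the suite this new job runs nightly exercises diffusers' pipeline-level quantization, where selected pipeline components are quantized at load time. Below is a minimal sketch of the kind of usage covered, assuming the `PipelineQuantizationConfig` API and an illustrative FLUX checkpoint; the authoritative cases live in `tests/quantization/test_pipeline_level_quantization.py`.

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

# Quantize only the heavy components with bitsandbytes 4-bit at load time.
quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer", "text_encoder_2"],
)

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # illustrative checkpoint, not from this commit
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe("photo of a cat", num_inference_steps=28).images[0]
```

The `BIG_GPU_MEMORY: 40` setting is read by the test suite's big-GPU gating decorators (memory in GB), which matches the `aws-g6e-xlarge-plus` runner this job requests.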

.github/workflows/pr_tests.yml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ on:
       - "tests/**.py"
       - ".github/**.yml"
       - "utils/**.py"
+      - "setup.py"
   push:
     branches:
       - ci-*

docs/source/en/_toctree.yml

Lines changed: 26 additions & 19 deletions
@@ -17,12 +17,6 @@
     title: AutoPipeline
   - local: tutorials/basic_training
     title: Train a diffusion model
-  - local: tutorials/using_peft_for_inference
-    title: Load LoRAs for inference
-  - local: tutorials/fast_diffusion
-    title: Accelerate inference of text-to-image diffusion models
-  - local: tutorials/inference_with_big_models
-    title: Working with big models
   title: Tutorials
 - sections:
   - local: using-diffusers/loading
@@ -33,11 +27,24 @@
     title: Load schedulers and models
   - local: using-diffusers/other-formats
     title: Model files and layouts
-  - local: using-diffusers/loading_adapters
-    title: Load adapters
   - local: using-diffusers/push_to_hub
     title: Push files to the Hub
   title: Load pipelines and adapters
+- sections:
+  - local: tutorials/using_peft_for_inference
+    title: LoRA
+  - local: using-diffusers/ip_adapter
+    title: IP-Adapter
+  - local: using-diffusers/controlnet
+    title: ControlNet
+  - local: using-diffusers/t2i_adapter
+    title: T2I-Adapter
+  - local: using-diffusers/dreambooth
+    title: DreamBooth
+  - local: using-diffusers/textual_inversion_inference
+    title: Textual inversion
+  title: Adapters
+  isExpanded: false
 - sections:
   - local: using-diffusers/unconditional_image_generation
     title: Unconditional image generation
@@ -59,8 +66,6 @@
     title: Create a server
   - local: training/distributed_inference
     title: Distributed inference
-  - local: using-diffusers/merge_loras
-    title: Merge LoRAs
   - local: using-diffusers/scheduler_features
     title: Scheduler features
   - local: using-diffusers/callback
@@ -97,20 +102,12 @@
     title: SDXL Turbo
   - local: using-diffusers/kandinsky
     title: Kandinsky
-  - local: using-diffusers/ip_adapter
-    title: IP-Adapter
   - local: using-diffusers/omnigen
     title: OmniGen
   - local: using-diffusers/pag
     title: PAG
-  - local: using-diffusers/controlnet
-    title: ControlNet
-  - local: using-diffusers/t2i_adapter
-    title: T2I-Adapter
   - local: using-diffusers/inference_with_lcm
     title: Latent Consistency Model
-  - local: using-diffusers/textual_inversion_inference
-    title: Textual inversion
   - local: using-diffusers/shap-e
     title: Shap-E
   - local: using-diffusers/diffedit
@@ -180,7 +177,7 @@
   title: Quantization Methods
 - sections:
   - local: optimization/fp16
-    title: Speed up inference
+    title: Accelerate inference
   - local: optimization/memory
     title: Reduce memory usage
   - local: optimization/torch2.0
@@ -296,6 +293,8 @@
     title: CogView4Transformer2DModel
   - local: api/models/consisid_transformer3d
     title: ConsisIDTransformer3DModel
+  - local: api/models/cosmos_transformer3d
+    title: CosmosTransformer3DModel
   - local: api/models/dit_transformer2d
     title: DiTTransformer2DModel
   - local: api/models/easyanimate_transformer3d
@@ -364,6 +363,8 @@
     title: AutoencoderKLAllegro
   - local: api/models/autoencoderkl_cogvideox
     title: AutoencoderKLCogVideoX
+  - local: api/models/autoencoderkl_cosmos
+    title: AutoencoderKLCosmos
   - local: api/models/autoencoder_kl_hunyuan_video
     title: AutoencoderKLHunyuanVideo
   - local: api/models/autoencoderkl_ltx_video
@@ -434,6 +435,8 @@
     title: ControlNet-XS with Stable Diffusion XL
   - local: api/pipelines/controlnet_union
     title: ControlNetUnion
+  - local: api/pipelines/cosmos
+    title: Cosmos
   - local: api/pipelines/dance_diffusion
     title: Dance Diffusion
   - local: api/pipelines/ddim
@@ -452,6 +455,8 @@
     title: Flux
   - local: api/pipelines/control_flux_inpaint
     title: FluxControlInpaint
+  - local: api/pipelines/framepack
+    title: Framepack
   - local: api/pipelines/hidream
     title: HiDream-I1
   - local: api/pipelines/hunyuandit
@@ -568,6 +573,8 @@
     title: UniDiffuser
   - local: api/pipelines/value_guided_sampling
     title: Value-guided sampling
+  - local: api/pipelines/visualcloze
+    title: VisualCloze
   - local: api/pipelines/wan
     title: Wan
   - local: api/pipelines/wuerstchen

docs/source/en/api/models/asymmetricautoencoderkl.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # AsymmetricAutoencoderKL

-Improved larger variational autoencoder (VAE) model with KL loss for inpainting task: [Designing a Better Asymmetric VQGAN for StableDiffusion](https://arxiv.org/abs/2306.04632) by Zixin Zhu, Xuelu Feng, Dongdong Chen, Jianmin Bao, Le Wang, Yinpeng Chen, Lu Yuan, Gang Hua.
+Improved larger variational autoencoder (VAE) model with KL loss for inpainting task: [Designing a Better Asymmetric VQGAN for StableDiffusion](https://huggingface.co/papers/2306.04632) by Zixin Zhu, Xuelu Feng, Dongdong Chen, Jianmin Bao, Le Wang, Yinpeng Chen, Lu Yuan, Gang Hua.

 The abstract from the paper is:
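
Since this VAE targets inpainting, its typical use is as a drop-in replacement for the stock VAE of an inpainting pipeline. A minimal sketch, where the checkpoint names are assumptions taken from common diffusers doc examples rather than from this commit:

```python
import torch
from diffusers import AsymmetricAutoencoderKL, StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",  # illustrative inpainting checkpoint
    torch_dtype=torch.float16,
).to("cuda")

# Swap in the asymmetric VAE, whose heavier decoder improves inpainting fidelity.
pipe.vae = AsymmetricAutoencoderKL.from_pretrained(
    "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
).to("cuda")
```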

docs/source/en/api/models/autoencoderkl.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # AutoencoderKL

-The variational autoencoder (VAE) model with KL loss was introduced in [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114v11) by Diederik P. Kingma and Max Welling. The model is used in 🤗 Diffusers to encode images into latents and to decode latent representations into images.
+The variational autoencoder (VAE) model with KL loss was introduced in [Auto-Encoding Variational Bayes](https://huggingface.co/papers/1312.6114v11) by Diederik P. Kingma and Max Welling. The model is used in 🤗 Diffusers to encode images into latents and to decode latent representations into images.

 The abstract from the paper is:
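
The encode/decode roundtrip described above looks roughly like this; a minimal sketch, assuming the `stabilityai/sd-vae-ft-mse` checkpoint and a stand-in image tensor:

```python
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained(
    "stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16
).to("cuda")

# Stand-in for a real image batch, already normalized to [-1, 1].
image = torch.randn(1, 3, 512, 512, dtype=torch.float16, device="cuda")

with torch.no_grad():
    # encode() returns a posterior distribution; sample and scale to get latents.
    latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
    # decode() maps (unscaled) latents back to image space.
    reconstruction = vae.decode(latents / vae.config.scaling_factor).sample
```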

docs/source/en/api/models/autoencoderkl_cosmos.md

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLCosmos
+
+[Cosmos Tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer).
+
+Supported models:
+- [nvidia/Cosmos-1.0-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-CV8x8x8)
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLCosmos
+
+vae = AutoencoderKLCosmos.from_pretrained("nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae")
+```
+
+## AutoencoderKLCosmos
+
+[[autodoc]] AutoencoderKLCosmos
+- decode
+- encode
+- all
+
+## AutoencoderKLOutput
+
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
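
For orientation, a hedged sketch of an encode/decode roundtrip with this video tokenizer. The 5D (batch, channels, frames, height, width) layout and the clip size are assumptions, not taken from the commit:

```python
import torch
from diffusers import AutoencoderKLCosmos

vae = AutoencoderKLCosmos.from_pretrained(
    "nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae"
)

# Stand-in for a short video clip: (batch, channels, frames, height, width).
video = torch.randn(1, 3, 9, 256, 256)

with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()  # 8x8x8 causal compression
    reconstruction = vae.decode(latents).sample
```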

docs/source/en/api/models/consisid_transformer3d.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->

 # ConsisIDTransformer3DModel

-A Diffusion Transformer model for 3D data from [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) was introduced in [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://arxiv.org/pdf/2411.17440) by Peking University & University of Rochester & etc.
+A Diffusion Transformer model for 3D data from [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) was introduced in [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://huggingface.co/papers/2411.17440) by Peking University & University of Rochester & etc.

 The model can be loaded with the following code snippet.
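
The loading snippet itself falls outside the hunk shown here; for reference, a hedged reconstruction assuming the `BestWishYsh/ConsisID-preview` checkpoint used elsewhere in the diffusers docs:

```python
import torch
from diffusers import ConsisIDTransformer3DModel

transformer = ConsisIDTransformer3DModel.from_pretrained(
    "BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16
)
```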

docs/source/en/api/models/controlnet_hunyuandit.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # HunyuanDiT2DControlNetModel

-HunyuanDiT2DControlNetModel is an implementation of ControlNet for [Hunyuan-DiT](https://arxiv.org/abs/2405.08748).
+HunyuanDiT2DControlNetModel is an implementation of ControlNet for [Hunyuan-DiT](https://huggingface.co/papers/2405.08748).

 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
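
For orientation, a sketch of how this ControlNet plugs into the HunyuanDiT pipeline; the checkpoint names are assumptions based on the diffusers docs, not part of this commit:

```python
import torch
from diffusers import HunyuanDiT2DControlNetModel, HunyuanDiTControlNetPipeline

# Load the canny-conditioned ControlNet, then attach it to the base pipeline.
controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-v1.1-ControlNet-Diffusers-Canny", torch_dtype=torch.float16
)
pipe = HunyuanDiTControlNetPipeline.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
```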

docs/source/en/api/models/controlnet_sparsectrl.md

Lines changed: 2 additions & 2 deletions
@@ -11,11 +11,11 @@ specific language governing permissions and limitations under the License. -->

 # SparseControlNetModel

-SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://arxiv.org/abs/2307.04725).
+SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://huggingface.co/papers/2307.04725).

 ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.

-The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
+The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://huggingface.co/papers/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.

 The abstract from the paper is:
docs/source/en/api/models/cosmos_transformer3d.md

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# CosmosTransformer3DModel
+
+A Diffusion Transformer model for 3D video-like data was introduced in [Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import CosmosTransformer3DModel
+
+transformer = CosmosTransformer3DModel.from_pretrained("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## CosmosTransformer3DModel
+
+[[autodoc]] CosmosTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
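
Note that the loading snippet above also needs `import torch` for the `torch.bfloat16` dtype. In practice the transformer is driven through the Cosmos pipeline rather than called directly; a rough, self-contained sketch, assuming the `CosmosTextToWorldPipeline` class added alongside this model:

```python
import torch
from diffusers import CosmosTextToWorldPipeline
from diffusers.utils import export_to_video

# Assumed pipeline class and checkpoint layout; see api/pipelines/cosmos in this commit.
pipe = CosmosTextToWorldPipeline.from_pretrained(
    "nvidia/Cosmos-1.0-Diffusion-7B-Text2World", torch_dtype=torch.bfloat16
).to("cuda")

video = pipe(prompt="A robot arm stacks wooden blocks on a workbench.").frames[0]
export_to_video(video, "output.mp4", fps=30)
```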
