Commit 9b411e5

Merge branch 'main' into layerwise-upcasting
2 parents b366b22 + 0c1e63b commit 9b411e5

File tree: 64 files changed (+3472 / -229 lines)


docs/source/en/api/pipelines/cogvideox.md

Lines changed: 20 additions & 23 deletions
````diff
@@ -15,9 +15,7 @@
 
 # CogVideoX
 
-<!-- TODO: update paper with ArXiv link when ready. -->
-
-[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) from Tsinghua University & ZhipuAI.
+[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072) from Tsinghua University & ZhipuAI, by Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Xiaotao Gu, Yuxuan Zhang, Weihan Wang, Yean Cheng, Ting Liu, Bin Xu, Yuxiao Dong, Jie Tang.
 
 The abstract from the paper is:
 
@@ -43,43 +41,42 @@ from diffusers import CogVideoXPipeline
 from diffusers.utils import export_to_video
 
 pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b").to("cuda")
-prompt = (
-    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
-    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
-    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
-    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
-    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
-    "atmosphere of this unique musical performance."
-)
-video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-export_to_video(video, "output.mp4", fps=8)
 ```
 
-Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`:
+Then change the memory layout of the pipelines `transformer` component to `torch.channels_last`:
 
 ```python
-pipeline.transformer.to(memory_format=torch.channels_last)
-pipeline.vae.to(memory_format=torch.channels_last)
+pipe.transformer.to(memory_format=torch.channels_last)
 ```
 
 Finally, compile the components and run inference:
 
 ```python
-pipeline.transformer = torch.compile(pipeline.transformer)
-pipeline.vae.decode = torch.compile(pipeline.vae.decode)
+pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
 
-# CogVideoX works very well with long and well-described prompts
+# CogVideoX works well with long and well-described prompts
 prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
 ```
 
-The [benchmark](TODO: link) results on an 80GB A100 machine are:
+The [benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:
 
 ```
-Without torch.compile(): Average inference time: TODO seconds.
-With torch.compile(): Average inference time: TODO seconds.
+Without torch.compile(): Average inference time: 96.89 seconds.
+With torch.compile(): Average inference time: 76.27 seconds.
 ```
 
+### Memory optimization
+
+CogVideoX requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
+
+- `pipe.enable_model_cpu_offload()`:
+  - Without enabling cpu offloading, memory usage is `33 GB`
+  - With enabling cpu offloading, memory usage is `19 GB`
+- `pipe.vae.enable_tiling()`:
+  - With enabling cpu offloading and tiling, memory usage is `11 GB`
+- `pipe.vae.enable_slicing()`
+
 ## CogVideoXPipeline
 
 [[autodoc]] CogVideoXPipeline
````
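The updated CogVideoX page above keeps the speed advice (`channels_last` plus `torch.compile`) and the new memory advice in separate sections. A minimal sketch of how the two could sit in one script, not part of the commit: it assumes the `THUDM/CogVideoX-2b` checkpoint in fp16, and it uses `pipe` throughout, whereas the added doc line mixes `pipe` and `pipeline` in the `torch.compile` call.

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# fp16 weights are an assumption here, not something the diff above prescribes.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")

# Speed path from the doc: channels_last memory layout, then compile the transformer.
pipe.transformer.to(memory_format=torch.channels_last)
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)

# Memory path from the new "Memory optimization" section, for smaller GPUs
# (typically used instead of moving the whole pipeline to CUDA above):
# pipe.enable_model_cpu_offload()   # ~33 GB -> ~19 GB per the doc
# pipe.vae.enable_tiling()          # with offloading: ~11 GB per the doc
# pipe.vae.enable_slicing()

# CogVideoX works well with long and well-described prompts; shortened here.
prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest."
video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
export_to_video(video, "output.mp4", fps=8)
```

Per the doc's own benchmark, the compile path cuts average inference time from 96.89 s to 76.27 s on an 80GB A100, while the offload and tiling path is what brings memory usage down to about 11 GB.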

docs/source/en/api/pipelines/controlnet_sd3.md

Lines changed: 16 additions & 2 deletions
````diff
@@ -1,4 +1,4 @@
-<!--Copyright 2023 The HuggingFace Team and The InstantX Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -22,7 +22,16 @@ The abstract from the paper is:
 
 *We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
 
-This code is implemented by [The InstantX Team](https://huggingface.co/InstantX). You can find pre-trained checkpoints for SD3-ControlNet on [The InstantX Team](https://huggingface.co/InstantX) Hub profile.
+This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
+
+
+| ControlNet type | Developer | Link |
+| -------- | ---------- | ---- |
+| Canny | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Canny) |
+| Pose | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Pose) |
+| Tile | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Tile) |
+| Inpainting | [The AlimamaCreative Team](https://huggingface.co/alimama-creative) | [link](https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting) |
+
 
 <Tip>
 
@@ -35,5 +44,10 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 - all
 - __call__
 
+## StableDiffusion3ControlNetInpaintingPipeline
+[[autodoc]] pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet_inpainting.StableDiffusion3ControlNetInpaintingPipeline
+- all
+- __call__
+
 ## StableDiffusion3PipelineOutput
 [[autodoc]] pipelines.stable_diffusion_3.pipeline_output.StableDiffusion3PipelineOutput
````
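For the checkpoints listed in the new table, a minimal usage sketch with the Canny variant follows. The `SD3ControlNetModel` and `StableDiffusion3ControlNetPipeline` class names, the `stabilityai/stable-diffusion-3-medium-diffusers` base checkpoint, and the edge-map URL are assumptions for illustration, not part of this diff.

```python
import torch
from diffusers import SD3ControlNetModel, StableDiffusion3ControlNetPipeline
from diffusers.utils import load_image

# Canny checkpoint from the table above; the SD3 base checkpoint is an assumption.
controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to("cuda")

# A pre-computed Canny edge map conditions the generation (hypothetical URL).
control_image = load_image("https://example.com/canny_edges.png")
image = pipe(
    prompt="a photo of a cat sitting on a windowsill",
    control_image=control_image,
    controlnet_conditioning_scale=0.7,
    num_inference_steps=28,
).images[0]
image.save("sd3_controlnet_canny.png")
```

The inpainting variant added in this commit (`StableDiffusion3ControlNetInpaintingPipeline`) follows the same pattern but additionally takes the mask describing the region to repaint.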

docs/source/en/training/distributed_inference.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -48,7 +48,7 @@ accelerate launch run_distributed.py --num_processes=2
 
 <Tip>
 
-To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
+Refer to this minimal example [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for running inference across multiple GPUs. To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
 
 </Tip>
 
@@ -108,4 +108,4 @@ torchrun run_distributed.py --nproc_per_node=2
 ```
 
 > [!TIP]
-> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
+> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
````
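A minimal sketch of the `device_map` usage that the tip above points to, assuming two or more visible GPUs; the checkpoint is only an example and is not taken from this diff.

```python
import torch
from diffusers import DiffusionPipeline

# "balanced" is the placement strategy described in the Device placement guide:
# model-level components (text encoders, transformer/UNet, VAE) are spread
# across the visible GPUs rather than loaded onto a single device.
pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # example checkpoint
    device_map="balanced",
    torch_dtype=torch.float16,
)
image = pipeline("a photo of an astronaut riding a horse").images[0]
image.save("astronaut.png")
```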

examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -71,7 +71,7 @@
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 logger = get_logger(__name__)
 
````

examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -79,7 +79,7 @@
 import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 logger = get_logger(__name__)
 
````

examples/community/marigold_depth_estimation.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -43,7 +43,7 @@
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 
 class MarigoldDepthOutput(BaseOutput):
````

examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -73,7 +73,7 @@
 import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 logger = get_logger(__name__)
 
````

examples/consistency_distillation/train_lcm_distill_lora_sdxl.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -66,7 +66,7 @@
 import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 logger = get_logger(__name__)
 
````

examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -79,7 +79,7 @@
 import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 logger = get_logger(__name__)
 
````

examples/consistency_distillation/train_lcm_distill_sd_wds.py

Lines changed: 1 addition & 1 deletion
````diff
@@ -72,7 +72,7 @@
 import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.30.0.dev0")
+check_min_version("0.31.0.dev0")
 
 logger = get_logger(__name__)
 
````
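The example-script changes above are all the same one-line version bump. Purely illustrative, a sketch of the guard these scripts share, with the new minimum pinned by this commit:

```python
# Illustrative only: the version guard used at the top of the example scripts,
# bumped from 0.30.0.dev0 to 0.31.0.dev0 by this commit.
from diffusers.utils import check_min_version

# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.31.0.dev0")
```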

0 commit comments
