`docs/source/en/api/loaders/lora.md` (15 additions, 0 deletions)
```diff
@@ -20,6 +20,9 @@ LoRA is a fast and lightweight training method that inserts and trains a significantly smaller number of parameters instead of all the model parameters.
 - [`FluxLoraLoaderMixin`] provides similar functions for [Flux](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux).
 - [`CogVideoXLoraLoaderMixin`] provides similar functions for [CogVideoX](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox).
 - [`Mochi1LoraLoaderMixin`] provides similar functions for [Mochi](https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi).
+- [`LTXVideoLoraLoaderMixin`] provides similar functions for [LTX-Video](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).
+- [`SanaLoraLoaderMixin`] provides similar functions for [Sana](https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana).
+- [`HunyuanVideoLoraLoaderMixin`] provides similar functions for [HunyuanVideo](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuan_video).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, and unload LoRAs, and more.
@@ -53,6 +56,18 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
```
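To make the role of these mixins concrete, here is a minimal sketch (not part of the diff) of the load/fuse/unload surface they provide on a pipeline that inherits them; the repository id and weight file name are placeholders:

```python
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Load LoRA weights (placeholder repo id and weight file name)
pipe.load_lora_weights("some-user/some-lora", weight_name="pytorch_lora_weights.safetensors")

# Merge the LoRA into the base weights for inference, then undo the merge
pipe.fuse_lora()
pipe.unfuse_lora()

# Remove the LoRA entirely
pipe.unload_lora_weights()
```

The video pipelines listed above expose the same surface through their respective mixins.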
New file — CogView4 pipeline documentation:

<!--Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-->

# CogView4

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

</Tip>

This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase and weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
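As a usage sketch: the pipeline follows the standard `DiffusionPipeline` pattern, but the `THUDM/CogView4-6B` checkpoint id and the sampling parameters below are assumptions; check [hf.co/THUDM](https://huggingface.co/THUDM) for the released weights.

```python
import torch
from diffusers import CogView4Pipeline

# Checkpoint id is assumed; see hf.co/THUDM for the released weights
pipe = CogView4Pipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    prompt="A detailed painting of a lighthouse on a stormy coast",
    guidance_scale=3.5,
    num_inference_steps=50,
).images[0]
image.save("cogview4.png")
```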
`docs/source/en/optimization/memory.md` (40 additions, 0 deletions)
@@ -158,6 +158,46 @@ In order to properly offload models after they're called, it is required to run the entire pipeline and models are called in the pipeline's expected order.

</Tip>

## Group offloading

Group offloading is the middle ground between sequential and model offloading. It works by offloading groups of internal layers (either `torch.nn.ModuleList` or `torch.nn.Sequential`), which uses less memory than model-level offloading. It is also faster than sequential-level offloading because the number of device synchronizations is reduced.

To enable group offloading, call the [`~ModelMixin.enable_group_offload`] method on the model if it is a Diffusers model implementation. For any other model implementation, use [`~hooks.group_offloading.apply_group_offloading`]:

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.hooks import apply_group_offloading

onload_device = torch.device("cuda")
offload_device = torch.device("cpu")

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

# Diffusers model implementations expose `enable_group_offload` directly.
# The offload settings below are example values; tune `offload_type` and
# `num_blocks_per_group` for your model.
pipe.transformer.enable_group_offload(
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="leaf_level",
    use_stream=True,
)

# Any other model implementation goes through `apply_group_offloading`
apply_group_offloading(
    pipe.text_encoder,
    onload_device=onload_device,
    offload_type="block_level",
    num_blocks_per_group=2,
)
```

Group offloading (for CUDA devices with support for asynchronous data transfer streams) overlaps data transfer and computation to reduce the overall execution time compared to sequential offloading. This is enabled using layer prefetching with CUDA streams: the next layer to be executed is loaded onto the accelerator device while the current layer is being executed, which slightly increases memory requirements. Group offloading also supports leaf-level offloading (equivalent to sequential CPU offloading), which can be made much faster when using streams.
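Stream-based prefetching is just a flag on the same calls. Continuing the example above, a small sketch (argument values are illustrative):

```python
# Leaf-level offloading with stream-based prefetching (CUDA only): the next
# layer's weights are copied to the GPU while the current layer executes.
apply_group_offloading(
    pipe.vae,
    onload_device=torch.device("cuda"),
    offload_type="leaf_level",
    use_stream=True,
)
```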
## FP8 layerwise weight-casting
PyTorch supports `torch.float8_e4m3fn` and `torch.float8_e5m2` as weight storage dtypes, but they can't be used for computation in many different tensor operations due to unimplemented kernel support. However, you can use these dtypes to store model weights in fp8 precision and upcast them on-the-fly when the layers are used in the forward pass. This is known as layerwise weight-casting.
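A minimal sketch of layerwise weight-casting, assuming the [`~ModelMixin.enable_layerwise_casting`] helper and reusing the CogVideoX pipeline from the group-offloading example; the dtype choices are example values:

```python
# Store the transformer weights in fp8 and upcast them to bf16 on the fly
# during each layer's forward pass (storage/compute dtypes are examples)
pipe.transformer.enable_layerwise_casting(
    storage_dtype=torch.float8_e4m3fn,
    compute_dtype=torch.bfloat16,
)
```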