Commit 9f81c89

Merge branch 'main' into AnandK27-lr-prev-timestep-patch

2 parents 8a03162 + cef4f65

File tree: 9 files changed (+685 −40 lines)

docs/source/en/_toctree.yml

Lines changed: 4 additions & 0 deletions

@@ -75,6 +75,8 @@
     title: Outpainting
   title: Advanced inference
 - sections:
+  - local: using-diffusers/cogvideox
+    title: CogVideoX
   - local: using-diffusers/sdxl
     title: Stable Diffusion XL
   - local: using-diffusers/sdxl_turbo
@@ -129,6 +131,8 @@
     title: T2I-Adapters
   - local: training/instructpix2pix
     title: InstructPix2Pix
+  - local: training/cogvideox
+    title: CogVideoX
   title: Models
 - isExpanded: false
   sections:

docs/source/en/training/cogvideox.md

Lines changed: 291 additions & 0 deletions (large diff not rendered by default)

docs/source/en/using-diffusers/cogvideox.md

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# CogVideoX

CogVideoX is a text-to-video generation model focused on creating coherent videos that stay aligned with a prompt. It achieves this with several techniques:

- a 3D variational autoencoder (VAE) that compresses videos spatially and temporally, improving the compression rate and video fidelity (see the sketch after this list).
- an expert transformer block to help align text and video, and a 3D full attention module to capture and create spatially and temporally accurate videos.
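The 3D VAE is also available as a standalone component. As a rough sketch of the compression it performs (this snippet is illustrative and not from the original guide; the exact latent shape depends on the checkpoint), you can encode a dummy video and compare shapes:

```py
import torch
from diffusers import AutoencoderKLCogVideoX

vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16
).to("cuda")
vae.enable_tiling()  # keeps the dummy encode below from exhausting memory

# [batch, channels, frames, height, width]
video = torch.randn(1, 3, 49, 480, 720, dtype=torch.float16, device="cuda")
with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()

# the latents are much smaller than the input along frames, height, and width
print(latents.shape)
```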
## Load model checkpoints

Model weights may be stored in separate subfolders on the Hub or locally, in which case you should use the [`~DiffusionPipeline.from_pretrained`] method.

```py
import torch
from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline

# text-to-video checkpoint
pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-2b",
    torch_dtype=torch.float16
)

# image-to-video checkpoint
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",
    torch_dtype=torch.bfloat16
)
```
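The same call also accepts a local path. A minimal sketch (the local directory name here is hypothetical): save the pipeline to disk, where each component lands in its own subfolder, then reload it with the identical `from_pretrained` call.

```py
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
pipe.save_pretrained("./cogvideox-2b")  # writes each component to its own subfolder

# reload entirely from the local folder
pipe = CogVideoXPipeline.from_pretrained("./cogvideox-2b", torch_dtype=torch.float16)
```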
## Text-to-Video

For text-to-video, pass a text prompt. By default, CogVideoX generates a 720x480 video for the best results.

```py
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

prompt = "An elderly gentleman, with a serene expression, sits at the water's edge, a steaming cup of tea by his side. He is engrossed in his artwork, brush in hand, as he renders an oil painting on a canvas that's propped up against a small, weathered table. The sea breeze whispers through his silver hair, gently billowing his loose-fitting white shirt, while the salty air adds an intangible element to his masterpiece in progress. The scene is one of tranquility and inspiration, with the artist's canvas capturing the vibrant hues of the setting sun reflecting off the tranquil sea."

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16
)

# reduce GPU memory usage by offloading submodules to the CPU and tiling the VAE
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

video = pipe(
    prompt=prompt,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]

export_to_video(video, "output.mp4", fps=8)
```
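The `enable_model_cpu_offload` and `enable_tiling` calls above trade speed for a much smaller GPU footprint. To see what they buy you, a quick check with standard PyTorch memory stats (a sketch; numbers vary by GPU and checkpoint, and `pipe`/`prompt` are reused from above):

```py
import torch

torch.cuda.reset_peak_memory_stats()
video = pipe(prompt=prompt, num_inference_steps=50, num_frames=49).frames[0]
print(f"peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GiB")
```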
<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_out.gif" alt="generated video of an elderly artist painting by the sea"/>
</div>
## Image-to-Video

You'll use the [THUDM/CogVideoX-5b-I2V](https://huggingface.co/THUDM/CogVideoX-5b-I2V) checkpoint for this guide.

```py
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

prompt = "A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion."
# load the initial rocket image shown below
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"
)

pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",
    torch_dtype=torch.bfloat16
)

# offload submodules to the CPU and tile/slice the VAE to reduce memory usage
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

video = pipe(
    prompt=prompt,
    image=image,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]

export_to_video(video, "output.mp4", fps=8)
```
<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_outrocket.gif"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
  </div>
</div>

docs/source/en/using-diffusers/text-img2vid.md

Lines changed: 53 additions & 0 deletions

@@ -23,6 +23,59 @@ This guide will show you how to generate videos, how to configure video model pa

[Stable Video Diffusion (SVD)](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid), [I2VGen-XL](https://huggingface.co/ali-vilab/i2vgen-xl/), [AnimateDiff](https://huggingface.co/guoyww/animatediff), and [ModelScopeT2V](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) are popular models used for video diffusion. Each model is distinct. For example, AnimateDiff inserts a motion modeling module into a frozen text-to-image model to generate personalized animated images, whereas SVD is entirely pretrained from scratch with a three-stage training process to generate short, high-quality videos.

[CogVideoX](https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce) is another popular video generation model. The model is a multidimensional transformer that integrates text, time, and space. It employs full attention in the attention module and includes an expert block at the layer level to spatially align text and video.

### CogVideoX

[CogVideoX](../api/pipelines/cogvideox) uses a 3D Variational Autoencoder (VAE) to compress videos along the spatial and temporal dimensions.

Begin by loading the [`CogVideoXPipeline`] and passing an initial text or image to generate a video.

<Tip>

CogVideoX is available for image-to-video and text-to-video. [THUDM/CogVideoX-5b-I2V](https://huggingface.co/THUDM/CogVideoX-5b-I2V) uses the [`CogVideoXImageToVideoPipeline`] for image-to-video. [THUDM/CogVideoX-5b](https://huggingface.co/THUDM/CogVideoX-5b) and [THUDM/CogVideoX-2b](https://huggingface.co/THUDM/CogVideoX-2b) are available for text-to-video with the [`CogVideoXPipeline`].

</Tip>

```py
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

prompt = "A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion."
# load the initial rocket image shown below
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"
)

pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",
    torch_dtype=torch.bfloat16
)

# offload submodules to the CPU and tile/slice the VAE to reduce memory usage
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

video = pipe(
    prompt=prompt,
    image=image,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]

export_to_video(video, "output.mp4", fps=8)
```

<div class="flex gap-4">
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
  </div>
  <div>
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_outrocket.gif"/>
    <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
  </div>
</div>
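The Tip above also lists text-to-video checkpoints. For reference (this snippet is not part of the diff), a minimal text-to-video counterpart would swap in the [`CogVideoXPipeline`]:

```py
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-2b", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

video = pipe(
    prompt="A rocket lifts off from a snowy launch pad at dawn.",  # example prompt, not from the docs
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
).frames[0]
export_to_video(video, "output.mp4", fps=8)
```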

### Stable Video Diffusion

[SVD](../api/pipelines/svd) is based on the Stable Diffusion 2.1 model and is trained on images, then low-resolution videos, and finally a smaller dataset of high-resolution videos. This model generates a short 2-4 second video from an initial image. You can learn more about the model, like micro-conditioning, in the [Stable Video Diffusion](../using-diffusers/svd) guide.

src/diffusers/loaders/lora_pipeline.py

Lines changed: 84 additions & 20 deletions
@@ -1358,14 +1358,30 @@ def load_lora_into_transformer(
         inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name, **peft_kwargs)
         incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)

+        warn_msg = ""
         if incompatible_keys is not None:
-            # check only for unexpected keys
+            # Check only for unexpected keys.
             unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
             if unexpected_keys:
-                logger.warning(
-                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
-                    f" {unexpected_keys}. "
-                )
+                lora_unexpected_keys = [k for k in unexpected_keys if "lora_" in k and adapter_name in k]
+                if lora_unexpected_keys:
+                    warn_msg = (
+                        f"Loading adapter weights from state_dict led to unexpected keys found in the model:"
+                        f" {', '.join(lora_unexpected_keys)}. "
+                    )
+
+            # Filter missing keys specific to the current adapter.
+            missing_keys = getattr(incompatible_keys, "missing_keys", None)
+            if missing_keys:
+                lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k]
+                if lora_missing_keys:
+                    warn_msg += (
+                        f"Loading adapter weights from state_dict led to missing keys in the model:"
+                        f" {', '.join(lora_missing_keys)}."
+                    )
+
+        if warn_msg:
+            logger.warning(warn_msg)

         # Offload back.
         if is_model_cpu_offload:
@@ -1932,14 +1948,30 @@ def load_lora_into_transformer(
         inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name, **peft_kwargs)
         incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)

+        warn_msg = ""
         if incompatible_keys is not None:
-            # check only for unexpected keys
+            # Check only for unexpected keys.
             unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
             if unexpected_keys:
-                logger.warning(
-                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
-                    f" {unexpected_keys}. "
-                )
+                lora_unexpected_keys = [k for k in unexpected_keys if "lora_" in k and adapter_name in k]
+                if lora_unexpected_keys:
+                    warn_msg = (
+                        f"Loading adapter weights from state_dict led to unexpected keys found in the model:"
+                        f" {', '.join(lora_unexpected_keys)}. "
+                    )
+
+            # Filter missing keys specific to the current adapter.
+            missing_keys = getattr(incompatible_keys, "missing_keys", None)
+            if missing_keys:
+                lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k]
+                if lora_missing_keys:
+                    warn_msg += (
+                        f"Loading adapter weights from state_dict led to missing keys in the model:"
+                        f" {', '.join(lora_missing_keys)}."
+                    )
+
+        if warn_msg:
+            logger.warning(warn_msg)

         # Offload back.
         if is_model_cpu_offload:
@@ -2279,14 +2311,30 @@ def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, ada
         inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name)
         incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name)

+        warn_msg = ""
         if incompatible_keys is not None:
-            # check only for unexpected keys
+            # Check only for unexpected keys.
             unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
             if unexpected_keys:
-                logger.warning(
-                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
-                    f" {unexpected_keys}. "
-                )
+                lora_unexpected_keys = [k for k in unexpected_keys if "lora_" in k and adapter_name in k]
+                if lora_unexpected_keys:
+                    warn_msg = (
+                        f"Loading adapter weights from state_dict led to unexpected keys found in the model:"
+                        f" {', '.join(lora_unexpected_keys)}. "
+                    )
+
+            # Filter missing keys specific to the current adapter.
+            missing_keys = getattr(incompatible_keys, "missing_keys", None)
+            if missing_keys:
+                lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k]
+                if lora_missing_keys:
+                    warn_msg += (
+                        f"Loading adapter weights from state_dict led to missing keys in the model:"
+                        f" {', '.join(lora_missing_keys)}."
+                    )
+
+        if warn_msg:
+            logger.warning(warn_msg)

         # Offload back.
         if is_model_cpu_offload:
@@ -2717,14 +2765,30 @@ def load_lora_into_transformer(
         inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name, **peft_kwargs)
         incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)

+        warn_msg = ""
         if incompatible_keys is not None:
-            # check only for unexpected keys
+            # Check only for unexpected keys.
             unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
             if unexpected_keys:
-                logger.warning(
-                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
-                    f" {unexpected_keys}. "
-                )
+                lora_unexpected_keys = [k for k in unexpected_keys if "lora_" in k and adapter_name in k]
+                if lora_unexpected_keys:
+                    warn_msg = (
+                        f"Loading adapter weights from state_dict led to unexpected keys found in the model:"
+                        f" {', '.join(lora_unexpected_keys)}. "
+                    )
+
+            # Filter missing keys specific to the current adapter.
+            missing_keys = getattr(incompatible_keys, "missing_keys", None)
+            if missing_keys:
+                lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k]
+                if lora_missing_keys:
+                    warn_msg += (
+                        f"Loading adapter weights from state_dict led to missing keys in the model:"
+                        f" {', '.join(lora_missing_keys)}."
+                    )
+
+        if warn_msg:
+            logger.warning(warn_msg)

         # Offload back.
         if is_model_cpu_offload:
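All four hunks above apply the same change to the different `load_lora_into_transformer` variants: instead of dumping every unexpected key, the loader now reports only LoRA keys scoped to the current adapter, also surfaces missing keys, and emits one consolidated warning. In isolation, the new filtering behaves like this (a standalone sketch with dummy keys, not diffusers code):

```py
from types import SimpleNamespace

adapter_name = "default_0"
# dummy object mimicking the incompatible_keys result from set_peft_model_state_dict
incompatible_keys = SimpleNamespace(
    unexpected_keys=["foo.lora_A.default_0.weight", "bar.weight"],  # second key is not adapter-specific
    missing_keys=["baz.lora_B.default_0.weight"],
)

warn_msg = ""
lora_unexpected = [k for k in incompatible_keys.unexpected_keys if "lora_" in k and adapter_name in k]
if lora_unexpected:
    warn_msg = f"unexpected keys: {', '.join(lora_unexpected)}. "
lora_missing = [k for k in incompatible_keys.missing_keys if "lora_" in k and adapter_name in k]
if lora_missing:
    warn_msg += f"missing keys: {', '.join(lora_missing)}."
if warn_msg:
    print(warn_msg)  # one consolidated message; non-LoRA keys like "bar.weight" are filtered out
```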

src/diffusers/loaders/unet.py

Lines changed: 21 additions & 5 deletions
@@ -354,14 +354,30 @@ def _process_lora(
         inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs)
         incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs)

+        warn_msg = ""
         if incompatible_keys is not None:
-            # check only for unexpected keys
+            # Check only for unexpected keys.
             unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
             if unexpected_keys:
-                logger.warning(
-                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
-                    f" {unexpected_keys}. "
-                )
+                lora_unexpected_keys = [k for k in unexpected_keys if "lora_" in k and adapter_name in k]
+                if lora_unexpected_keys:
+                    warn_msg = (
+                        f"Loading adapter weights from state_dict led to unexpected keys found in the model:"
+                        f" {', '.join(lora_unexpected_keys)}. "
+                    )
+
+            # Filter missing keys specific to the current adapter.
+            missing_keys = getattr(incompatible_keys, "missing_keys", None)
+            if missing_keys:
+                lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k]
+                if lora_missing_keys:
+                    warn_msg += (
+                        f"Loading adapter weights from state_dict led to missing keys in the model:"
+                        f" {', '.join(lora_missing_keys)}."
+                    )
+
+        if warn_msg:
+            logger.warning(warn_msg)

         return is_model_cpu_offload, is_sequential_cpu_offload
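From the user's side, the change only affects what gets logged when a LoRA checkpoint doesn't line up with the model. A hypothetical trigger (the repo id and adapter name are placeholders):

```py
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# if the checkpoint has extra or missing LoRA keys for this adapter, a single
# warning now lists only the keys scoped to "style" instead of every unexpected key
pipe.load_lora_weights("user/some-style-lora", adapter_name="style")
```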
