Commit 9bdb6a1
Merge branch 'main' into dreambooth-lora-flux-exploration
2 parents: 5dfd685 + d9029f2
File tree: 10 files changed, +1163 −9 lines changed


docs/source/en/api/pipelines/cogvideox.md

Lines changed: 10 additions & 0 deletions

@@ -36,6 +36,10 @@ There are two models available that can be used with the text-to-video and video-to-video CogVideoX pipelines:
 There is one model available that can be used with the image-to-video CogVideoX pipeline:
 - [`THUDM/CogVideoX-5b-I2V`](https://huggingface.co/THUDM/CogVideoX-5b-I2V): The recommended dtype for running this model is `bf16`.
 
+There are two models that support pose controllable generation (by the [Alibaba-PAI](https://huggingface.co/alibaba-pai) team):
+- [`alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose): The recommended dtype for running this model is `bf16`.
+- [`alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose): The recommended dtype for running this model is `bf16`.
+
 ## Inference
 
 Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.

@@ -118,6 +122,12 @@ It is also worth noting that torchao quantization is fully compatible with [torch.compile]
   - all
   - __call__
 
+## CogVideoXFunControlPipeline
+
+[[autodoc]] CogVideoXFunControlPipeline
+  - all
+  - __call__
+
 ## CogVideoXPipelineOutput
 
 [[autodoc]] pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput
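For orientation, a minimal usage sketch of the new control pipeline. This is a hedged sketch, not the documented API: the `control_video` argument, the `load_video`/`export_to_video` helpers, and the file paths are assumptions based on diffusers conventions and the model cards linked above; the autodoc entry added in this file is the authoritative reference.

```python
# Hedged sketch: pose-controlled generation with CogVideoXFunControlPipeline.
# Assumes the conditioning frames are passed via `control_video`; check the
# CogVideoXFunControlPipeline autodoc above for the actual signature.
import torch
from diffusers import CogVideoXFunControlPipeline
from diffusers.utils import export_to_video, load_video

pipe = CogVideoXFunControlPipeline.from_pretrained(
    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
    torch_dtype=torch.bfloat16,  # recommended dtype per the docs above
).to("cuda")

# A pose video (e.g. rendered OpenPose skeletons) drives the generated motion.
control_video = load_video("pose_sequence.mp4")  # placeholder path

video = pipe(
    prompt="an astronaut dancing on the surface of the moon",
    control_video=control_video,
).frames[0]
export_to_video(video, "output.mp4", fps=8)
```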

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -256,6 +256,7 @@
             "BlipDiffusionControlNetPipeline",
             "BlipDiffusionPipeline",
             "CLIPImageProjection",
+            "CogVideoXFunControlPipeline",
             "CogVideoXImageToVideoPipeline",
             "CogVideoXPipeline",
             "CogVideoXVideoToVideoPipeline",

@@ -711,6 +712,7 @@
         AudioLDMPipeline,
         AuraFlowPipeline,
         CLIPImageProjection,
+        CogVideoXFunControlPipeline,
         CogVideoXImageToVideoPipeline,
         CogVideoXPipeline,
         CogVideoXVideoToVideoPipeline,
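Once both registration points above are in place, the new pipeline resolves from the package root:

```python
# Quick check that the export added above is reachable from the top level
# (requires a diffusers build that includes this commit).
from diffusers import CogVideoXFunControlPipeline
```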

src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py

Lines changed: 5 additions & 3 deletions

@@ -1182,7 +1182,8 @@ def _encode(self, x: torch.Tensor) -> torch.Tensor:
 
         frame_batch_size = self.num_sample_frames_batch_size
         # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
-        num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+        # As the extra single frame is handled inside the loop, it is not required to round up here.
+        num_batches = max(num_frames // frame_batch_size, 1)
         conv_cache = None
         enc = []
 

@@ -1330,7 +1331,8 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
             row = []
             for j in range(0, width, overlap_width):
                 # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
-                num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+                # As the extra single frame is handled inside the loop, it is not required to round up here.
+                num_batches = max(num_frames // frame_batch_size, 1)
                 conv_cache = None
                 time = []
 

@@ -1409,7 +1411,7 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
         for i in range(0, height, overlap_height):
             row = []
             for j in range(0, width, overlap_width):
-                num_batches = num_frames // frame_batch_size
+                num_batches = max(num_frames // frame_batch_size, 1)
                 conv_cache = None
                 time = []
 
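To see why the conditional can be dropped, here is a standalone check of the changed expression (a sketch of just the arithmetic, not the autoencoder code). For the supported frame counts `1`, `k * B`, and `k * B + 1`, `max(num_frames // B, 1)` agrees with the old `_encode` conditional, and it additionally guards the `tiled_decode` path, which previously used bare floor division:

```python
# Standalone check of the batching expression changed in the diff above.
# B stands for frame_batch_size; supported counts are 1, k*B, or k*B + 1.
def old_encode_batches(num_frames: int, B: int) -> int:
    return num_frames // B if num_frames > 1 else 1

def new_batches(num_frames: int, B: int) -> int:
    return max(num_frames // B, 1)

B = 4
for k in range(1, 5):
    for n in (1, k * B, k * B + 1):
        assert new_batches(n, B) == old_encode_batches(n, B)

# tiled_decode previously computed num_frames // B with no guard, so a frame
# count below B yielded zero batches and an empty loop; the new form still
# schedules one batch, and the leftover frame is handled inside the loop.
assert 3 // B == 0
assert new_batches(3, B) == 1
```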

src/diffusers/pipelines/__init__.py

Lines changed: 7 additions & 1 deletion

@@ -144,6 +144,7 @@
         "CogVideoXPipeline",
         "CogVideoXImageToVideoPipeline",
         "CogVideoXVideoToVideoPipeline",
+        "CogVideoXFunControlPipeline",
     ]
     _import_structure["cogview3"] = ["CogView3PlusPipeline"]
     _import_structure["controlnet"].extend(

@@ -470,7 +471,12 @@
         )
         from .aura_flow import AuraFlowPipeline
         from .blip_diffusion import BlipDiffusionPipeline
-        from .cogvideo import CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline
+        from .cogvideo import (
+            CogVideoXFunControlPipeline,
+            CogVideoXImageToVideoPipeline,
+            CogVideoXPipeline,
+            CogVideoXVideoToVideoPipeline,
+        )
         from .cogview3 import CogView3PlusPipeline
         from .controlnet import (
             BlipDiffusionControlNetPipeline,
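Both hunks follow diffusers' lazy-import convention: `_import_structure` maps each submodule to the public names it provides so the heavy import can be deferred, while the parenthesized imports only run for static type checkers and slow-import mode. A stripped-down sketch of the mechanism (illustrative only; the real implementation is `_LazyModule` in `diffusers.utils`):

```python
# Minimal sketch of the lazy-import pattern these __init__.py edits extend.
# Illustrative only; diffusers' actual machinery is utils._LazyModule.
import importlib

# Submodule -> public names, mirroring the _import_structure entries above.
_import_structure = {
    "cogvideo": [
        "CogVideoXPipeline",
        "CogVideoXImageToVideoPipeline",
        "CogVideoXVideoToVideoPipeline",
        "CogVideoXFunControlPipeline",
    ],
}

def __getattr__(name: str):
    # PEP 562 module __getattr__: runs on attribute misses, so a submodule is
    # imported only the first time one of its pipelines is actually requested.
    for submodule, names in _import_structure.items():
        if name in names:
            module = importlib.import_module(f".{submodule}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

This is why the new pipeline is registered in two places: omitting the `_import_structure` entry makes the lazy path raise `AttributeError`, while omitting the direct import breaks type checkers.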

src/diffusers/pipelines/cogvideo/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,7 @@
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
     _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
     _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 

@@ -35,6 +36,7 @@
     from ...utils.dummy_torch_and_transformers_objects import *
 else:
     from .pipeline_cogvideox import CogVideoXPipeline
+    from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
     from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
     from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
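The `else:` branches above only execute when torch and transformers are importable; otherwise the module registers dummy objects that postpone the failure until first use. A simplified sketch of what such a dummy looks like (the real ones are generated in `diffusers.utils.dummy_torch_and_transformers_objects` via a shared metaclass, not hand-written like this):

```python
# Simplified stand-in for a generated dummy object: importing the name always
# succeeds, but any attempt to use it raises a readable dependency error.
class CogVideoXFunControlPipeline:
    def __init__(self, *args, **kwargs):
        raise ImportError(
            "CogVideoXFunControlPipeline requires `torch` and `transformers`; "
            "install them to use this pipeline."
        )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        raise ImportError(
            "CogVideoXFunControlPipeline requires `torch` and `transformers`; "
            "install them to use this pipeline."
        )
```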
