Commit 9bdb6a1
Merge branch 'main' into dreambooth-lora-flux-exploration
2 parents: 5dfd685 + d9029f2
File tree: 10 files changed, +1163 −9 lines changed


docs/source/en/api/pipelines/cogvideox.md

Lines changed: 10 additions & 0 deletions

@@ -36,6 +36,10 @@ There are two models available that can be used with the text-to-video and video-to-video CogVideoX pipelines:
 There is one model available that can be used with the image-to-video CogVideoX pipeline:
 - [`THUDM/CogVideoX-5b-I2V`](https://huggingface.co/THUDM/CogVideoX-5b-I2V): The recommended dtype for running this model is `bf16`.
 
+There are two models that support pose controllable generation (by the [Alibaba-PAI](https://huggingface.co/alibaba-pai) team):
+- [`alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose): The recommended dtype for running this model is `bf16`.
+- [`alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose): The recommended dtype for running this model is `bf16`.
+
 ## Inference
 
 Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.

@@ -118,6 +122,12 @@ It is also worth noting that torchao quantization is fully compatible with [torch.compile]
   - all
   - __call__
 
+## CogVideoXFunControlPipeline
+
+[[autodoc]] CogVideoXFunControlPipeline
+  - all
+  - __call__
+
 ## CogVideoXPipelineOutput
 
 [[autodoc]] pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput
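For orientation, a minimal usage sketch of the new control pipeline. This is a hedged sketch, not the documented API: the `control_video` argument, the `load_video`/`export_to_video` helpers, and the file paths are assumptions based on diffusers conventions and the model cards linked above; the autodoc entry added in this file is the authoritative reference.

```python
# Hedged sketch: pose-controlled generation with CogVideoXFunControlPipeline.
# Assumes the conditioning frames are passed via `control_video`; check the
# CogVideoXFunControlPipeline autodoc above for the actual signature.
import torch
from diffusers import CogVideoXFunControlPipeline
from diffusers.utils import export_to_video, load_video

pipe = CogVideoXFunControlPipeline.from_pretrained(
    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
    torch_dtype=torch.bfloat16,  # recommended dtype per the docs above
).to("cuda")

# A pose video (e.g. rendered OpenPose skeletons) drives the generated motion.
control_video = load_video("pose_sequence.mp4")  # placeholder path

video = pipe(
    prompt="an astronaut dancing on the surface of the moon",
    control_video=control_video,
).frames[0]
export_to_video(video, "output.mp4", fps=8)
```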

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -256,6 +256,7 @@
             "BlipDiffusionControlNetPipeline",
             "BlipDiffusionPipeline",
             "CLIPImageProjection",
+            "CogVideoXFunControlPipeline",
             "CogVideoXImageToVideoPipeline",
             "CogVideoXPipeline",
             "CogVideoXVideoToVideoPipeline",

@@ -711,6 +712,7 @@
         AudioLDMPipeline,
         AuraFlowPipeline,
         CLIPImageProjection,
+        CogVideoXFunControlPipeline,
         CogVideoXImageToVideoPipeline,
         CogVideoXPipeline,
         CogVideoXVideoToVideoPipeline,
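Once both registration points above are in place, the new pipeline resolves from the package root:

```python
# Quick check that the export added above is reachable from the top level
# (requires a diffusers build that includes this commit).
from diffusers import CogVideoXFunControlPipeline
```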

src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py

Lines changed: 5 additions & 3 deletions

@@ -1182,7 +1182,8 @@ def _encode(self, x: torch.Tensor) -> torch.Tensor:
 
         frame_batch_size = self.num_sample_frames_batch_size
         # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
-        num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+        # As the extra single frame is handled inside the loop, it is not required to round up here.
+        num_batches = max(num_frames // frame_batch_size, 1)
         conv_cache = None
         enc = []
 

@@ -1330,7 +1331,8 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
             row = []
             for j in range(0, width, overlap_width):
                 # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
-                num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+                # As the extra single frame is handled inside the loop, it is not required to round up here.
+                num_batches = max(num_frames // frame_batch_size, 1)
                 conv_cache = None
                 time = []
 

@@ -1409,7 +1411,7 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
         for i in range(0, height, overlap_height):
             row = []
             for j in range(0, width, overlap_width):
-                num_batches = num_frames // frame_batch_size
+                num_batches = max(num_frames // frame_batch_size, 1)
                 conv_cache = None
                 time = []
 
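To see why the conditional can be dropped, here is a standalone check of the changed expression (a sketch of just the arithmetic, not the autoencoder code). For the supported frame counts `1`, `k * B`, and `k * B + 1`, `max(num_frames // B, 1)` agrees with the old `_encode` conditional, and it additionally guards the `tiled_decode` path, which previously used bare floor division:

```python
# Standalone check of the batching expression changed in the diff above.
# B stands for frame_batch_size; supported counts are 1, k*B, or k*B + 1.
def old_encode_batches(num_frames: int, B: int) -> int:
    return num_frames // B if num_frames > 1 else 1

def new_batches(num_frames: int, B: int) -> int:
    return max(num_frames // B, 1)

B = 4
for k in range(1, 5):
    for n in (1, k * B, k * B + 1):
        assert new_batches(n, B) == old_encode_batches(n, B)

# tiled_decode previously computed num_frames // B with no guard, so a frame
# count below B yielded zero batches and an empty loop; the new form still
# schedules one batch, and the leftover frame is handled inside the loop.
assert 3 // B == 0
assert new_batches(3, B) == 1
```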

src/diffusers/pipelines/__init__.py

Lines changed: 7 additions & 1 deletion

@@ -144,6 +144,7 @@
         "CogVideoXPipeline",
         "CogVideoXImageToVideoPipeline",
         "CogVideoXVideoToVideoPipeline",
+        "CogVideoXFunControlPipeline",
     ]
     _import_structure["cogview3"] = ["CogView3PlusPipeline"]
     _import_structure["controlnet"].extend(

@@ -470,7 +471,12 @@
         )
         from .aura_flow import AuraFlowPipeline
         from .blip_diffusion import BlipDiffusionPipeline
-        from .cogvideo import CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline
+        from .cogvideo import (
+            CogVideoXFunControlPipeline,
+            CogVideoXImageToVideoPipeline,
+            CogVideoXPipeline,
+            CogVideoXVideoToVideoPipeline,
+        )
         from .cogview3 import CogView3PlusPipeline
         from .controlnet import (
             BlipDiffusionControlNetPipeline,
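Both hunks follow diffusers' lazy-import convention: `_import_structure` maps each submodule to the public names it provides so the heavy import can be deferred, while the parenthesized imports only run for static type checkers and slow-import mode. A stripped-down sketch of the mechanism (illustrative only; the real implementation is `_LazyModule` in `diffusers.utils`):

```python
# Minimal sketch of the lazy-import pattern these __init__.py edits extend.
# Illustrative only; diffusers' actual machinery is utils._LazyModule.
import importlib

# Submodule -> public names, mirroring the _import_structure entries above.
_import_structure = {
    "cogvideo": [
        "CogVideoXPipeline",
        "CogVideoXImageToVideoPipeline",
        "CogVideoXVideoToVideoPipeline",
        "CogVideoXFunControlPipeline",
    ],
}

def __getattr__(name: str):
    # PEP 562 module __getattr__: runs on attribute misses, so a submodule is
    # imported only the first time one of its pipelines is actually requested.
    for submodule, names in _import_structure.items():
        if name in names:
            module = importlib.import_module(f".{submodule}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

This is why the new pipeline is registered in two places: omitting the `_import_structure` entry makes the lazy path raise `AttributeError`, while omitting the direct import breaks type checkers.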

src/diffusers/pipelines/cogvideo/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,7 @@
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
     _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
     _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 

@@ -35,6 +36,7 @@
     from ...utils.dummy_torch_and_transformers_objects import *
 else:
     from .pipeline_cogvideox import CogVideoXPipeline
+    from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
     from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
     from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
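The `else:` branches above only execute when torch and transformers are importable; otherwise the module registers dummy objects that postpone the failure until first use. A simplified sketch of what such a dummy looks like (the real ones are generated in `diffusers.utils.dummy_torch_and_transformers_objects` via a shared metaclass, not hand-written like this):

```python
# Simplified stand-in for a generated dummy object: importing the name always
# succeeds, but any attempt to use it raises a readable dependency error.
class CogVideoXFunControlPipeline:
    def __init__(self, *args, **kwargs):
        raise ImportError(
            "CogVideoXFunControlPipeline requires `torch` and `transformers`; "
            "install them to use this pipeline."
        )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        raise ImportError(
            "CogVideoXFunControlPipeline requires `torch` and `transformers`; "
            "install them to use this pipeline."
        )
```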
