Skip to content

Commit be80dbf

Browse files
more resolution for cogvideox1.5-5b-i2v
1 parent e254bcb commit be80dbf

File tree

2 files changed

+19
-4
lines changed

2 files changed

+19
-4
lines changed

src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,7 @@ def __call__(
734734
progress_bar.update()
735735

736736
if not output_type == "latent":
737+
breakpoint()
737738
video = self.decode_latents(latents)
738739
video = self.video_processor.postprocess_video(video=video, output_type=output_type)
739740
else:

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -567,8 +567,8 @@ def __call__(
567567
image: PipelineImageInput,
568568
prompt: Optional[Union[str, List[str]]] = None,
569569
negative_prompt: Optional[Union[str, List[str]]] = None,
570-
height: int = 480,
571-
width: int = 720,
570+
height: int = 768,
571+
width: int = 1360,
572572
num_frames: int = 49,
573573
num_inference_steps: int = 50,
574574
timesteps: Optional[List[int]] = None,
@@ -675,7 +675,6 @@ def __call__(
675675
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
676676

677677
num_videos_per_prompt = 1
678-
679678
# 1. Check inputs. Raise error if not correct
680679
self.check_inputs(
681680
image=image,
@@ -726,6 +725,22 @@ def __call__(
726725
self._num_timesteps = len(timesteps)
727726

728727
# 5. Prepare latents
728+
# TODO: Only CogVideoX1.5-5B-I2V can use this method; this needs to change.
729+
def adjust_resolution_to_divisible(image_height, image_width, tgt_height, tgt_width, divisor=16):
    """Clamp an image resolution to a target and round it down to a multiple of `divisor`.

    Each dimension is first capped at its target value (it is never enlarged), then
    floored to the nearest multiple of `divisor`.

    Args:
        image_height (`int`): Height of the input image.
        image_width (`int`): Width of the input image.
        tgt_height (`int`): Maximum allowed height.
        tgt_width (`int`): Maximum allowed width.
        divisor (`int`, defaults to 16): Value both returned dimensions must be divisible by.

    Returns:
        `Tuple[int, int]`: The adjusted `(height, width)`.

    Raises:
        ValueError: If `divisor` is not positive, or if either adjusted dimension would be
            smaller than `divisor` (i.e. would round down to zero or below).
    """
    if divisor <= 0:
        raise ValueError(f"`divisor` must be a positive integer but is {divisor}.")

    # Step 1: never exceed the target dimensions.
    # Step 2: floor to a multiple of the divisor.
    adjusted_height = (min(image_height, tgt_height) // divisor) * divisor
    adjusted_width = (min(image_width, tgt_width) // divisor) * divisor

    # A dimension smaller than the divisor floors to 0, which is not a usable resolution;
    # fail here with a clear message instead of failing later during resizing.
    if adjusted_height <= 0 or adjusted_width <= 0:
        raise ValueError(
            f"Resolution {image_height}x{image_width} (height x width) with target "
            f"{tgt_height}x{tgt_width} is too small: both dimensions must be at least {divisor}."
        )
    return adjusted_height, adjusted_width
740+
741+
image_width, image_height = image.size[-2:]
742+
743+
height, width = adjust_resolution_to_divisible(image_height, image_width, height, width)
729744
image = self.video_processor.preprocess(image, height=height, width=width).to(
730745
device, dtype=prompt_embeds.dtype
731746
)
@@ -746,7 +761,6 @@ def __call__(
746761

747762
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
748763
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
749-
750764
# 7. Create rotary embeds if required
751765
image_rotary_emb = (
752766
self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)

0 commit comments

Comments
 (0)