@@ -567,8 +567,8 @@ def __call__(
567567 image : PipelineImageInput ,
568568 prompt : Optional [Union [str , List [str ]]] = None ,
569569 negative_prompt : Optional [Union [str , List [str ]]] = None ,
570- height : int = 480 ,
571- width : int = 720 ,
570+ height : int = 768 ,
571+ width : int = 1360 ,
572572 num_frames : int = 49 ,
573573 num_inference_steps : int = 50 ,
574574 timesteps : Optional [List [int ]] = None ,
@@ -675,7 +675,6 @@ def __call__(
675675 callback_on_step_end_tensor_inputs = callback_on_step_end .tensor_inputs
676676
677677 num_videos_per_prompt = 1
678-
679678 # 1. Check inputs. Raise error if not correct
680679 self .check_inputs (
681680 image = image ,
@@ -726,6 +725,22 @@ def __call__(
726725 self ._num_timesteps = len (timesteps )
727726
728727 # 5. Prepare latents
728+ # TODO: This resolution adjustment only works for CogVideoX1.5-5B-I2V and needs to be changed.
def adjust_resolution_to_divisible(image_height, image_width, tgt_height, tgt_width, divisor=16):
    """Cap a resolution at the target size and floor it to a multiple of ``divisor``.

    Args:
        image_height: Height of the input image.
        image_width: Width of the input image.
        tgt_height: Upper bound for the returned height.
        tgt_width: Upper bound for the returned width.
        divisor: Both returned sides are multiples of this value.

    Returns:
        A ``(height, width)`` tuple. Each side is first limited to its target
        and then rounded down, so a side smaller than ``divisor`` becomes 0.
    """
    # Never exceed the requested target size; smaller images keep their own size.
    capped_sides = (
        tgt_height if image_height > tgt_height else image_height,
        tgt_width if image_width > tgt_width else image_width,
    )

    # Round each side down to the nearest multiple of the divisor.
    snapped_height, snapped_width = [side // divisor * divisor for side in capped_sides]
    return snapped_height, snapped_width
740+
741+ image_width , image_height = image .size [- 2 :]
742+
743+ height , width = adjust_resolution_to_divisible (image_height , image_width , height , width )
729744 image = self .video_processor .preprocess (image , height = height , width = width ).to (
730745 device , dtype = prompt_embeds .dtype
731746 )
@@ -746,7 +761,6 @@ def __call__(
746761
747762 # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
748763 extra_step_kwargs = self .prepare_extra_step_kwargs (generator , eta )
749-
750764 # 7. Create rotary embeds if required
751765 image_rotary_emb = (
752766 self ._prepare_rotary_positional_embeddings (height , width , latents .size (1 ), device )
0 commit comments