@@ -67,15 +67,18 @@ def get_dummy_components(self):
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
             # But, since we are using tiny-random-t5 here, we need the internal dim of CogVideoXTransformer3DModel
             # to be 32. The internal dim is the product of num_attention_heads and attention_head_dim
-            num_attention_heads=4,
-            attention_head_dim=8,
+            # Note: num_attention_heads and attention_head_dim are different from the T2V and I2V tests because
+            # attention_head_dim must be divisible by 16 for RoPE to work. We also need to maintain a product of 32 as
+            # detailed above.
+            num_attention_heads=2,
+            attention_head_dim=16,
             in_channels=8,
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
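As a quick check of the head-dimension arithmetic described in the comments of this hunk, the following standalone snippet restates the two constraints; it is illustrative only and not part of the test file:

# Sanity check of the constraints from the comments above (illustrative only).
num_attention_heads, attention_head_dim = 2, 16

inner_dim = num_attention_heads * attention_head_dim
assert inner_dim == 32               # internal dim must be 32 to match tiny-random-t5
assert attention_head_dim % 16 == 0  # head dim must be divisible by 16 for the 3D RoPE

# The previous values (4, 8) gave the right internal dim but failed the RoPE divisibility check:
assert 4 * 8 == 32
assert 8 % 16 != 0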
@@ -127,7 +130,8 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        # Cannot reduce because convolution kernel becomes bigger than sample
+        # Cannot reduce below 16 because convolution kernel becomes bigger than sample
+        # Cannot reduce below 32 because 3D RoPE errors out
         image_height = 16
         image_width = 16
         image = Image.new("RGB", (image_width, image_height))
@@ -265,6 +269,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.3):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
+        # The transformer is rebuilt here because the I2V Transformer limits generation to the resolutions it was
+        # configured with. See the if-statement on "self.use_learned_positional_embeddings"
+        components["transformer"] = CogVideoXTransformer3DModel.from_config(
+            components["transformer"].config,
+            sample_height=16,
+            sample_width=16,
+        )
+
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
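The override above relies on diffusers' ConfigMixin.from_config accepting keyword arguments that replace entries of the passed config before a new model is instantiated. A minimal sketch of that pattern follows; the helper name is illustrative and not from this PR:

from diffusers import CogVideoXTransformer3DModel

def rebuild_with_sample_size(transformer, height, width):
    # Re-instantiate the transformer from its config with new sample dimensions.
    # Note: from_config creates a freshly initialized model (no weights are copied),
    # which is acceptable here because the dummy test model is randomly initialized anyway.
    return CogVideoXTransformer3DModel.from_config(
        transformer.config,
        sample_height=height,
        sample_width=width,
    )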