Commit 380a820

fix tests
1 parent edeb626 commit 380a820


tests/pipelines/cogvideo/test_cogvideox_image2video.py

Lines changed: 17 additions & 5 deletions
@@ -67,15 +67,18 @@ def get_dummy_components(self):
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
             # But, since we are using tiny-random-t5 here, we need the internal dim of CogVideoXTransformer3DModel
             # to be 32. The internal dim is product of num_attention_heads and attention_head_dim
-            num_attention_heads=4,
-            attention_head_dim=8,
+            # Note: num_attention_heads and attention_head_dim are different from the T2V tests because
+            # attention_head_dim must be divisible by 16 for RoPE to work. We also need to maintain a product of 32 as
+            # detailed above.
+            num_attention_heads=2,
+            attention_head_dim=16,
             in_channels=8,
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
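
As a quick sanity check of the constraints discussed in the comments above, here is a minimal standalone sketch (the values mirror the test config; the divisibility rules are taken from the diff comments, not from library code):

# Sketch: dimensional constraints for the dummy CogVideoXTransformer3DModel.
num_attention_heads = 2
attention_head_dim = 16

inner_dim = num_attention_heads * attention_head_dim
assert inner_dim == 32  # must match tiny-random-t5's text_embed_dim
assert attention_head_dim % 16 == 0  # required for RoPE, per the comment above

# Frame arithmetic from the sample_frames comment:
sample_frames = 9
temporal_compression_ratio = 4
latent_frames = (sample_frames - 1) // temporal_compression_ratio + 1
assert latent_frames == 3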
@@ -127,7 +130,8 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        # Cannot reduce because convolution kernel becomes bigger than sample
+        # Cannot reduce below 16 because convolution kernel becomes bigger than sample
+        # Cannot reduce below 32 because 3D RoPE errors out
         image_height = 16
         image_width = 16
         image = Image.new("RGB", (image_width, image_height))
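
The 16-pixel floor above lines up with the transformer's sample sizes via the VAE's spatial downsampling. A small sketch of that arithmetic (the factor of 8 is inferred from the "latent width: 2 -> final width: 16" comments in the first hunk, so treat it as an assumption):

# Sketch: relate the dummy image size to sample_height/sample_width.
image_height = 16
image_width = 16
vae_scale_factor_spatial = 8  # assumed from "latent width: 2 -> final width: 16"

latent_height = image_height // vae_scale_factor_spatial
latent_width = image_width // vae_scale_factor_spatial
assert (latent_height, latent_width) == (2, 2)  # matches sample_height/sample_width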
@@ -265,6 +269,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.3):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
+        # We recreate the transformer here because the I2V Transformer limits generation to certain resolutions.
+        # See the if-statement on "self.use_learned_positional_embeddings".
+        components["transformer"] = CogVideoXTransformer3DModel.from_config(
+            components["transformer"].config,
+            sample_height=16,
+            sample_width=16,
+        )
+
         pipe = self.pipeline_class(**components)
         pipe.to("cpu")
         pipe.set_progress_bar_config(disable=None)
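
The from_config call in this hunk uses diffusers' ConfigMixin pattern: keyword arguments passed alongside an existing config override the stored values, and the model is re-initialized from scratch (fine here, since the dummy weights are random anyway). A hedged usage sketch of that pattern (the helper name is made up for illustration):

# Sketch: rebuild a model from an existing config with selected fields overridden.
from diffusers import CogVideoXTransformer3DModel

def resize_transformer(transformer, height, width):
    # Kwargs passed to from_config take precedence over the source config;
    # the returned model has freshly initialized (random) weights.
    return CogVideoXTransformer3DModel.from_config(
        transformer.config,
        sample_height=height,
        sample_width=width,
    )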
