
Commit 7debcec

revert changes to tests
1 parent cbc086f commit 7debcec


4 files changed: +13 -143 lines changed
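For context, the reverted tests exercised pyramid attention broadcast (PAB) roughly as in the sketch below. It is a minimal reconstruction from the removed lines, not a finished API: the pyramid_attention_broadcast_utils module and its two helpers exist only on this PR branch (not in released diffusers), and the checkpoint id and prompt are illustrative assumptions; the tests themselves used tiny dummy components on CPU.

import torch

from diffusers import CogVideoXPipeline
# PR-branch module; treat this import path as an assumption.
from diffusers.pipelines.pyramid_attention_broadcast_utils import (
    PyramidAttentionBroadcastConfig,
    apply_pyramid_attention_broadcast,
)

# Illustrative checkpoint; the reverted tests built small random components instead.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")

# Reuse (broadcast) spatial attention outputs every 2nd transformer block,
# but only for timesteps inside the (100, 800) range.
config = PyramidAttentionBroadcastConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(100, 800),
)
apply_pyramid_attention_broadcast(pipe, config)

video = pipe("a panda playing guitar", num_inference_steps=50).frames[0]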

tests/pipelines/cogvideo/test_cogvideox.py

Lines changed: 2 additions & 35 deletions
@@ -21,10 +21,6 @@
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
-)
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     numpy_cosine_similarity_distance,
@@ -63,7 +59,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     )
     test_xformers_attention = False

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = CogVideoXTransformer3DModel(
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
@@ -75,7 +71,7 @@ def get_dummy_components(self, num_layers: int = 1):
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
-            num_layers=num_layers,
+            num_layers=1,
             sample_width=2,  # latent width: 2 -> final width: 16
             sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
@@ -323,35 +319,6 @@ def test_fused_qkv_projections(self):
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."

-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."
-

 @slow
 @require_torch_gpu

tests/pipelines/cogvideo/test_cogvideox_image2video.py

Lines changed: 2 additions & 35 deletions
@@ -22,10 +22,6 @@
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
-)
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
@@ -65,7 +61,7 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
     )
     test_xformers_attention = False

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = CogVideoXTransformer3DModel(
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
@@ -80,7 +76,7 @@ def get_dummy_components(self, num_layers: int = 1):
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
-            num_layers=num_layers,
+            num_layers=1,
             sample_width=2,  # latent width: 2 -> final width: 16
             sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
@@ -346,35 +342,6 @@ def test_fused_qkv_projections(self):
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."

-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."
-

 @slow
 @require_torch_gpu

tests/pipelines/cogvideo/test_cogvideox_video2video.py

Lines changed: 2 additions & 35 deletions
@@ -21,10 +21,6 @@
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
-)
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -57,7 +53,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
     )
     test_xformers_attention = False

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = CogVideoXTransformer3DModel(
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings
@@ -69,7 +65,7 @@ def get_dummy_components(self, num_layers: int = 1):
             out_channels=4,
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
-            num_layers=num_layers,
+            num_layers=1,
             sample_width=2,  # latent width: 2 -> final width: 16
             sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
@@ -327,32 +323,3 @@ def test_fused_qkv_projections(self):
         assert np.allclose(
             original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
         ), "Original outputs should match when fused QKV projections are disabled."
-
-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2, spatial_attention_timestep_skip_range=(100, 800)
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."

tests/pipelines/latte/test_latte.py

Lines changed: 7 additions & 38 deletions
@@ -22,10 +22,11 @@
 import torch
 from transformers import AutoTokenizer, T5EncoderModel

-from diffusers import AutoencoderKL, DDIMScheduler, LattePipeline, LatteTransformer3DModel
-from diffusers.pipelines.pyramid_attention_broadcast_utils import (
-    PyramidAttentionBroadcastConfig,
-    apply_pyramid_attention_broadcast,
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LattePipeline,
+    LatteTransformer3DModel,
 )
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
@@ -52,11 +53,11 @@ class LattePipelineFastTests(PipelineTesterMixin, unittest.TestCase):

     required_optional_params = PipelineTesterMixin.required_optional_params

-    def get_dummy_components(self, num_layers: int = 1):
+    def get_dummy_components(self):
         torch.manual_seed(0)
         transformer = LatteTransformer3DModel(
             sample_size=8,
-            num_layers=num_layers,
+            num_layers=1,
             patch_size=2,
             attention_head_dim=8,
             num_attention_heads=3,
@@ -263,38 +264,6 @@ def test_save_load_optional_components(self):
     def test_xformers_attention_forwardGenerator_pass(self):
         super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)

-    def test_pyramid_attention_broadcast(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        num_layers = 4
-        components = self.get_dummy_components(num_layers=num_layers)
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames  # [B, F, C, H, W]
-        original_image_slice = frames[0, -2:, -1, -3:, -3:]
-
-        config = PyramidAttentionBroadcastConfig(
-            spatial_attention_block_skip_range=2,
-            temporal_attention_block_skip_range=3,
-            spatial_attention_timestep_skip_range=(100, 800),
-            temporal_attention_timestep_skip_range=(100, 800),
-        )
-        apply_pyramid_attention_broadcast(pipe, config)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["num_inference_steps"] = 4
-        frames = pipe(**inputs).frames
-        image_slice_pab_enabled = frames[0, -2:, -1, -3:, -3:]
-
-        # We need to use higher tolerance because we are using a random model. With a converged/trained
-        # model, the tolerance can be lower.
-        assert np.allclose(
-            original_image_slice, image_slice_pab_enabled, atol=0.2
-        ), "PAB outputs should not differ much in specified timestep range."
-

 @slow
 @require_torch_gpu

0 commit comments
