Commit 6b53b85

pipeline tests (broken atm)

1 parent 015cc78 commit 6b53b85

2 files changed: +314 −3

tests/models/transformers/test_models_transformer_allegro.py

Lines changed: 3 additions & 3 deletions

@@ -37,7 +37,7 @@ class AllegroTransformerTests(ModelTesterMixin, unittest.TestCase):
     def dummy_input(self):
         batch_size = 2
         num_channels = 4
-        num_frames = 8
+        num_frames = 2
         height = 8
         width = 8
         embedding_dim = 16
@@ -55,11 +55,11 @@ def dummy_input(self):
 
     @property
     def input_shape(self):
-        return (4, 8, 8, 8)
+        return (4, 2, 8, 8)
 
     @property
     def output_shape(self):
-        return (4, 8, 8, 8)
+        return (4, 2, 8, 8)
 
     def prepare_init_args_and_inputs_for_common(self):
        init_dict = {
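Reducing num_frames from 8 to 2 shrinks the dummy latent along the temporal axis, which is why input_shape and output_shape drop from (4, 8, 8, 8) to (4, 2, 8, 8). A minimal sketch of the layout these values imply, (batch, channels, frames, height, width), with the batch axis excluded from input_shape:

import torch

# Dummy latent built the same way as AllegroTransformerTests.dummy_input:
# (batch_size, num_channels, num_frames, height, width).
batch_size, num_channels, num_frames, height, width = 2, 4, 2, 8, 8
hidden_states = torch.randn(batch_size, num_channels, num_frames, height, width)

# input_shape/output_shape describe a single sample, i.e. everything but batch.
assert tuple(hidden_states.shape[1:]) == (4, 2, 8, 8)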
Lines changed: 311 additions & 0 deletions

@@ -0,0 +1,311 @@
+# Copyright 2024 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+
+from diffusers import AllegroPipeline, AllegroTransformer3DModel, AutoencoderKLAllegro, DDIMScheduler
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, to_np
+
+
+enable_full_determinism()
+
+
+class AllegroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AllegroPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = AllegroTransformer3DModel(
+            num_attention_heads=2,
+            attention_head_dim=12,
+            in_channels=4,
+            out_channels=4,
+            num_layers=1,
+            cross_attention_dim=32,
+            sample_width=8,
+            sample_height=8,
+            sample_frames=8,
+            caption_channels=32,
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLAllegro(
+            in_channels=3,
+            out_channels=3,
+            down_block_types=(
+                "AllegroDownBlock3D",
+                "AllegroDownBlock3D",
+                "AllegroDownBlock3D",
+                "AllegroDownBlock3D",
+            ),
+            up_block_types=(
+                "AllegroUpBlock3D",
+                "AllegroUpBlock3D",
+                "AllegroUpBlock3D",
+                "AllegroUpBlock3D",
+            ),
+            block_out_channels=(8, 8, 8, 8),
+            latent_channels=4,
+            layers_per_block=1,
+            norm_num_groups=2,
+            temporal_compression_ratio=4,
+        )
+
+        torch.manual_seed(0)
+        scheduler = DDIMScheduler()
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": 48,
+            "width": 48,
+            "num_frames": 8,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (8, 3, 16, 16))
+        expected_video = torch.randn(8, 3, 16, 16)
+        max_diff = np.abs(generated_video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_callback_inputs(self):
+        sig = inspect.signature(self.pipeline_class.__call__)
+        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
+        has_callback_step_end = "callback_on_step_end" in sig.parameters
+
+        if not (has_callback_tensor_inputs and has_callback_step_end):
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        self.assertTrue(
+            hasattr(pipe, "_callback_tensor_inputs"),
+            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
+        )
+
+        def callback_inputs_subset(pipe, i, t, callback_kwargs):
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        def callback_inputs_all(pipe, i, t, callback_kwargs):
+            for tensor_name in pipe._callback_tensor_inputs:
+                assert tensor_name in callback_kwargs
+
+            # iterate over callback args
+            for tensor_name, tensor_value in callback_kwargs.items():
+                # check that we're only passing in allowed tensor inputs
+                assert tensor_name in pipe._callback_tensor_inputs
+
+            return callback_kwargs
+
+        inputs = self.get_dummy_inputs(torch_device)
+
+        # Test passing in a subset
+        inputs["callback_on_step_end"] = callback_inputs_subset
+        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
+        output = pipe(**inputs)[0]
+
+        # Test passing in everything
+        inputs["callback_on_step_end"] = callback_inputs_all
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+
+        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
+            is_last = i == (pipe.num_timesteps - 1)
+            if is_last:
+                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
+            return callback_kwargs
+
+        inputs["callback_on_step_end"] = callback_inputs_change_tensor
+        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
+        output = pipe(**inputs)[0]
+        assert output.abs().sum() < 1e10
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
+
+    # def test_attention_slicing_forward_pass(
+    #     self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    # ):
+    #     if not self.test_attention_slicing:
+    #         return
+
+    #     components = self.get_dummy_components()
+    #     pipe = self.pipeline_class(**components)
+    #     for component in pipe.components.values():
+    #         if hasattr(component, "set_default_attn_processor"):
+    #             component.set_default_attn_processor()
+    #     pipe.to(torch_device)
+    #     pipe.set_progress_bar_config(disable=None)
+
+    #     generator_device = "cpu"
+    #     inputs = self.get_dummy_inputs(generator_device)
+    #     output_without_slicing = pipe(**inputs)[0]
+
+    #     pipe.enable_attention_slicing(slice_size=1)
+    #     inputs = self.get_dummy_inputs(generator_device)
+    #     output_with_slicing1 = pipe(**inputs)[0]
+
+    #     pipe.enable_attention_slicing(slice_size=2)
+    #     inputs = self.get_dummy_inputs(generator_device)
+    #     output_with_slicing2 = pipe(**inputs)[0]
+
+    #     if test_max_difference:
+    #         max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+    #         max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+    #         self.assertLess(
+    #             max(max_diff1, max_diff2),
+    #             expected_max_diff,
+    #             "Attention slicing should not affect the inference results",
+    #         )
+
+    def test_vae_tiling(self, expected_diff_max: float = 0.2):
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        # Without tiling
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_without_tiling = pipe(**inputs)[0]
+
+        # With tiling
+        pipe.vae.enable_tiling(
+            tile_sample_min_height=96,
+            tile_sample_min_width=96,
+            tile_overlap_factor_height=1 / 12,
+            tile_overlap_factor_width=1 / 12,
+        )
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_with_tiling = pipe(**inputs)[0]
+
+        self.assertLess(
+            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
+            expected_diff_max,
+            "VAE tiling should not affect the inference results",
+        )
+
+
+@slow
+@require_torch_gpu
+class AllegroPipelineIntegrationTests(unittest.TestCase):
+    prompt = "A painting of a squirrel eating a burger."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_allegro(self):
+        generator = torch.Generator("cpu").manual_seed(0)
+
+        pipe = AllegroPipeline.from_pretrained("rhymes-ai/Allegro", torch_dtype=torch.float16)
+        pipe.enable_model_cpu_offload()
+        prompt = self.prompt
+
+        videos = pipe(
+            prompt=prompt,
+            height=720,
+            width=1280,
+            num_frames=88,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="pt",
+        ).frames
+
+        video = videos[0]
+        expected_video = torch.randn(1, 88, 720, 1280, 3).numpy()
+
+        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        assert max_diff < 1e-3, f"Max diff is too high. got {video}"
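For local iteration, a minimal sketch of running just the new fast tests through unittest's discovery API; the tests/pipelines start directory and the file-name pattern are assumptions, since this view omits the new file's path:

import unittest

# Assumed location/pattern for the new test module; the diff header above
# does not show where the file was added.
suite = unittest.TestLoader().discover("tests/pipelines", pattern="test_allegro*.py")
unittest.TextTestRunner(verbosity=2).run(suite)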
