@@ -66,16 +66,17 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
6666 def get_dummy_components (self ):
6767 torch .manual_seed (0 )
6868 unet = UNet2DConditionModel (
69- block_out_channels = (32 , 64 ),
70- layers_per_block = 2 ,
69+ block_out_channels = (8 , 16 ),
70+ layers_per_block = 1 ,
71+ norm_num_groups = 8 ,
7172 sample_size = 32 ,
7273 in_channels = 4 ,
7374 out_channels = 4 ,
7475 down_block_types = ("DownBlock2D" , "CrossAttnDownBlock2D" ),
7576 up_block_types = ("CrossAttnUpBlock2D" , "UpBlock2D" ),
76- cross_attention_dim = (32 , 64 ),
77+ cross_attention_dim = (8 , 16 ),
7778 class_embed_type = "simple_projection" ,
78- projection_class_embeddings_input_dim = 32 ,
79+ projection_class_embeddings_input_dim = 8 ,
7980 class_embeddings_concat = True ,
8081 )
8182 scheduler = DDIMScheduler (
@@ -87,9 +88,10 @@ def get_dummy_components(self):
8788 )
8889 torch .manual_seed (0 )
8990 vae = AutoencoderKL (
90- block_out_channels = [32 , 64 ],
91+ block_out_channels = [8 , 16 ],
9192 in_channels = 1 ,
9293 out_channels = 1 ,
94+ norm_num_groups = 8 ,
9395 down_block_types = ["DownEncoderBlock2D" , "DownEncoderBlock2D" ],
9496 up_block_types = ["UpDecoderBlock2D" , "UpDecoderBlock2D" ],
9597 latent_channels = 4 ,
@@ -98,14 +100,14 @@ def get_dummy_components(self):
98100 text_encoder_config = ClapTextConfig (
99101 bos_token_id = 0 ,
100102 eos_token_id = 2 ,
101- hidden_size = 32 ,
103+ hidden_size = 8 ,
102104 intermediate_size = 37 ,
103105 layer_norm_eps = 1e-05 ,
104- num_attention_heads = 4 ,
105- num_hidden_layers = 5 ,
106+ num_attention_heads = 1 ,
107+ num_hidden_layers = 1 ,
106108 pad_token_id = 1 ,
107109 vocab_size = 1000 ,
108- projection_dim = 32 ,
110+ projection_dim = 8 ,
109111 )
110112 text_encoder = ClapTextModelWithProjection (text_encoder_config )
111113 tokenizer = RobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-random-roberta" , model_max_length = 77 )
0 commit comments