@@ -64,9 +64,9 @@ def get_dummy_components(self):
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
             vocab_size=1000,
-            hidden_size=16,
-            intermediate_size=16,
-            projection_dim=16,
+            hidden_size=8,
+            intermediate_size=8,
+            projection_dim=8,
             num_hidden_layers=1,
             num_attention_heads=1,
             max_position_embeddings=77,
@@ -78,17 +78,17 @@ def get_dummy_components(self):
             out_channels=4,
             down_block_types=("DownEncoderBlock2D",),
             up_block_types=("UpDecoderBlock2D",),
-            block_out_channels=(32,),
+            block_out_channels=(8,),
+            norm_num_groups=8,
             layers_per_block=1,
             act_fn="silu",
             latent_channels=4,
-            norm_num_groups=16,
-            sample_size=16,
+            sample_size=8,
         )

         blip_vision_config = {
-            "hidden_size": 16,
-            "intermediate_size": 16,
+            "hidden_size": 8,
+            "intermediate_size": 8,
             "num_hidden_layers": 1,
             "num_attention_heads": 1,
             "image_size": 224,
@@ -98,32 +98,32 @@ def get_dummy_components(self):

         blip_qformer_config = {
             "vocab_size": 1000,
-            "hidden_size": 16,
+            "hidden_size": 8,
             "num_hidden_layers": 1,
             "num_attention_heads": 1,
-            "intermediate_size": 16,
+            "intermediate_size": 8,
             "max_position_embeddings": 512,
             "cross_attention_frequency": 1,
-            "encoder_hidden_size": 16,
+            "encoder_hidden_size": 8,
         }
         qformer_config = Blip2Config(
             vision_config=blip_vision_config,
             qformer_config=blip_qformer_config,
-            num_query_tokens=16,
+            num_query_tokens=8,
             tokenizer="hf-internal-testing/tiny-random-bert",
         )
         qformer = Blip2QFormerModel(qformer_config)

         unet = UNet2DConditionModel(
-            block_out_channels=(16, 32),
-            norm_num_groups=16,
+            block_out_channels=(8, 16),
+            norm_num_groups=8,
             layers_per_block=1,
             sample_size=16,
             in_channels=4,
             out_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=16,
+            cross_attention_dim=8,
         )
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

@@ -189,7 +189,9 @@ def test_blipdiffusion(self):

         assert image.shape == (1, 16, 16, 4)

-        expected_slice = np.array([0.7096, 0.5900, 0.6703, 0.4032, 0.7766, 0.3629, 0.5447, 0.4149, 0.8172])
+        expected_slice = np.array(
+            [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
+        )

         assert (
             np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
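The updated reference values follow the usual fast-test pattern in this suite: render a tiny image from the dummy components, flatten a fixed pixel slice, and compare it element-wise against hard-coded numbers within a loose tolerance. Below is a minimal, self-contained sketch of that comparison only; the array, the slice indices, and the reference values are placeholders for illustration, not the actual pipeline output or the real test code.

```python
import numpy as np

# Stand-in for the pipeline output: shape matches the asserted (1, 16, 16, 4);
# the values here are synthetic, not BlipDiffusion output.
image = np.random.RandomState(0).rand(1, 16, 16, 4).astype(np.float32)

# Take a small, fixed window so the hard-coded reference stays short.
# (These slice indices are illustrative; the real test picks its own corner.)
image_slice = image[0, -3:, -3:, -1]

# In the real test these nine numbers are copied from a known-good run.
expected_slice = image_slice.flatten()  # placeholder reference values

# Element-wise tolerance check, mirroring the assertion in the diff above.
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
assert max_diff < 1e-2, f"Image slice differs from reference by {max_diff}"
```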