# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    CLIPTextConfig,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    LlamaForCausalLM,
    T5EncoderModel,
)

from diffusers import (
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
    HiDreamImagePipeline,
    HiDreamImageTransformer2DModel,
)
from diffusers.utils.testing_utils import enable_full_determinism

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()

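# Fast, CPU-friendly smoke tests for `HiDreamImagePipeline`. Every component is
# a tiny, randomly initialized stand-in so the whole suite runs in seconds.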
class HiDreamImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = HiDreamImagePipeline
    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    required_optional_params = PipelineTesterMixin.required_optional_params
    test_layerwise_casting = True

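    # Builds miniature versions of every HiDream component: a 1+1-layer MoE
    # transformer (4 routed / 2 activated experts), two tiny CLIP text
    # encoders, a tiny T5 encoder, a tiny Llama causal LM, and a small VAE.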
    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = HiDreamImageTransformer2DModel(
            patch_size=2,
            in_channels=4,
            out_channels=4,
            num_layers=1,
            num_single_layers=1,
            attention_head_dim=8,
            num_attention_heads=4,
            caption_channels=[32, 16],
            text_emb_dim=64,
            num_routed_experts=4,
            num_activated_experts=2,
            axes_dims_rope=(4, 2, 2),
            max_resolution=(32, 32),
            llama_layers=(0, 1),
        ).eval()
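        # The VAE keeps diffusers' small default architecture; only the latent
        # scaling/shift factors are pinned, which appear to mirror the
        # full-size model's VAE configuration.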
        torch.manual_seed(0)
        vae = AutoencoderKL(scaling_factor=0.3611, shift_factor=0.1159)
        clip_text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=32,
            max_position_embeddings=128,
        )

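        # HiDream conditions on four text encoders: the two CLIP encoders share
        # the tiny config above, while the T5 and Llama encoders come from tiny
        # random Hub checkpoints. The Llama pad token id is pinned, likely so
        # padded prompt batches encode deterministically.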
        torch.manual_seed(0)
        text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config)

        torch.manual_seed(0)
        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)

        torch.manual_seed(0)
        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        text_encoder_4 = LlamaForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
        text_encoder_4.generation_config.pad_token_id = 1

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer_4 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")

        scheduler = FlowMatchEulerDiscreteScheduler()

        components = {
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "text_encoder_3": text_encoder_3,
            "tokenizer_3": tokenizer_3,
            "text_encoder_4": text_encoder_4,
            "tokenizer_4": tokenizer_4,
            "transformer": transformer,
        }
        return components

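    # Deterministic call arguments for the fast tests. MPS has historically
    # lacked device-local torch.Generator support, so a globally seeded (CPU)
    # generator is used on that backend instead.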
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 5.0,
            "output_type": "np",
        }
        return inputs

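    # Runs a 2-step generation on CPU and checks the output shape plus a 3x3
    # corner slice of the last channel against reference values, presumably
    # recorded from an earlier verified run.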
    def test_inference(self):
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        self.assertEqual(image.shape, (1, 128, 128, 3))
        expected_slice = np.array(
            [0.6253079, 0.6115351, 0.5223988, 0.5683453, 0.44545278, 0.53524655, 0.3968956, 0.5558849, 0.5917772]
        )
        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        self.assertLessEqual(max_diff, 1e-3, f"Got {image_slice.flatten()=}")

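    # Batched and single-prompt runs should produce identical images; the
    # relaxed 3e-4 tolerance absorbs small numerical differences between the
    # two code paths.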
    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-4)