 from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
-from transformers import (
-    CLIPImageProcessor,
+from transformers import (
     CLIPTextModel,
     CLIPTextModelWithProjection,
-    CLIPTokenizer,
-    CLIPVisionModelWithProjection,
+    CLIPTokenizer,
 )

 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import (
-    FromSingleFileMixin,
-    IPAdapterMixin,
+    FromSingleFileMixin,
     StableDiffusionXLLoraLoaderMixin,
     TextualInversionLoaderMixin,
 )
@@ -247,8 +244,7 @@ class StableDiffusionXLTilingPipeline(
     StableDiffusionMixin,
     FromSingleFileMixin,
     StableDiffusionXLLoraLoaderMixin,
-    TextualInversionLoaderMixin,
-    IPAdapterMixin,
+    TextualInversionLoaderMixin,
 ):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -260,8 +256,7 @@ class StableDiffusionXLTilingPipeline(
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
         - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

     Args:
         vae ([`AutoencoderKL`]):
@@ -300,9 +295,7 @@ class StableDiffusionXLTilingPipeline(
         "tokenizer",
         "tokenizer_2",
         "text_encoder",
-        "text_encoder_2",
-        "image_encoder",
-        "feature_extractor",
+        "text_encoder_2",
     ]

     def __init__(
@@ -313,9 +306,7 @@ def __init__(
         tokenizer: CLIPTokenizer,
         tokenizer_2: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        image_encoder: CLIPVisionModelWithProjection = None,
-        feature_extractor: CLIPImageProcessor = None,
+        scheduler: KarrasDiffusionSchedulers,
         force_zeros_for_empty_prompt: bool = True,
         add_watermarker: Optional[bool] = None,
     ):
@@ -328,9 +319,7 @@ def __init__(
             tokenizer=tokenizer,
             tokenizer_2=tokenizer_2,
             unet=unet,
-            scheduler=scheduler,
-            image_encoder=image_encoder,
-            feature_extractor=feature_extractor,
+            scheduler=scheduler,
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
@@ -1089,15 +1078,15 @@ def __call__(
                             "text_embeds": embeddings_and_added_time[row][col][1],
                             "time_ids": embeddings_and_added_time[row][col][2],
                         }
-                        # with torch.amp.autocast(device.type, dtype=dtype, enabled=dtype != self.unet.dtype):
-                        noise_pred = self.unet(
-                            latent_model_input,
-                            t,
-                            encoder_hidden_states=embeddings_and_added_time[row][col][0],
-                            cross_attention_kwargs=self.cross_attention_kwargs,
-                            added_cond_kwargs=added_cond_kwargs,
-                            return_dict=False,
-                        )[0]
+                        with torch.amp.autocast(device.type, dtype=dtype, enabled=dtype != self.unet.dtype):
+                            noise_pred = self.unet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=embeddings_and_added_time[row][col][0],
+                                cross_attention_kwargs=self.cross_attention_kwargs,
+                                added_cond_kwargs=added_cond_kwargs,
+                                return_dict=False,
+                            )[0]

                         # perform guidance
                         if self.do_classifier_free_guidance:
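The final hunk turns the previously commented-out autocast guard into live code, so the UNet forward pass runs under torch.amp.autocast only when the working dtype differs from the UNet's own parameter dtype. A minimal sketch of that pattern pulled out of the tiling loop (the helper name and arguments below are illustrative assumptions, not part of the commit):

import torch

def denoise_tile(unet, latent_model_input, t, encoder_hidden_states,
                 added_cond_kwargs, cross_attention_kwargs, dtype):
    # Autocast is a no-op when `dtype` already matches the UNet's parameter dtype,
    # and casts the forward pass when the surrounding loop works in another precision.
    with torch.amp.autocast(latent_model_input.device.type, dtype=dtype, enabled=dtype != unet.dtype):
        noise_pred = unet(
            latent_model_input,
            t,
            encoder_hidden_states=encoder_hidden_states,
            cross_attention_kwargs=cross_attention_kwargs,
            added_cond_kwargs=added_cond_kwargs,
            return_dict=False,
        )[0]
    return noise_pred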