huggingface · asomoza · Jan 28, 2025 · Jan 28, 2025
diff --git a/examples/community/stable_diffusion_xl_controlnet_reference.py b/examples/community/stable_diffusion_xl_controlnet_reference.py
@@ -193,7 +193,8 @@ class StableDiffusionXLControlNetReferencePipeline(StableDiffusionXLControlNetPi
 
     def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
         refimage = refimage.to(device=device)
-        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+        if needs_upcasting:
             self.upcast_vae()
             refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
         if refimage.dtype != self.vae.dtype:
@@ -223,6 +224,11 @@ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do
 
         # aligning device to prevent device errors when concating it with the latent model input
         ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+
+        # cast back to fp16 if needed
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+
         return ref_image_latents
 
     def prepare_ref_image(

diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py
@@ -139,7 +139,8 @@ def retrieve_timesteps(
 class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
     def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
         refimage = refimage.to(device=device)
-        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
+        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+        if needs_upcasting:
             self.upcast_vae()
             refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
         if refimage.dtype != self.vae.dtype:
@@ -169,6 +170,11 @@ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do
 
         # aligning device to prevent device errors when concating it with the latent model input
         ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
+
+        # cast back to fp16 if needed
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+
         return ref_image_latents
 
     def prepare_ref_image(