
Commit 9e234d8

patil-suraj and pcuenca authored
handle fp16 in UNet2DModel (#1216)
* make sure fp16 runs well
* add fp16 test for superres
* Update src/diffusers/models/unet_2d.py
* generate on cuda
* always run fast inference test on cpu
* run on cpu

Co-authored-by: Pedro Cuenca <[email protected]>
1 parent 8fd3a74 commit 9e234d8

2 files changed (+27, -3 lines)

src/diffusers/models/unet_2d.py (6 additions, 3 deletions)

@@ -209,6 +209,11 @@ def forward(
         timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)

         t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
         emb = self.time_embedding(t_emb)

         # 2. pre-process
@@ -242,9 +247,7 @@ def forward(
             sample = upsample_block(sample, res_samples, emb)

         # 6. post-process
-        # make sure hidden states is in float32
-        # when running in half-precision
-        sample = self.conv_norm_out(sample.float()).type(sample.dtype)
+        sample = self.conv_norm_out(sample)
         sample = self.conv_act(sample)
         sample = self.conv_out(sample)
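The first hunk fixes a dtype mismatch: self.time_proj is a weightless sinusoidal projection, so it emits float32 even after the model has been converted with .half(), and feeding that float32 tensor into an fp16 time_embedding MLP raises a runtime error. The second hunk drops the float32 round-trip around conv_norm_out, so post-processing now also runs in the model's own dtype. Below is a minimal, self-contained sketch of the failure and the fix; sinusoidal_proj is a hypothetical stand-in for self.time_proj, not the diffusers code, and running the fp16 Linear assumes a CUDA device or a recent PyTorch build with fp16 CPU kernels.

import torch
import torch.nn as nn

def sinusoidal_proj(timesteps: torch.Tensor, dim: int = 32) -> torch.Tensor:
    # Hypothetical stand-in for self.time_proj: pure math with no weights,
    # so it returns float32 no matter what dtype the surrounding model uses.
    half = dim // 2
    freqs = torch.exp(-torch.arange(half, dtype=torch.float32, device=timesteps.device) / half)
    args = timesteps.float()[:, None] * freqs[None, :]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)

device = "cuda" if torch.cuda.is_available() else "cpu"
time_embedding = nn.Linear(32, 128).to(device).half()  # stand-in for self.time_embedding in fp16

t_emb = sinusoidal_proj(torch.tensor([1.0, 2.0], device=device))
try:
    time_embedding(t_emb)  # float32 activations hit fp16 weights
except RuntimeError as err:
    print("without the cast:", err)  # dtype mismatch

# The commit's fix: cast to the model dtype first, mirroring
# `t_emb = t_emb.to(dtype=self.dtype)` in the diff above.
emb = time_embedding(t_emb.to(dtype=time_embedding.weight.dtype))
print("with the cast:", emb.dtype)  # torch.float16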

tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py (21 additions, 0 deletions)

@@ -87,6 +87,27 @@ def test_inference_superresolution(self):
         expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_inference_superresolution_fp16(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+        vqvae = self.dummy_vq_model
+
+        # put models in fp16
+        unet = unet.half()
+        vqvae = vqvae.half()
+
+        ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        init_image = self.dummy_image.to(torch_device)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        assert image.shape == (1, 64, 64, 3)
+
     @slow
     @require_torch
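The new test exercises the fixed code path end to end: it halves the dummy UNet and VQ-VAE, builds the pipeline, and checks that a two-step fp16 run on GPU produces an output of the expected shape. For reference, here is a sketch of how the same fp16 setup might look with the released pipeline; this is an illustration, not part of the commit, and it assumes a CUDA device, the public CompVis/ldm-super-resolution-4x-openimages checkpoint, and a local low_res.png input.

import torch
from diffusers import LDMSuperResolutionPipeline
from PIL import Image

# Load the pipeline with fp16 weights (torch_dtype casts the UNet and VQ-VAE).
pipe = LDMSuperResolutionPipeline.from_pretrained(
    "CompVis/ldm-super-resolution-4x-openimages", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

# Placeholder input image; the 4x model upscales 128x128 -> 512x512.
low_res = Image.open("low_res.png").convert("RGB").resize((128, 128))

generator = torch.Generator(device="cuda").manual_seed(0)
upscaled = pipe(low_res, generator=generator, num_inference_steps=100).images[0]
upscaled.save("upscaled.png")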
