| 14 | 14 | # limitations under the License. | 
| 15 | 15 |  | 
| 16 | 16 | import copy | 
|  | 17 | +import gc | 
| 17 | 18 | import inspect | 
| 18 | 19 | import json | 
| 19 | 20 | import os | 
| 56 | 57 |     CaptureLogger, | 
| 57 | 58 |     get_python_version, | 
| 58 | 59 |     is_torch_compile, | 
|  | 60 | +    numpy_cosine_similarity_distance, | 
| 59 | 61 |     require_torch_2, | 
| 60 | 62 |     require_torch_accelerator_with_training, | 
| 61 | 63 |     require_torch_gpu, | 
| @@ -1331,6 +1333,82 @@ def test_variant_sharded_ckpt_right_format(self): | 
| 1331 | 1333 |                 # Example: diffusion_pytorch_model.fp16-00001-of-00002.safetensors | 
| 1332 | 1334 |                 assert all(f.split(".")[1].split("-")[0] == variant for f in shard_files) | 
| 1333 | 1335 |  | 
|  | 1336 | +    def test_layerwise_upcasting_inference(self): | 
|  | 1337 | +        torch.manual_seed(0) | 
|  | 1338 | +        config, inputs_dict = self.prepare_init_args_and_inputs_for_common() | 
|  | 1339 | +        model = self.model_class(**config).eval() | 
|  | 1340 | +        model = model.to(torch_device) | 
|  | 1341 | +        base_slice = model(**inputs_dict)[0].flatten().detach().cpu().numpy() | 
|  | 1342 | + | 
|  | 1343 | +        # fp16-fp32 | 
|  | 1344 | +        torch.manual_seed(0) | 
|  | 1345 | +        config, inputs_dict = self.prepare_init_args_and_inputs_for_common() | 
|  | 1346 | +        model = self.model_class(**config).eval() | 
|  | 1347 | +        model = model.to(torch_device) | 
|  | 1348 | +        model.enable_layerwise_upcasting(storage_dtype=torch.float16, compute_dtype=torch.float32) | 
|  | 1349 | +        layerwise_upcast_slice_fp16 = model(**inputs_dict)[0].flatten().detach().cpu().numpy() | 
|  | 1350 | + | 
|  | 1351 | +        # The precision test is not very important for fast tests. In most cases, the outputs will not be the same. | 
|  | 1352 | +        # We just want to make sure that the layerwise upcasting is working as expected. | 
|  | 1353 | +        self.assertTrue(numpy_cosine_similarity_distance(base_slice, layerwise_upcast_slice_fp16) < 1.0) | 
|  | 1354 | + | 
|  | 1355 | +        # fp8_e4m3-fp32 | 
|  | 1356 | +        torch.manual_seed(0) | 
|  | 1357 | +        config, inputs_dict = self.prepare_init_args_and_inputs_for_common() | 
|  | 1358 | +        model = self.model_class(**config).eval() | 
|  | 1359 | +        model = model.to(torch_device) | 
|  | 1360 | +        model.enable_layerwise_upcasting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.float32) | 
|  | 1361 | +        layerwise_upcast_slice_fp8_e4m3 = model(**inputs_dict)[0].flatten().detach().cpu().numpy() | 
|  | 1362 | + | 
|  | 1363 | +        self.assertTrue(numpy_cosine_similarity_distance(base_slice, layerwise_upcast_slice_fp8_e4m3) < 1.0) | 
|  | 1364 | + | 
|  | 1365 | +        # fp8_e5m2-fp32 | 
|  | 1366 | +        torch.manual_seed(0) | 
|  | 1367 | +        config, inputs_dict = self.prepare_init_args_and_inputs_for_common() | 
|  | 1368 | +        model = self.model_class(**config).eval() | 
|  | 1369 | +        model = model.to(torch_device) | 
|  | 1370 | +        model.enable_layerwise_upcasting(storage_dtype=torch.float8_e5m2, compute_dtype=torch.float32) | 
|  | 1371 | +        layerwise_upcast_slice_fp8_e5m2 = model(**inputs_dict)[0].flatten().detach().cpu().numpy() | 
|  | 1372 | + | 
|  | 1373 | +        self.assertTrue(numpy_cosine_similarity_distance(base_slice, layerwise_upcast_slice_fp8_e5m2) < 1.0) | 
|  | 1374 | + | 
|  | 1375 | +    @require_torch_gpu | 
|  | 1376 | +    def test_layerwise_upcasting_memory(self): | 
|  | 1377 | +        # fp32 | 
|  | 1378 | +        gc.collect() | 
|  | 1379 | +        torch.cuda.empty_cache() | 
|  | 1380 | +        torch.cuda.reset_peak_memory_stats() | 
|  | 1381 | +        torch.cuda.synchronize() | 
|  | 1382 | + | 
|  | 1383 | +        torch.manual_seed(0) | 
|  | 1384 | +        config, inputs_dict = self.prepare_init_args_and_inputs_for_common() | 
|  | 1385 | +        model = self.model_class(**config).eval() | 
|  | 1386 | +        model = model.to(torch_device) | 
|  | 1387 | +        model(**inputs_dict) | 
|  | 1388 | +        base_memory_footprint = model.get_memory_footprint() | 
|  | 1389 | +        base_max_memory = torch.cuda.max_memory_allocated() | 
|  | 1390 | + | 
|  | 1391 | +        model.to("cpu") | 
|  | 1392 | +        del model | 
|  | 1393 | + | 
|  | 1394 | +        # fp8_e4m3-fp32 | 
|  | 1395 | +        gc.collect() | 
|  | 1396 | +        torch.cuda.empty_cache() | 
|  | 1397 | +        torch.cuda.reset_peak_memory_stats() | 
|  | 1398 | +        torch.cuda.synchronize() | 
|  | 1399 | + | 
|  | 1400 | +        torch.manual_seed(0) | 
|  | 1401 | +        config, inputs_dict = self.prepare_init_args_and_inputs_for_common() | 
|  | 1402 | +        model = self.model_class(**config).eval() | 
|  | 1403 | +        model = model.to(torch_device) | 
|  | 1404 | +        model.enable_layerwise_upcasting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.float32) | 
|  | 1405 | +        model(**inputs_dict) | 
|  | 1406 | +        fp8_e4m3_memory_footprint = model.get_memory_footprint() | 
|  | 1407 | +        fp8_e4m3_max_memory = torch.cuda.max_memory_allocated() | 
|  | 1408 | + | 
|  | 1409 | +        self.assertTrue(fp8_e4m3_memory_footprint < base_memory_footprint) | 
|  | 1410 | +        self.assertTrue(fp8_e4m3_max_memory < base_max_memory) | 
|  | 1411 | + | 
| 1334 | 1412 |  | 
| 1335 | 1413 | @is_staging_test | 
| 1336 | 1414 | class ModelPushToHubTester(unittest.TestCase): | 
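For readers skimming the diff: `enable_layerwise_upcasting(storage_dtype=..., compute_dtype=...)` is exercised here as a memory optimization that keeps a module's weights in a low-precision storage dtype at rest and casts them up to the compute dtype only around each layer's forward pass. Below is a rough sketch of that idea using PyTorch forward hooks; it is illustrative only, `apply_layerwise_upcasting_sketch` is a made-up helper and not the API under test:

```python
import torch
import torch.nn as nn


def apply_layerwise_upcasting_sketch(
    module: nn.Module,
    storage_dtype: torch.dtype = torch.float8_e4m3fn,
    compute_dtype: torch.dtype = torch.float32,
) -> None:
    """Illustrative only: keep weights in `storage_dtype` at rest and
    temporarily cast them to `compute_dtype` for each forward pass."""
    for submodule in module.modules():
        # Only touch leaf modules that own parameters (Linear, Conv, ...).
        # Real implementations typically also skip precision-sensitive
        # layers such as normalization.
        if next(submodule.parameters(recurse=False), None) is None:
            continue

        submodule.to(storage_dtype)  # low-precision storage at rest

        def pre_hook(mod, args):
            mod.to(compute_dtype)  # upcast just-in-time for compute
            return args

        def post_hook(mod, args, output):
            mod.to(storage_dtype)  # downcast again to free memory
            return output

        submodule.register_forward_pre_hook(pre_hook)
        submodule.register_forward_hook(post_hook)
```

Because the weights sit in fp8 at rest under this scheme, both the at-rest footprint and the peak allocation drop relative to an fp32 baseline, which is exactly what `test_layerwise_upcasting_memory` asserts via `get_memory_footprint()` and `torch.cuda.max_memory_allocated()`.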
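The inference test compares outputs with a cosine-distance metric rather than exact equality, since low-precision storage perturbs the numerics. Here is a minimal sketch of such a helper, assuming `numpy_cosine_similarity_distance` computes 1 minus the cosine similarity of the flattened outputs (the in-tree implementation may differ):

```python
import numpy as np


def cosine_similarity_distance(a: np.ndarray, b: np.ndarray) -> float:
    # Returns 1 - cosine similarity of the flattened arrays:
    # 0.0 for identically-directed outputs, up to 2.0 for opposite ones,
    # so the `< 1.0` assertions above are deliberately loose sanity checks.
    a, b = a.flatten(), b.flatten()
    similarity = (a * b).sum() / (np.linalg.norm(a) * np.linalg.norm(b))
    return float(1.0 - similarity)
```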