@@ -735,7 +735,7 @@ def test_enable_disable_gradient_checkpointing(self):
         self.assertFalse(model.is_gradient_checkpointing)
 
     @require_torch_accelerator_with_training
-    def test_effective_gradient_checkpointing(self):
+    def test_effective_gradient_checkpointing(self, loss_tolerance=1e-5):
         if not self.model_class._supports_gradient_checkpointing:
             return  # Skip test if model does not support gradient checkpointing
         if torch_device == "mps" and self.model_class.__name__ in [
@@ -777,23 +777,33 @@ def test_effective_gradient_checkpointing(self):
         loss_2.backward()
 
         # compare the output and parameters gradients
-        self.assertTrue((loss - loss_2).abs() < 1e-5)
+        self.assertTrue((loss - loss_2).abs() < loss_tolerance)
         named_params = dict(model.named_parameters())
         named_params_2 = dict(model_2.named_parameters())
         for name, param in named_params.items():
             if "post_quant_conv" in name:
                 continue
             self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
 
-    def test_gradient_checkpointing_is_applied(self, expected_set=None):
+    def test_gradient_checkpointing_is_applied(
+        self, expected_set=None, attention_head_dim=None, num_attention_heads=None, block_out_channels=None
+    ):
         if not self.model_class._supports_gradient_checkpointing:
             return  # Skip test if model does not support gradient checkpointing
-        if torch_device == "mps" and self.model_class.__name__ == "UNetSpatioTemporalConditionModel":
+        if torch_device == "mps" and self.model_class.__name__ in [
+            "UNetSpatioTemporalConditionModel",
+            "AutoencoderKLTemporalDecoder",
+        ]:
             return
 
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
-        init_dict["num_attention_heads"] = (8, 16)
+        if attention_head_dim is not None:
+            init_dict["attention_head_dim"] = attention_head_dim
+        if num_attention_heads is not None:
+            init_dict["num_attention_heads"] = num_attention_heads
+        if block_out_channels is not None:
+            init_dict["block_out_channels"] = block_out_channels
 
         model_class_copy = copy.copy(self.model_class)
 
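For context, a minimal sketch of how a model-specific test class might use these newly parameterized hooks, assuming they live on a shared test mixin such as ModelTesterMixin; the subclass name, model class, expected_set contents, and config values below are illustrative assumptions, not part of this diff:

# Hypothetical model test case reusing the parameterized base tests above.
class MyUNetModelTests(ModelTesterMixin, unittest.TestCase):
    model_class = MyUNetModel  # hypothetical model class under test

    def test_effective_gradient_checkpointing(self):
        # A model whose checkpointed/non-checkpointed losses match less tightly
        # can widen the tolerance instead of duplicating the whole test.
        super().test_effective_gradient_checkpointing(loss_tolerance=1e-4)

    def test_gradient_checkpointing_is_applied(self):
        # Names of the block classes that should end up with checkpointing enabled,
        # plus any model-specific overrides for the common init_dict.
        expected_set = {"CrossAttnDownBlock2D", "CrossAttnUpBlock2D"}
        super().test_gradient_checkpointing_is_applied(
            expected_set=expected_set, num_attention_heads=(8, 16)
        )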