add model tests

a-r-r-o-w · a-r-r-o-w · commit db2fd3bab750 · 2025-01-26T22:46:58.000+01:00
diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
@@ -387,7 +387,8 @@ def _apply_group_offloading_block_level(
         cpu_param_dict=None,
         onload_self=True,
     )
-    _apply_group_offloading_hook(module, unmatched_group, force_offload, matched_module_groups[0])
+    next_group = matched_module_groups[0] if len(matched_module_groups) > 0 else None
+    _apply_group_offloading_hook(module, unmatched_group, force_offload, next_group)
 
 
 def _apply_group_offloading_leaf_level(
@@ -522,9 +523,13 @@ def _apply_group_offloading_hook(
     offload_on_init: bool,
     next_group: Optional[ModuleGroup] = None,
 ) -> None:
-    hook = GroupOffloadingHook(group, offload_on_init, next_group)
     registry = HookRegistry.check_if_exists_or_initialize(module)
-    registry.register_hook(hook, _GROUP_OFFLOADING)
+
+    # We may have already registered a group offloading hook if the module had a torch.nn.Parameter whose parent
+    # is the current module. In such cases, we don't want to overwrite the existing group offloading hook.
+    if registry.get_hook(_GROUP_OFFLOADING) is None:
+        hook = GroupOffloadingHook(group, offload_on_init, next_group)
+        registry.register_hook(hook, _GROUP_OFFLOADING)
 
 
 def _apply_lazy_group_offloading_hook(
@@ -533,13 +538,15 @@ def _apply_lazy_group_offloading_hook(
     offload_on_init: bool,
     next_group: Optional[ModuleGroup] = None,
 ) -> None:
-    hook = GroupOffloadingHook(group, offload_on_init, next_group)
-    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook()
     registry = HookRegistry.check_if_exists_or_initialize(module)
+
     # We may have already registered a group offloading hook if the module had a torch.nn.Parameter whose parent
     # is the current module. In such cases, we don't want to overwrite the existing group offloading hook.
     if registry.get_hook(_GROUP_OFFLOADING) is None:
+        hook = GroupOffloadingHook(group, offload_on_init, next_group)
         registry.register_hook(hook, _GROUP_OFFLOADING)
+
+    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook()
     registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING)
 
 
diff --git a/src/diffusers/hooks/hooks.py b/src/diffusers/hooks/hooks.py
@@ -120,7 +120,10 @@ def __init__(self, module_ref: torch.nn.Module) -> None:
 
     def register_hook(self, hook: ModelHook, name: str) -> None:
         if name in self.hooks.keys():
-            logger.warning(f"Hook with name {name} already exists, replacing it.")
+            raise ValueError(
+                f"Hook with name {name} already exists in the registry. Please use a different name or "
+                f"first remove the existing hook and then add a new one."
+            )
 
         self._module_ref = hook.initialize_hook(self._module_ref)
 
diff --git a/tests/models/autoencoders/test_models_autoencoder_oobleck.py b/tests/models/autoencoders/test_models_autoencoder_oobleck.py
@@ -132,6 +132,15 @@ def test_layerwise_casting_inference(self):
     def test_layerwise_casting_memory(self):
         pass
 
+    @unittest.skip(
+        "The convolution layers of AutoencoderOobleck are wrapped with torch.nn.utils.weight_norm. This causes the hook's pre_forward to not "
+        "cast the module weights to the expected device (as required by forward pass). As a result, forward pass errors out. To fix:\n"
+        "1. Make sure `nn::Module::to(device)` works with `torch.nn.utils.weight_norm` wrapped convolution layer.\n"
+        "2. Unskip this test."
+    )
+    def test_group_offloading(self):
+        pass
+
 
 @slow
 class AutoencoderOobleckIntegrationTests(unittest.TestCase):
diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -155,6 +155,10 @@ def test_enable_disable_slicing(self):
             "Without slicing outputs should match with the outputs when slicing is manually disabled.",
         )
 
+    @unittest.skip("Not quite sure why this test fails and unable to debug.")
+    def test_group_offloading(self):
+        pass
+
 
 @slow
 class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
diff --git a/tests/models/autoencoders/test_models_vq.py b/tests/models/autoencoders/test_models_vq.py
@@ -116,3 +116,7 @@ def test_loss_pretrained(self):
         expected_output = torch.tensor([0.1936])
         # fmt: on
         self.assertTrue(torch.allclose(output, expected_output, atol=1e-3))
+
+    @unittest.skip("Group offloading for torch::nn::Embedding layers is not yet supported.")
+    def test_group_offloading(self):
+        pass
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
@@ -37,6 +37,7 @@
 from parameterized import parameterized
 from requests.exceptions import HTTPError
 
+from diffusers.hooks import apply_group_offloading
 from diffusers.models import UNet2DConditionModel
 from diffusers.models.attention_processor import (
     AttnProcessor,
@@ -1433,6 +1434,45 @@ def get_memory_usage(storage_dtype, compute_dtype):
             or abs(fp8_e4m3_fp32_max_memory - fp32_max_memory) < MB_TOLERANCE
         )
 
+    @require_torch_gpu
+    def test_group_offloading(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        torch.manual_seed(0)
+
+        def run_forward(model):
+            model.eval()
+            with torch.no_grad():
+                return model(**inputs_dict)[0]
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        output_without_group_offloading = run_forward(model)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        apply_group_offloading(model, offload_type="block_level", num_blocks_per_group=1)
+        output_with_group_offloading1 = run_forward(model)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        apply_group_offloading(model, offload_type="block_level", num_blocks_per_group=1, non_blocking=True)
+        output_with_group_offloading2 = run_forward(model)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        apply_group_offloading(model, offload_type="leaf_level")
+        output_with_group_offloading3 = run_forward(model)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        apply_group_offloading(model, offload_type="leaf_level", use_stream=True)
+        output_with_group_offloading4 = run_forward(model)
+
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading1, atol=1e-5))
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading2, atol=1e-5))
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading3, atol=1e-5))
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading4, atol=1e-5))
+
 
 @is_staging_test
 class ModelPushToHubTester(unittest.TestCase):
diff --git a/tests/models/transformers/test_models_dit_transformer2d.py b/tests/models/transformers/test_models_dit_transformer2d.py
@@ -100,3 +100,11 @@ def test_correct_class_remapping_from_pretrained_config(self):
     def test_correct_class_remapping(self):
         model = Transformer2DModel.from_pretrained("facebook/DiT-XL-2-256", subfolder="transformer")
         assert isinstance(model, DiTTransformer2DModel)
+
+    @unittest.skip(
+        "This model uses a direct call to self.transformer_blocks[0].norm1.emb. This causes attached hooks to not be invoked "
+        "when block offloading is enabled. In order for it to work, the model should correctly first invoke the forward pass "
+        "the transformer blocks, so that weights can be onloaded, instead of directly invoking a submodule of the block."
+    )
+    def test_group_offloading(self):
+        pass
diff --git a/tests/models/transformers/test_models_transformer_hunyuan_dit.py b/tests/models/transformers/test_models_transformer_hunyuan_dit.py
@@ -111,3 +111,11 @@ def test_set_xformers_attn_processor_for_determinism(self):
     @unittest.skip("HunyuanDIT use a custom processor HunyuanAttnProcessor2_0")
     def test_set_attn_processor_for_determinism(self):
         pass
+
+    @unittest.skip(
+        "This model uses a direct call to F.multi_head_attention_forward instead of using a torch.nn.Module layer. This "
+        "usage is not yet supported with group offloading, because the call directly operates on the weights of the module. "
+        "We attach hooks correctly, but the onloading does not occur because the torch::nn::Module::forward is never invoked."
+    )
+    def test_group_offloading(self):
+        pass

Original file line number	Diff line number	Diff line change
`@@ -155,6 +155,10 @@ def test_enable_disable_slicing(self):`
`155`	`155`	`"Without slicing outputs should match with the outputs when slicing is manually disabled.",`
`156`	`156`	`)`
`157`	`157`
	`158`	`+ @unittest.skip("Not quite sure why this test fails and unable to debug.")`
	`159`	`+ def test_group_offloading(self):`
	`160`	`+ pass`
	`161`	`+`
`158`	`162`
`159`	`163`	`@slow`
`160`	`164`	`class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):`