
Commit 24f9273

committed: address review comments
1 parent f227e15 commit 24f9273

15 files changed (+17, -44 lines)

src/diffusers/hooks/group_offloading.py

Lines changed: 1 addition & 1 deletion
@@ -343,7 +343,7 @@ def _apply_group_offloading_block_level(
         for i in range(0, len(submodule), num_blocks_per_group):
             current_modules = submodule[i : i + num_blocks_per_group]
             group = ModuleGroup(
-                modules=submodule[i : i + num_blocks_per_group],
+                modules=current_modules,
                 offload_device=offload_device,
                 onload_device=onload_device,
                 offload_leader=current_modules[-1],

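For reference, a minimal sketch of how block-level group offloading is typically applied to a model. The `apply_group_offloading` entry point lives in `diffusers.hooks`, but the checkpoint name below is a placeholder and the exact keyword names should be checked against the installed version:

import torch
from diffusers import UNet2DConditionModel
from diffusers.hooks import apply_group_offloading

# Hypothetical checkpoint; substitute whichever UNet/transformer checkpoint you use.
unet = UNet2DConditionModel.from_pretrained("some/checkpoint", subfolder="unet", torch_dtype=torch.float16)

# Consecutive blocks are grouped into ModuleGroups of `num_blocks_per_group` modules.
# Each group is onloaded to the GPU together and offloaded back to the CPU once its
# last member (the group's `offload_leader`, see the diff above) has finished its forward pass.
apply_group_offloading(
    unet,
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="block_level",  # routed to _apply_group_offloading_block_level
    num_blocks_per_group=2,
)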
src/diffusers/models/autoencoders/autoencoder_oobleck.py

Lines changed: 1 addition & 0 deletions
@@ -317,6 +317,7 @@ class AutoencoderOobleck(ModelMixin, ConfigMixin):
     """
 
     _supports_gradient_checkpointing = False
+    _supports_group_offloading = False
 
     @register_to_config
     def __init__(

src/diffusers/models/autoencoders/consistency_decoder_vae.py

Lines changed: 2 additions & 0 deletions
@@ -68,6 +68,8 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
     ```
     """
 
+    _supports_group_offloading = False
+
     @register_to_config
     def __init__(
         self,

src/diffusers/models/autoencoders/vq_model.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ class VQModel(ModelMixin, ConfigMixin):
     """
 
     _skip_layerwise_casting_patterns = ["quantize"]
+    _supports_group_offloading = False
 
     @register_to_config
     def __init__(

src/diffusers/models/modeling_utils.py

Lines changed: 1 addition & 0 deletions
@@ -174,6 +174,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
     _no_split_modules = None
     _keep_in_fp32_modules = None
     _skip_layerwise_casting_patterns = None
+    _supports_group_offloading = True
 
     def __init__(self):
         super().__init__()

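The attribute added above gives every ModelMixin subclass a default of supporting group offloading; the per-model diffs in this commit override it where hooks cannot be installed safely. A minimal sketch of the intended pattern (the guard function below is illustrative, not the exact check used in diffusers):

from diffusers.configuration_utils import ConfigMixin
from diffusers.models.modeling_utils import ModelMixin

class MyWeightNormedModel(ModelMixin, ConfigMixin):
    # Opt out, e.g. because submodules are wrapped with torch.nn.utils.weight_norm
    # and cannot be moved/hooked the way group offloading expects.
    _supports_group_offloading = False

def ensure_group_offloading_supported(model):
    # Illustrative guard: enabling code can check the class flag before installing hooks.
    if not getattr(model, "_supports_group_offloading", True):
        raise ValueError(f"{model.__class__.__name__} does not support group offloading.")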
src/diffusers/models/transformers/dit_transformer_2d.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
 
     _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
     _supports_gradient_checkpointing = True
+    _supports_group_offloading = False
 
     @register_to_config
     def __init__(

src/diffusers/models/transformers/hunyuan_transformer_2d.py

Lines changed: 1 addition & 0 deletions
@@ -245,6 +245,7 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
     """
 
     _skip_layerwise_casting_patterns = ["pos_embed", "norm", "pooler"]
+    _supports_group_offloading = False
 
     @register_to_config
     def __init__(

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 4 additions & 10 deletions
@@ -1020,25 +1020,19 @@ def _execution_device(self):
         [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
         Accelerate's module hooks.
         """
-        diffusers_hook_device = None
+        # When applying group offloading at the leaf_level, we're in the same situation as accelerate's sequential
+        # offloading. We need to return the onload device of the group offloading hooks so that the intermediates
+        # required for computation (latents, prompt embeddings, etc.) can be created on the correct device.
         for name, model in self.components.items():
             if not isinstance(model, torch.nn.Module):
                 continue
-
             for submodule in model.modules():
                 if not hasattr(submodule, "_diffusers_hook"):
                     continue
                 registry = submodule._diffusers_hook
                 hook = registry.get_hook("group_offloading")
                 if hook is not None:
-                    diffusers_hook_device = hook.group.onload_device
-                    break
-
-            if diffusers_hook_device is not None:
-                break
-
-        if diffusers_hook_device is not None:
-            return diffusers_hook_device
+                    return hook.group.onload_device
 
         for name, model in self.components.items():
             if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:

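In practice, the simplified lookup above means that once any component carries a group offloading hook, the pipeline creates its intermediates on that hook's onload device. A hedged usage sketch (the model id is a placeholder, `pipe.transformer` depends on the pipeline, and the keyword names are assumptions to verify against the installed diffusers version):

import torch
from diffusers import DiffusionPipeline
from diffusers.hooks import apply_group_offloading

pipe = DiffusionPipeline.from_pretrained("some/pipeline-checkpoint", torch_dtype=torch.bfloat16)

# Leaf-level group offloading keeps weights on the CPU and streams each leaf module
# to the GPU only while it runs, similar to accelerate's sequential offloading.
apply_group_offloading(
    pipe.transformer,  # or pipe.unet, depending on the pipeline
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="leaf_level",
)

# _execution_device now resolves to the hook's onload device, so latents and prompt
# embeddings are created on "cuda" even though the weights live on the CPU.
print(pipe._execution_device)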
tests/models/autoencoders/test_models_autoencoder_oobleck.py

Lines changed: 0 additions & 9 deletions
@@ -132,15 +132,6 @@ def test_layerwise_casting_inference(self):
     def test_layerwise_casting_memory(self):
         pass
 
-    @unittest.skip(
-        "The convolution layers of AutoencoderOobleck are wrapped with torch.nn.utils.weight_norm. This causes the hook's pre_forward to not "
-        "cast the module weights to the expected device (as required by forward pass). As a result, forward pass errors out. To fix:\n"
-        "1. Make sure `nn::Module::to(device)` works with `torch.nn.utils.weight_norm` wrapped convolution layer.\n"
-        "2. Unskip this test."
-    )
-    def test_group_offloading(self):
-        pass
-
 
 @slow
 class AutoencoderOobleckIntegrationTests(unittest.TestCase):

tests/models/autoencoders/test_models_consistency_decoder_vae.py

Lines changed: 0 additions & 4 deletions
@@ -155,10 +155,6 @@ def test_enable_disable_slicing(self):
             "Without slicing outputs should match with the outputs when slicing is manually disabled.",
         )
 
-    @unittest.skip("Not quite sure why this test fails and unable to debug.")
-    def test_group_offloading(self):
-        pass
-
 
 @slow
 class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
