diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index d6efcc736487..66afb63cc9b4 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -1214,7 +1214,7 @@ def _get_signature_keys(cls, obj):
     # Adapted from `transformers` modeling_utils.py
     def _get_no_split_modules(self, device_map: str):
         """
-        Get the modules of the model that should not be spit when using device_map. We iterate through the modules to
+        Get the modules of the model that should not be split when using device_map. We iterate through the modules to
         get the underlying `_no_split_modules`.
 
         Args:
diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
index b47d439774cc..e83c5be75b44 100644
--- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py
+++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
@@ -210,6 +210,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
     """
 
     _supports_gradient_checkpointing = True
+    _no_split_modules = ["CogVideoXBlock", "CogVideoXPatchEmbed"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/transformers/transformer_allegro.py b/src/diffusers/models/transformers/transformer_allegro.py
index fe9c7290b063..81039fd49e0d 100644
--- a/src/diffusers/models/transformers/transformer_allegro.py
+++ b/src/diffusers/models/transformers/transformer_allegro.py
@@ -221,6 +221,8 @@ class AllegroTransformer3DModel(ModelMixin, ConfigMixin):
             Scaling factor to apply in 3D positional embeddings across time dimension.
     """
 
+    _supports_gradient_checkpointing = True
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/models/transformers/transformer_cogview3plus.py b/src/diffusers/models/transformers/transformer_cogview3plus.py
index 94d852f6df4b..369509a3a35e 100644
--- a/src/diffusers/models/transformers/transformer_cogview3plus.py
+++ b/src/diffusers/models/transformers/transformer_cogview3plus.py
@@ -166,6 +166,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin):
     """
 
     _supports_gradient_checkpointing = True
+    _no_split_modules = ["CogView3PlusTransformerBlock", "CogView3PlusPatchEmbed"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 6cb97af93652..d4ee8e426344 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -542,6 +542,12 @@ class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
     """
 
     _supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "HunyuanVideoTransformerBlock",
+        "HunyuanVideoSingleTransformerBlock",
+        "HunyuanVideoPatchEmbed",
+        "HunyuanVideoTokenRefiner",
+    ]
 
     @register_to_config
     def __init__(
diff --git a/tests/models/transformers/test_models_transformer_cogvideox.py b/tests/models/transformers/test_models_transformer_cogvideox.py
index 4c13b54e0620..73b83b9eb514 100644
--- a/tests/models/transformers/test_models_transformer_cogvideox.py
+++ b/tests/models/transformers/test_models_transformer_cogvideox.py
@@ -71,7 +71,7 @@ def prepare_init_args_and_inputs_for_common(self):
             "out_channels": 4,
             "time_embed_dim": 2,
             "text_embed_dim": 8,
-            "num_layers": 1,
+            "num_layers": 2,
             "sample_width": 8,
             "sample_height": 8,
             "sample_frames": 8,
@@ -130,7 +130,7 @@ def prepare_init_args_and_inputs_for_common(self):
             "out_channels": 4,
             "time_embed_dim": 2,
             "text_embed_dim": 8,
-            "num_layers": 1,
+            "num_layers": 2,
             "sample_width": 8,
             "sample_height": 8,
             "sample_frames": 8,
diff --git a/tests/models/transformers/test_models_transformer_cogview3plus.py b/tests/models/transformers/test_models_transformer_cogview3plus.py
index eda9813808e9..ec6c58a6734c 100644
--- a/tests/models/transformers/test_models_transformer_cogview3plus.py
+++ b/tests/models/transformers/test_models_transformer_cogview3plus.py
@@ -71,7 +71,7 @@ def prepare_init_args_and_inputs_for_common(self):
        init_dict = {
             "patch_size": 2,
             "in_channels": 4,
-            "num_layers": 1,
+            "num_layers": 2,
             "attention_head_dim": 4,
             "num_attention_heads": 2,
             "out_channels": 4,
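For context, a minimal sketch of what the new `_no_split_modules` entries enable: loading one of these transformers with a device map, assuming a multi-GPU machine with accelerate installed (the CogVideoX-5b checkpoint id is only illustrative):

```python
import torch
from diffusers import CogVideoXTransformer3DModel

# Because the class now declares _no_split_modules, accelerate keeps each
# CogVideoXBlock / CogVideoXPatchEmbed whole on a single device when it
# computes the device map, instead of splitting a block across GPUs.
transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-5b",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```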