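Every hunk below makes the same one-line change: each model's RMSNorm class is appended to _no_split_modules, the list of module classes that device_map="auto" placement (handled by accelerate) must keep whole on a single device. A minimal sketch of how that attribute is consumed follows; the checkpoint name and the empty-weights setup are illustrative assumptions, not part of this diff.

# Minimal sketch (not part of this diff): how _no_split_modules reaches
# accelerate's device-map planner. The checkpoint name is an assumption.
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B")  # assumed checkpoint
with init_empty_weights():
    # Build the model skeleton without allocating real weights.
    model = AutoModelForCausalLM.from_config(config)

# After this change, model._no_split_modules is
# ["LlamaDecoderLayer", "LlamaRMSNorm"]; each listed class is treated as
# atomic: it may land on any one device, but is never split across two.
device_map = infer_auto_device_map(
    model, no_split_module_classes=model._no_split_modules
)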
src/transformers/models/arcee/modeling_arcee.py
@@ -312,7 +312,7 @@ class ArceePreTrainedModel(PreTrainedModel):
     config: ArceeConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["ArceeDecoderLayer"]
+    _no_split_modules = ["ArceeDecoderLayer", "ArceeRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/aria/modeling_aria.py
@@ -649,7 +649,7 @@ class AriaPreTrainedModel(PreTrainedModel):
     config: AriaConfig
     base_model_prefix = ""
     supports_gradient_checkpointing = True
-    _no_split_modules = ["AriaDecoderLayer"]
+    _no_split_modules = ["AriaDecoderLayer", "AriaRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/bitnet/modeling_bitnet.py
@@ -311,7 +311,7 @@ class BitNetPreTrainedModel(PreTrainedModel):
     config: BitNetConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["BitNetDecoderLayer"]
+    _no_split_modules = ["BitNetDecoderLayer", "BitNetRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/cohere/modeling_cohere.py
@@ -344,7 +344,7 @@ class CoherePreTrainedModel(PreTrainedModel):
     config: CohereConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["CohereDecoderLayer"]
+    _no_split_modules = ["CohereDecoderLayer", "CohereRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/cohere2/modeling_cohere2.py
@@ -319,7 +319,7 @@ class Cohere2PreTrainedModel(PreTrainedModel):
     config: Cohere2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Cohere2DecoderLayer"]
+    _no_split_modules = ["Cohere2DecoderLayer", "Cohere2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
@@ -455,7 +455,7 @@ class DeepseekV2PreTrainedModel(PreTrainedModel):
     config: DeepseekV2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DeepseekV2DecoderLayer"]
+    _no_split_modules = ["DeepseekV2DecoderLayer", "DeepseekV2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
@@ -501,7 +501,7 @@ class DeepseekV3PreTrainedModel(PreTrainedModel):
     config: DeepseekV3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DeepseekV3DecoderLayer"]
+    _no_split_modules = ["DeepseekV3DecoderLayer", "DeepseekV3RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/diffllama/modeling_diffllama.py
@@ -532,7 +532,7 @@ class DiffLlamaPreTrainedModel(PreTrainedModel):
     config: DiffLlamaConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DiffLlamaDecoderLayer"]
+    _no_split_modules = ["DiffLlamaDecoderLayer", "DiffLlamaRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/doge/modeling_doge.py
@@ -486,7 +486,7 @@ class DogePreTrainedModel(PreTrainedModel):
     config: DogeConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DogeDecoderLayer"]
+    _no_split_modules = ["DogeDecoderLayer", "DogeRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = False
     _supports_sdpa = True
src/transformers/models/dots1/modeling_dots1.py
@@ -417,7 +417,7 @@ class Dots1PreTrainedModel(PreTrainedModel):
     config: Dots1Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Dots1DecoderLayer"]
+    _no_split_modules = ["Dots1DecoderLayer", "Dots1RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/ernie4_5/modeling_ernie4_5.py
@@ -310,7 +310,7 @@ class Ernie4_5PreTrainedModel(PreTrainedModel):
     config: Ernie4_5Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Ernie4_5DecoderLayer"]
+    _no_split_modules = ["Ernie4_5DecoderLayer", "Ernie4_5RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/gemma/modeling_gemma.py
@@ -310,7 +310,7 @@ class GemmaPreTrainedModel(PreTrainedModel):
     config: GemmaConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["GemmaDecoderLayer"]
+    _no_split_modules = ["GemmaDecoderLayer", "GemmaRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/gemma2/modeling_gemma2.py
@@ -340,7 +340,7 @@ class Gemma2PreTrainedModel(PreTrainedModel):
     config: Gemma2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Gemma2DecoderLayer"]
+    _no_split_modules = ["Gemma2DecoderLayer", "Gemma2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/glm/modeling_glm.py
@@ -326,7 +326,7 @@ class GlmPreTrainedModel(PreTrainedModel):
     config: GlmConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["GlmDecoderLayer"]
+    _no_split_modules = ["GlmDecoderLayer", "GlmRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/glm4/modeling_glm4.py
@@ -330,7 +330,7 @@ class Glm4PreTrainedModel(PreTrainedModel):
     config: Glm4Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Glm4DecoderLayer"]
+    _no_split_modules = ["Glm4DecoderLayer", "Glm4RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/glm4_moe/modeling_glm4_moe.py
@@ -402,7 +402,7 @@ class Glm4MoePreTrainedModel(PreTrainedModel):
     config: Glm4MoeConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Glm4MoeDecoderLayer"]
+    _no_split_modules = ["Glm4MoeDecoderLayer", "Glm4MoeRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -393,7 +393,7 @@ class GptOssPreTrainedModel(PreTrainedModel):
     config: GptOssConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["GptOssDecoderLayer"]
+    _no_split_modules = ["GptOssDecoderLayer", "GptOssRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = False
src/transformers/models/granite/modeling_granite.py
@@ -306,7 +306,7 @@ class GranitePreTrainedModel(PreTrainedModel):
     config: GraniteConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["GraniteDecoderLayer"]
+    _no_split_modules = ["GraniteDecoderLayer", "GraniteRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/helium/modeling_helium.py
@@ -311,7 +311,7 @@ class HeliumPreTrainedModel(PreTrainedModel):
     config: HeliumConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["HeliumDecoderLayer"]
+    _no_split_modules = ["HeliumDecoderLayer", "HeliumRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/lfm2/modeling_lfm2.py
@@ -577,7 +577,7 @@ class Lfm2PreTrainedModel(PreTrainedModel):
     config: Lfm2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Lfm2DecoderLayer"]
+    _no_split_modules = ["Lfm2DecoderLayer", "Lfm2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/llama/modeling_llama.py
@@ -316,7 +316,7 @@ class LlamaPreTrainedModel(PreTrainedModel):
     config: LlamaConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["LlamaDecoderLayer"]
+    _no_split_modules = ["LlamaDecoderLayer", "LlamaRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/minimax/modeling_minimax.py
@@ -580,7 +580,7 @@ class MiniMaxPreTrainedModel(PreTrainedModel):
     config: MiniMaxConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MiniMaxDecoderLayer"]
+    _no_split_modules = ["MiniMaxDecoderLayer", "MiniMaxRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/mistral/modeling_mistral.py
@@ -253,7 +253,7 @@ class MistralPreTrainedModel(PreTrainedModel):
     config: MistralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MistralDecoderLayer"]
+    _no_split_modules = ["MistralDecoderLayer", "MistralRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/mixtral/modeling_mixtral.py
@@ -383,7 +383,7 @@ class MixtralPreTrainedModel(PreTrainedModel):
     config: MixtralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["MixtralDecoderLayer"]
+    _no_split_modules = ["MixtralDecoderLayer", "MixtralRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/olmo/modeling_olmo.py
@@ -293,7 +293,7 @@ class OlmoPreTrainedModel(PreTrainedModel):
     config: OlmoConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["OlmoDecoderLayer"]
+    _no_split_modules = ["OlmoDecoderLayer", "OlmoRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/olmo2/modeling_olmo2.py
@@ -298,7 +298,7 @@ class Olmo2PreTrainedModel(PreTrainedModel):
     config: Olmo2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Olmo2DecoderLayer"]
+    _no_split_modules = ["Olmo2DecoderLayer", "Olmo2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/phi/modeling_phi.py
@@ -297,7 +297,7 @@ class PhiPreTrainedModel(PreTrainedModel):
     config: PhiConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["PhiDecoderLayer"]
+    _no_split_modules = ["PhiDecoderLayer", "PhiRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/phi3/modeling_phi3.py
@@ -284,7 +284,7 @@ class Phi3PreTrainedModel(PreTrainedModel):
     config: Phi3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Phi3DecoderLayer"]
+    _no_split_modules = ["Phi3DecoderLayer", "Phi3RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
@@ -1518,7 +1518,7 @@ class Phi4MultimodalPreTrainedModel(PreTrainedModel):
     config: Phi4MultimodalConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Phi4MultimodalDecoderLayer"]
+    _no_split_modules = ["Phi4MultimodalDecoderLayer", "Phi4MultimodalRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/qwen2/modeling_qwen2.py
@@ -256,7 +256,7 @@ class Qwen2PreTrainedModel(PreTrainedModel):
     config: Qwen2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen2DecoderLayer"]
+    _no_split_modules = ["Qwen2DecoderLayer", "Qwen2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/qwen3/modeling_qwen3.py
@@ -282,7 +282,7 @@ class Qwen3PreTrainedModel(PreTrainedModel):
     config: Qwen3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen3DecoderLayer"]
+    _no_split_modules = ["Qwen3DecoderLayer", "Qwen3RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
@@ -406,7 +406,7 @@ class Qwen3MoePreTrainedModel(PreTrainedModel):
     config: Qwen3MoeConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen3MoeDecoderLayer"]
+    _no_split_modules = ["Qwen3MoeDecoderLayer", "Qwen3MoeRMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/smollm3/modeling_smollm3.py
@@ -286,7 +286,7 @@ class SmolLM3PreTrainedModel(PreTrainedModel):
     config: SmolLM3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["SmolLM3DecoderLayer"]
+    _no_split_modules = ["SmolLM3DecoderLayer", "SmolLM3RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -292,7 +292,7 @@ class Starcoder2PreTrainedModel(PreTrainedModel):
     config: Starcoder2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["Starcoder2DecoderLayer"]
+    _no_split_modules = ["Starcoder2DecoderLayer", "Starcoder2RMSNorm"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True