huggingface · zucchini-nlp · Jan 26, 2026 · Jan 16, 2026 · Jan 16, 2026 · Jan 20, 2026
diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py
@@ -18,7 +18,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any
+from typing import Any, Literal
 
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...modeling_rope_utils import RopeParameters
@@ -92,10 +92,9 @@ class Gemma3TextConfig(PreTrainedConfig):
             Scaling factor when applying tanh softcapping on the logits.
         attn_logit_softcapping (`float`, *optional*):
             Scaling factor when applying tanh softcapping on the attention scores.
-        rope_parameters (`RopeParameters`, *optional*):
-            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
-            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
-            with longer `max_position_embeddings`.
+        rope_parameters (`dict`, *optional*):
+            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
+            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
         use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
             If True, the model will attend to all text tokens instead of using a causal mask. This does not change
             behavior for vision tokens.
@@ -155,7 +154,7 @@ def __init__(
         layer_types: list[str] | None = None,
         final_logit_softcapping: float | None = None,
         attn_logit_softcapping: float | None = None,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None,
         use_bidirectional_attention: bool | None = False,
         tie_word_embeddings: bool | None = True,
         **kwargs,
@@ -205,12 +204,17 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa
         rope_scaling = kwargs.pop("rope_scaling", None)
 
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
-        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
+        # as arg in the inputs and both `sliding_attention` and `full_attention` are present, we can safely assume
+        # that it is in the new format. New naming used -> new format
         default_rope_params = {
             "sliding_attention": {"rope_type": "default"},
             "full_attention": {"rope_type": "default"},
         }
-        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
+        if self.rope_parameters is None or any(
+            self.rope_parameters.get(layer_type) is None
+            for layer_type in ("sliding_attention", "full_attention")
+        ):
+            self.rope_parameters = default_rope_params  # missing or not in the per-layer-type format
         if rope_scaling is not None:
             self.rope_parameters["full_attention"].update(rope_scaling)
         self.rope_parameters["full_attention"].setdefault(

diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py
@@ -19,7 +19,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from collections.abc import Sequence
-from typing import Any
+from typing import Any, Literal
 
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...modeling_rope_utils import RopeParameters
@@ -89,10 +89,9 @@ class Gemma3nTextConfig(PreTrainedConfig):
             End of stream token id.
         bos_token_id (`int`, *optional*, defaults to 2):
             Beginning of stream token id.
-        rope_parameters (`RopeParameters`, *optional*):
-            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
-            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
-            with longer `max_position_embeddings`.
+        rope_parameters (`dict`, *optional*):
+            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
+            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -179,7 +178,7 @@ def __init__(
         pad_token_id: int = 0,
         eos_token_id: int = 1,
         bos_token_id: int = 2,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None,
         attention_bias: bool = False,
         attention_dropout: float = 0.0,
         sliding_window: int = 512,
@@ -265,12 +264,17 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa
         rope_scaling = kwargs.pop("rope_scaling", None)
 
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
-        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
+        # as arg in the inputs and both `sliding_attention` and `full_attention` are present, we can safely assume
+        # that it is in the new format. New naming used -> new format
         default_rope_params = {
             "sliding_attention": {"rope_type": "default"},
             "full_attention": {"rope_type": "default"},
         }
-        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
+        if self.rope_parameters is None or any(
+            self.rope_parameters.get(layer_type) is None
+            for layer_type in ("sliding_attention", "full_attention")
+        ):
+            self.rope_parameters = default_rope_params  # missing or not in the per-layer-type format
         if rope_scaling is not None:
             self.rope_parameters["full_attention"].update(rope_scaling)
         self.rope_parameters["full_attention"].setdefault(

diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py
@@ -76,10 +76,9 @@ class ModernBertConfig(PreTrainedConfig):
             The dropout ratio for the attention probabilities.
         layer_types (`list`, *optional*):
             Attention pattern for each layer.
-        rope_parameters (`RopeParameters`, *optional*):
-            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
-            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
-            with longer `max_position_embeddings`.
+        rope_parameters (`dict`, *optional*):
+            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
+            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
         local_attention (`int`, *optional*, defaults to 128):
             The window size for local attention.
         embedding_dropout (`float`, *optional*, defaults to 0.0):
@@ -156,7 +155,7 @@ def __init__(
         attention_bias: bool | None = False,
         attention_dropout: float | None = 0.0,
         layer_types: list[str] | None = None,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None,
         local_attention: int | None = 128,
         embedding_dropout: float | None = 0.0,
         mlp_bias: bool | None = False,
@@ -232,12 +231,17 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa
         rope_scaling = kwargs.pop("rope_scaling", None)
 
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
-        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
+        # as arg in the inputs and both `sliding_attention` and `full_attention` are present, we can safely assume
+        # that it is in the new format. New naming used -> new format
         default_rope_params = {
             "sliding_attention": {"rope_type": "default"},
             "full_attention": {"rope_type": "default"},
         }
-        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
+        if self.rope_parameters is None or any(
+            self.rope_parameters.get(layer_type) is None
+            for layer_type in ("sliding_attention", "full_attention")
+        ):
+            self.rope_parameters = default_rope_params  # missing or not in the per-layer-type format
         if rope_scaling is not None:
             self.rope_parameters["full_attention"].update(rope_scaling)
             self.rope_parameters["sliding_attention"].update(rope_scaling)

diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py
@@ -19,6 +19,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Literal
+
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_rope_utils import RopeParameters
 
@@ -99,10 +101,9 @@ class ModernBertDecoderConfig(PreTrainedConfig):
             `global_attn_every_n_layers`. Should contain "full_attention" or "sliding_attention".
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to tie weight embeddings
-        rope_parameters (`RopeParameters`, *optional*):
-            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
-            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
-            with longer `max_position_embeddings`.
+        rope_parameters (`dict`, *optional*):
+            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
+            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
 
     Examples:
 
@@ -155,7 +156,7 @@ def __init__(
         global_attn_every_n_layers: int | None = 3,
         layer_types: list[str] | None = None,
         tie_word_embeddings: bool | None = True,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None,
         **kwargs,
     ):
         self.pad_token_id = pad_token_id
@@ -209,18 +210,24 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa
         rope_scaling = kwargs.pop("rope_scaling", None)
 
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
-        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
+        # as arg in the inputs and both `sliding_attention` and `full_attention` are present, we can safely assume
+        # that it is in the new format. New naming used -> new format
         default_rope_params = {
             "sliding_attention": {"rope_type": "default"},
             "full_attention": {"rope_type": "default"},
         }
-        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
+        if self.rope_parameters is None or any(
+            self.rope_parameters.get(layer_type) is None
+            for layer_type in ("sliding_attention", "full_attention")
+        ):
+            self.rope_parameters = default_rope_params  # missing or not in the per-layer-type format
         if rope_scaling is not None:
             self.rope_parameters["full_attention"].update(rope_scaling)
             self.rope_parameters["sliding_attention"].update(rope_scaling)
-        self.rope_parameters["full_attention"].setdefault(
-            "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"])
-        )
+        if self.rope_parameters.get("full_attention") is not None:
+            self.rope_parameters["full_attention"].setdefault(
+                "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"])
+            )
         self.rope_parameters["sliding_attention"].setdefault(
             "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"])
         )