from ..configuration_utils import PretrainedConfig, layer_type_validation

-__all__ = [
-    "Qwen2Config",
-]
-

class Qwen2Config(PretrainedConfig):
    r"""
@@ -47,18 +43,16 @@ class Qwen2Config(PretrainedConfig):
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        use_rmsnorm (`bool`, *optional*, defaults to `True`):
-            Whether to use RMSNorm instead of LayerNorm.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
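As a quick illustration of the `num_key_value_heads` values documented in this hunk, here is a minimal sketch of the MHA/GQA/MQA settings. It only sets the two head counts and uses the import path shown in the class docstring later in this diff; everything else keeps its defaults.

```python
from paddleformers.transformers import Qwen2Config

# MHA: one key/value head per query head (num_key_value_heads == num_attention_heads)
mha = Qwen2Config(num_attention_heads=32, num_key_value_heads=32)

# GQA: query heads are grouped; here 4 query heads share each key/value head
gqa = Qwen2Config(num_attention_heads=32, num_key_value_heads=8)

# MQA: a single key/value head shared by all query heads
mqa = Qwen2Config(num_attention_heads=32, num_key_value_heads=1)
```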
@@ -68,25 +62,57 @@ class Qwen2Config(PretrainedConfig):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
-        use_swiglu (`bool`, *optional*, defaults to `False`):
-            Whether to use SwiGLU activation function.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
-            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
-        ignored_index (`int`, *optional*, defaults to -100):
-            Target value that is ignored during loss computation.
+            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
+            additional layer afterwards will use SWA (Sliding Window Attention).
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
-        attention_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        pp_seg_method (`str`, *optional*, defaults to `"layer:Qwen2DecoderLayer"`):
-            Method for pipeline parallel segmentation.

    ```python
-    >>> from transformers import Qwen2Model, Qwen2Config
+    >>> from paddleformers.transformers import Qwen2Model, Qwen2Config

    >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()
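To make the new `rope_scaling` dictionary concrete, here is an illustrative sketch for YaRN-style scaling. The factor and context lengths are placeholder values chosen per the `factor` semantics above, not tuned settings, and note that the call to `rope_config_validation` is commented out in this diff, so the dict is not validated at construction time.

```python
from paddleformers.transformers import Qwen2Config

# Hypothetical example: extend the usable context 4x with YaRN RoPE scaling,
# updating max_position_embeddings accordingly (4 * 32768 = 131072).
config = Qwen2Config(
    max_position_embeddings=131072,
    rope_scaling={"rope_type": "yarn", "factor": 4.0},
)

# The backward-compatibility shim in __init__ also accepts the legacy key name "type".
legacy = Qwen2Config(rope_scaling={"type": "yarn", "factor": 4.0})
assert legacy.rope_scaling["rope_type"] == "yarn"
```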
@@ -112,25 +138,16 @@ def __init__(
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
-        use_rmsnorm=True,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
-        pad_token_id=151643,
-        bos_token_id=151643,
-        eos_token_id=151643,
-        use_swiglu=False,
+        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
-        ignored_index=-100,
-        attention_bias=True,
-        attention_dropout=0.0,
-        rope_scaling_factor=1.0,
-        rope_scaling_type=None,
        layer_types=None,
-        pp_seg_method="layer:Qwen2DecoderLayer",
+        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
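With `pad_token_id`, `bos_token_id` and `eos_token_id` removed from the explicit signature in this hunk, callers would pass them through `**kwargs` instead. The sketch below assumes the base `PretrainedConfig.__init__` picks up special-token kwargs the same way the upstream transformers base class does; that behaviour is not shown in this diff.

```python
from paddleformers.transformers import Qwen2Config

# The special token ids are no longer named parameters of Qwen2Config.__init__;
# they are forwarded via **kwargs to PretrainedConfig (assumption: the base class
# handles special-token kwargs as in upstream transformers).
config = Qwen2Config(
    pad_token_id=151643,
    bos_token_id=151643,
    eos_token_id=151643,
)
print(config.eos_token_id)  # expected: 151643
```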
@@ -140,9 +157,8 @@ def __init__(
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
        self.max_window_layers = max_window_layers
-        self.ignored_index = ignored_index

        # for backward compatibility
        if num_key_value_heads is None:
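A small sketch of the behavioural change introduced by the `self.sliding_window` line above: the window size is now stored as `None` unless `use_sliding_window` is enabled.

```python
from paddleformers.transformers import Qwen2Config

# use_sliding_window defaults to False, so the window size is dropped (stored as None)
cfg = Qwen2Config(sliding_window=4096)
assert cfg.sliding_window is None

# With sliding window attention enabled, the size is kept
cfg = Qwen2Config(use_sliding_window=True, sliding_window=4096)
assert cfg.sliding_window == 4096
```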
@@ -151,54 +167,31 @@ def __init__(
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
-        self.use_swiglu = use_swiglu
-        self.use_rmsnorm = use_rmsnorm
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
-        self.attention_bias = attention_bias
+        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
-
-        self.rope_scaling_factor = rope_scaling_factor
-        self.rope_scaling_type = rope_scaling_type
-
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-
-        self.pp_seg_method = pp_seg_method
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
-                "sliding_attention" if self.use_sliding_window and i >= self.max_window_layers else "full_attention"
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

-        self.register_unsavable_keys(
-            [
-                "attention_bias",
-                "ignored_index",
-                "pad_token_id",
-                "rope_scaling_factor",
-                "rope_scaling_type",
-                "use_rmsnorm",
-                "use_swiglu",
-                "recompute",
-                "recompute_use_reentrant",
-                "recompute_granularity",
-                "pp_seg_method",
-                "dpo_config",
-                "kto_config",
-                "layer_types",
-            ]
-        )
+
+
+__all__ = ["Qwen2Config"]
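As a sanity check on the rewritten `layer_types` default (and the new `max_window_layers` semantics documented earlier in this diff), a small illustrative sketch with a hypothetical 4-layer configuration:

```python
from paddleformers.transformers import Qwen2Config

# With sliding windows enabled, the first `max_window_layers` layers keep full
# attention and every later layer switches to sliding window attention.
cfg = Qwen2Config(
    num_hidden_layers=4,
    use_sliding_window=True,
    sliding_window=1024,
    max_window_layers=2,
)
assert cfg.layer_types == [
    "full_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
]

# With sliding windows disabled (the default), self.sliding_window is None,
# so every layer falls back to full attention.
assert Qwen2Config(num_hidden_layers=4).layer_types == ["full_attention"] * 4
```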