
Commit ee9ad93

Qwen2/3 modeling alignment implementation (#2694)
1 parent d144036 commit ee9ad93

File tree

17 files changed: +659 −993 lines changed


paddleformers/nn/norm.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class Norm(GeneralInterface):
     @classmethod
     def create(self, config, hidden_size=None, has_bias=None, norm_eps=None, norm_type=None, **kwargs):
         if norm_type is None:
-            norm_type = "rms_norm" if config.get("use_rmsnorm", False) else "layer_norm"
+            norm_type = "rms_norm"
         if has_bias is None:
             has_bias = config.get("use_bias", False)
         norm_cls = self._global_mapping[norm_type]
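
The change above makes `rms_norm` the unconditional default instead of keying off `config.use_rmsnorm`. Below is a minimal, self-contained sketch of that factory pattern; the `RMSNorm`/`LayerNorm` stand-ins, the bare-dict config, and the free-function form of `create` are hypothetical simplifications of the real paddle layers, used only to show the new default and how a caller would still request layer norm explicitly.

```python
# Hypothetical stand-ins for the real paddle norm layers, just to illustrate
# the factory default: norm_type now falls back to "rms_norm" unconditionally.
class RMSNorm:
    def __init__(self, hidden_size, eps=1e-6, has_bias=False):
        self.hidden_size, self.eps, self.has_bias = hidden_size, eps, has_bias

class LayerNorm(RMSNorm):
    pass

_global_mapping = {"rms_norm": RMSNorm, "layer_norm": LayerNorm}

def create(config: dict, hidden_size, norm_eps=1e-6, norm_type=None):
    if norm_type is None:
        norm_type = "rms_norm"  # previously keyed off config.get("use_rmsnorm", False)
    has_bias = config.get("use_bias", False)
    return _global_mapping[norm_type](hidden_size, eps=norm_eps, has_bias=has_bias)

assert isinstance(create({}, 4096), RMSNorm)                            # new default
assert isinstance(create({}, 4096, norm_type="layer_norm"), LayerNorm)  # explicit opt-in
```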

paddleformers/nn/pp_model.py

Lines changed: 2 additions & 1 deletion
@@ -507,6 +507,7 @@ class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
     transpose_weight_keys = None
     _embed_cls = None
     _rotary_emb_cls = None
+    _norm_cls = "rms_norm"
 
     def __init__(self, config: PretrainedConfig, **kwargs):
         # dynamic inherit DecoderLayer
@@ -582,7 +583,7 @@ def __init__(self, config: PretrainedConfig, **kwargs):
         )
 
         self.add_sequential_layer(
-            LayerDesc(RMSNormPipe if config.use_rmsnorm else LayerNormPipe, config=config),
+            LayerDesc(RMSNormPipe if self._norm_cls == "rms_norm" else LayerNormPipe, config=config),
             "model.norm",
         )
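
With `use_rmsnorm` removed from the configs, the pipeline model now selects its final norm from the `_norm_cls` class attribute. A hedged sketch of how a subclass would opt back into layer norm; the import path simply mirrors the file shown above (`paddleformers/nn/pp_model.py`) and is an assumption.

```python
from paddleformers.nn.pp_model import GeneralModelForCausalLMPipe

class LayerNormCausalLMPipe(GeneralModelForCausalLMPipe):
    # The base class defaults to "rms_norm"; overriding the attribute makes the
    # final "model.norm" stage resolve to LayerNormPipe instead of RMSNormPipe.
    _norm_cls = "layer_norm"
```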

paddleformers/transformers/ernie4_5/configuration.py

Lines changed: 0 additions & 4 deletions
@@ -44,7 +44,6 @@ def __init__(
         recompute_granularity="core_attn",
         recompute_use_reentrant=False,
         tie_word_embeddings=True,
-        use_rmsnorm=True,
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
@@ -81,7 +80,6 @@ def __init__(
             recompute (bool): Whether to use gradient checkpointing to save memory
             recompute_granularity (str): Granularity of recomputation ("core_attn", "full", etc.)
             recompute_use_reentrant (bool): Whether to use reentrant checkpointing
-            use_rmsnorm (bool): Whether to use RMSNorm instead of LayerNorm
             tie_word_embeddings (bool): Whether the input and output word embeddings should be tied
                 Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
                 model has a output word embedding layer.
@@ -129,7 +127,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-        self.use_rmsnorm = use_rmsnorm
         self.micro_batch_size = micro_batch_size
 
         self.max_sequence_length = max_sequence_length
@@ -153,7 +150,6 @@ def __init__(
                 "hidden_dropout_prob",
                 "ignored_index",
                 "scale_qk_coeff",
-                "use_rmsnorm",
                 "recompute",
                 "recompute_use_reentrant",
                 "recompute_granularity",

paddleformers/transformers/glm4_moe/configuration.py

Lines changed: 0 additions & 2 deletions
@@ -133,7 +133,6 @@ def __init__(
         num_key_value_heads=8,
         hidden_act="silu",
         max_position_embeddings=131072,
-        use_rmsnorm=True,
         initializer_range=0.02,
         rms_norm_eps=1e-5,
         use_cache=True,
@@ -163,7 +162,6 @@ def __init__(
     ):
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
-        self.use_rmsnorm = use_rmsnorm
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers

paddleformers/transformers/gpt_oss/configuration.py

Lines changed: 0 additions & 2 deletions
@@ -42,7 +42,6 @@ def __init__(
         hidden_act: str = "silu",
         initializer_range: float = 0.02,
         max_position_embeddings=131072,
-        use_rmsnorm=True,
         rms_norm_eps: float = 1e-5,
         rope_scaling={"rope_type": "yarn", "factor": 32.0, "beta_fast": 32.0, "beta_slow": 1.0, "truncate": False},
         attention_dropout: float = 0.0,
@@ -69,7 +68,6 @@ def __init__(
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
-        self.use_rmsnorm = use_rmsnorm
         self.rms_norm_eps = rms_norm_eps
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling

paddleformers/transformers/qwen2/configuration.py

Lines changed: 59 additions & 66 deletions
@@ -16,10 +16,6 @@
 
 from ..configuration_utils import PretrainedConfig, layer_type_validation
 
-__all__ = [
-    "Qwen2Config",
-]
-
 
 class Qwen2Config(PretrainedConfig):
     r"""
@@ -47,18 +43,16 @@ class Qwen2Config(PretrainedConfig):
         num_key_value_heads (`int`, *optional*, defaults to 32):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 32768):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        use_rmsnorm (`bool`, *optional*, defaults to `True`):
-            Whether to use RMSNorm instead of LayerNorm.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the rms normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
@@ -68,25 +62,57 @@ class Qwen2Config(PretrainedConfig):
             Whether the model's input and output word embeddings should be tied.
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        use_swiglu (`bool`, *optional*, defaults to `False`):
-            Whether to use SwiGLU activation function.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
         use_sliding_window (`bool`, *optional*, defaults to `False`):
             Whether to use sliding window attention.
         sliding_window (`int`, *optional*, defaults to 4096):
             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
         max_window_layers (`int`, *optional*, defaults to 28):
-            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
-        ignored_index (`int`, *optional*, defaults to -100):
-            Target value that is ignored during loss computation.
+            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
+            additional layer afterwards will use SWA (Sliding Window Attention).
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        attention_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        pp_seg_method (`str`, *optional*, defaults to `"layer:Qwen2DecoderLayer"`):
-            Method for pipeline parallel segmentation.
 
     ```python
-    >>> from transformers import Qwen2Model, Qwen2Config
+    >>> from paddleformers.transformers import Qwen2Model, Qwen2Config
 
     >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()
@@ -112,25 +138,16 @@ def __init__(
         hidden_act="silu",
         max_position_embeddings=32768,
         initializer_range=0.02,
-        use_rmsnorm=True,
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
         rope_theta=10000.0,
-        pad_token_id=151643,
-        bos_token_id=151643,
-        eos_token_id=151643,
-        use_swiglu=False,
+        rope_scaling=None,
         use_sliding_window=False,
         sliding_window=4096,
         max_window_layers=28,
-        ignored_index=-100,
-        attention_bias=True,
-        attention_dropout=0.0,
-        rope_scaling_factor=1.0,
-        rope_scaling_type=None,
         layer_types=None,
-        pp_seg_method="layer:Qwen2DecoderLayer",
+        attention_dropout=0.0,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -140,9 +157,8 @@ def __init__(
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
         self.max_window_layers = max_window_layers
-        self.ignored_index = ignored_index
 
         # for backward compatibility
         if num_key_value_heads is None:
@@ -151,54 +167,31 @@ def __init__(
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
-        self.use_swiglu = use_swiglu
-        self.use_rmsnorm = use_rmsnorm
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
-        self.attention_bias = attention_bias
+        self.rope_scaling = rope_scaling
         self.attention_dropout = attention_dropout
-
-        self.rope_scaling_factor = rope_scaling_factor
-        self.rope_scaling_type = rope_scaling_type
-
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-
-        self.pp_seg_method = pp_seg_method
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # rope_config_validation(self)
 
         self.layer_types = layer_types
         if self.layer_types is None:
             self.layer_types = [
-                "sliding_attention" if self.use_sliding_window and i >= self.max_window_layers else "full_attention"
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
 
-        self.register_unsavable_keys(
-            [
-                "attention_bias",
-                "ignored_index",
-                "pad_token_id",
-                "rope_scaling_factor",
-                "rope_scaling_type",
-                "use_rmsnorm",
-                "use_swiglu",
-                "recompute",
-                "recompute_use_reentrant",
-                "recompute_granularity",
-                "pp_seg_method",
-                "dpo_config",
-                "kto_config",
-                "layer_types",
-            ]
-        )
+
+
+__all__ = ["Qwen2Config"]
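
For reference, a hedged usage sketch of the reworked config: a YaRN-style `rope_scaling` dict using the keys documented above, plus the new `sliding_window`/`layer_types` derivation. The small layer counts are made up for illustration, and the commented outputs assume the constructor logic in the hunks above.

```python
from paddleformers.transformers import Qwen2Config

config = Qwen2Config(
    num_hidden_layers=4,
    use_sliding_window=True,
    sliding_window=4096,
    max_window_layers=2,
    rope_scaling={"rope_type": "yarn", "factor": 4.0, "original_max_position_embeddings": 32768},
)

# sliding_window is kept only because use_sliding_window=True; layers at index
# >= max_window_layers fall back to sliding attention.
print(config.sliding_window)             # 4096
print(config.layer_types)                # ['full_attention', 'full_attention',
                                         #  'sliding_attention', 'sliding_attention']
print(config.rope_scaling["rope_type"])  # 'yarn'
```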
