from ..configuration_utils import PretrainedConfig, layer_type_validation

-__all__ = [
-    "Qwen2Config",
-]
-

class Qwen2Config(PretrainedConfig):
    r"""
@@ -47,18 +43,16 @@ class Qwen2Config(PretrainedConfig):
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        use_rmsnorm (`bool`, *optional*, defaults to `True`):
-            Whether to use RMSNorm instead of LayerNorm.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
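As a quick illustration of the `num_key_value_heads` values documented in this hunk, here is a minimal sketch of the MHA/GQA/MQA settings. It only sets the two head counts and uses the import path shown in the class docstring later in this diff; everything else keeps its defaults.

```python
from paddleformers.transformers import Qwen2Config

# MHA: one key/value head per query head (num_key_value_heads == num_attention_heads)
mha = Qwen2Config(num_attention_heads=32, num_key_value_heads=32)

# GQA: query heads are grouped; here 4 query heads share each key/value head
gqa = Qwen2Config(num_attention_heads=32, num_key_value_heads=8)

# MQA: a single key/value head shared by all query heads
mqa = Qwen2Config(num_attention_heads=32, num_key_value_heads=1)
```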
@@ -68,25 +62,57 @@ class Qwen2Config(PretrainedConfig):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
-        use_swiglu (`bool`, *optional*, defaults to `False`):
-            Whether to use SwiGLU activation function.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
-            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
-        ignored_index (`int`, *optional*, defaults to -100):
-            Target value that is ignored during loss computation.
+            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
+            additional layer afterwards will use SWA (Sliding Window Attention).
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
-        attention_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        pp_seg_method (`str`, *optional*, defaults to `"layer:Qwen2DecoderLayer"`):
-            Method for pipeline parallel segmentation.

    ```python
-    >>> from transformers import Qwen2Model, Qwen2Config
+    >>> from paddleformers.transformers import Qwen2Model, Qwen2Config

    >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()
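To make the new `rope_scaling` dictionary concrete, here is an illustrative sketch for YaRN-style scaling. The factor and context lengths are placeholder values chosen per the `factor` semantics above, not tuned settings, and note that the call to `rope_config_validation` is commented out in this diff, so the dict is not validated at construction time.

```python
from paddleformers.transformers import Qwen2Config

# Hypothetical example: extend the usable context 4x with YaRN RoPE scaling,
# updating max_position_embeddings accordingly (4 * 32768 = 131072).
config = Qwen2Config(
    max_position_embeddings=131072,
    rope_scaling={"rope_type": "yarn", "factor": 4.0},
)

# The backward-compatibility shim in __init__ also accepts the legacy key name "type".
legacy = Qwen2Config(rope_scaling={"type": "yarn", "factor": 4.0})
assert legacy.rope_scaling["rope_type"] == "yarn"
```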
@@ -112,25 +138,16 @@ def __init__(
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
-        use_rmsnorm=True,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
-        pad_token_id=151643,
-        bos_token_id=151643,
-        eos_token_id=151643,
-        use_swiglu=False,
+        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
-        ignored_index=-100,
-        attention_bias=True,
-        attention_dropout=0.0,
-        rope_scaling_factor=1.0,
-        rope_scaling_type=None,
        layer_types=None,
-        pp_seg_method="layer:Qwen2DecoderLayer",
+        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
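With `pad_token_id`, `bos_token_id` and `eos_token_id` removed from the explicit signature in this hunk, callers would pass them through `**kwargs` instead. The sketch below assumes the base `PretrainedConfig.__init__` picks up special-token kwargs the same way the upstream transformers base class does; that behaviour is not shown in this diff.

```python
from paddleformers.transformers import Qwen2Config

# The special token ids are no longer named parameters of Qwen2Config.__init__;
# they are forwarded via **kwargs to PretrainedConfig (assumption: the base class
# handles special-token kwargs as in upstream transformers).
config = Qwen2Config(
    pad_token_id=151643,
    bos_token_id=151643,
    eos_token_id=151643,
)
print(config.eos_token_id)  # expected: 151643
```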
@@ -140,9 +157,8 @@ def __init__(
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
        self.max_window_layers = max_window_layers
-        self.ignored_index = ignored_index

        # for backward compatibility
        if num_key_value_heads is None:
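A small sketch of the behavioural change introduced by the `self.sliding_window` line above: the window size is now stored as `None` unless `use_sliding_window` is enabled.

```python
from paddleformers.transformers import Qwen2Config

# use_sliding_window defaults to False, so the window size is dropped (stored as None)
cfg = Qwen2Config(sliding_window=4096)
assert cfg.sliding_window is None

# With sliding window attention enabled, the size is kept
cfg = Qwen2Config(use_sliding_window=True, sliding_window=4096)
assert cfg.sliding_window == 4096
```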
@@ -151,54 +167,31 @@ def __init__(
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
-        self.use_swiglu = use_swiglu
-        self.use_rmsnorm = use_rmsnorm
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
-        self.attention_bias = attention_bias
+        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
-
-        self.rope_scaling_factor = rope_scaling_factor
-        self.rope_scaling_type = rope_scaling_type
-
-        self.pad_token_id = pad_token_id
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-
-        self.pp_seg_method = pp_seg_method
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
-                "sliding_attention" if self.use_sliding_window and i >= self.max_window_layers else "full_attention"
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

-        self.register_unsavable_keys(
-            [
-                "attention_bias",
-                "ignored_index",
-                "pad_token_id",
-                "rope_scaling_factor",
-                "rope_scaling_type",
-                "use_rmsnorm",
-                "use_swiglu",
-                "recompute",
-                "recompute_use_reentrant",
-                "recompute_granularity",
-                "pp_seg_method",
-                "dpo_config",
-                "kto_config",
-                "layer_types",
-            ]
-        )
+
+
+__all__ = ["Qwen2Config"]
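As a sanity check on the rewritten `layer_types` default (and the new `max_window_layers` semantics documented earlier in this diff), a small illustrative sketch with a hypothetical 4-layer configuration:

```python
from paddleformers.transformers import Qwen2Config

# With sliding windows enabled, the first `max_window_layers` layers keep full
# attention and every later layer switches to sliding window attention.
cfg = Qwen2Config(
    num_hidden_layers=4,
    use_sliding_window=True,
    sliding_window=1024,
    max_window_layers=2,
)
assert cfg.layer_types == [
    "full_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
]

# With sliding windows disabled (the default), self.sliding_window is None,
# so every layer falls back to full attention.
assert Qwen2Config(num_hidden_layers=4).layer_types == ["full_attention"] * 4
```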