@@ -96,6 +96,26 @@ class UnifiedTransformerPretrainedModel(PretrainedModel):
             "eos_token_id": 2,
             "mask_token_id": 30000,
         },
+        "plato-xl": {
+            "vocab_size": 8001,
+            "hidden_size": 3072,
+            "num_hidden_layers": 72,
+            "num_attention_heads": 32,
+            "intermediate_size": 18432,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "attention_probs_dropout_prob": 0.1,
+            "normalize_before": True,
+            "max_position_embeddings": 1024,
+            "type_vocab_size": 2,
+            "role_type_size": 128,
+            "initializer_range": 0.02,
+            "unk_token_id": 0,
+            "pad_token_id": 0,
+            "bos_token_id": 1,
+            "eos_token_id": 2,
+            "mask_token_id": 8000,
+        }
     }
     resource_files_names = {"model_state": "model_state.pdparams"}
     pretrained_resource_files_map = {
@@ -106,6 +126,8 @@ class UnifiedTransformerPretrainedModel(PretrainedModel):
             "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn-luge.pdparams",
             "plato-mini":
             "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-mini.pdparams",
+            "plato-xl":
+            "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-xl.pdparams",
         }
     }
     base_model_prefix = "unified_transformer"
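
Usage note (not part of the diff): with the "plato-xl" config entry and weight URL registered above, the checkpoint should be loadable by name through PaddleNLP's standard from_pretrained mechanism. A minimal sketch, assuming nothing beyond this registration is needed on the model side:

from paddlenlp.transformers import UnifiedTransformerModel

# Downloads plato-xl.pdparams from the registered URL and builds the model
# from the "plato-xl" entry of pretrained_init_configuration.
model = UnifiedTransformerModel.from_pretrained("plato-xl")
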
@@ -115,7 +137,9 @@ def init_weights(self, layer):
         if isinstance(layer, (nn.Linear, nn.Embedding)):
             # In the dygraph mode, use the `set_value` to reset the parameter directly,
             # and reset the `state_dict` to update parameter in static mode.
-            if isinstance(layer.weight, paddle.Tensor):
+            if isinstance(
+                    layer.weight,
+                    paddle.Tensor) and paddle.get_default_dtype() == "float32":
                 layer.weight.set_value(
                     paddle.tensor.normal(
                         mean=0.0,
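
The new guard only re-initializes weights when the default dtype is float32; the motivation below is my assumption rather than something stated in the diff. A sketch of the setup this appears to enable, namely building a checkpoint as large as plato-xl under a half-precision default dtype:

import paddle
from paddlenlp.transformers import UnifiedTransformerModel

paddle.set_default_dtype("float16")
# With the dtype guard, init_weights leaves the float16 parameters untouched
# instead of overwriting them with float32 normal samples before loading.
model = UnifiedTransformerModel.from_pretrained("plato-xl")
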
@@ -133,20 +157,27 @@ def __init__(self,
                  hidden_size=768,
                  hidden_dropout_prob=0.1,
                  max_position_embeddings=512,
-                 type_vocab_size=2):
+                 type_vocab_size=2,
+                 role_type_size=None):
         super(UnifiedTransformerEmbeddings, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
         self.position_embeddings = nn.Embedding(max_position_embeddings,
                                                 hidden_size)
         self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
+        self.role_embeddings = None if role_type_size is None else nn.Embedding(
+            role_type_size, hidden_size)
         self.dropout = nn.Dropout(hidden_dropout_prob)

-    def forward(self, input_ids, token_type_ids, position_ids):
+    def forward(self, input_ids, token_type_ids, position_ids, role_ids=None):
         input_embedings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)

         embeddings = input_embedings + position_embeddings + token_type_embeddings
+
+        if self.role_embeddings is not None:
+            embeddings += self.role_embeddings(role_ids)
+
         embeddings = self.dropout(embeddings)
         return embeddings

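
To make the new role-embedding path concrete, here is a self-contained sketch of the same pattern with toy sizes (module names and tensor values are illustrative only, not taken from the diff):

import paddle
import paddle.nn as nn

word = nn.Embedding(8001, 16)
pos = nn.Embedding(32, 16)
tok_type = nn.Embedding(2, 16)
role = nn.Embedding(128, 16)               # only created when role_type_size is given

input_ids = paddle.to_tensor([[5, 6, 7]])
position_ids = paddle.to_tensor([[0, 1, 2]])
token_type_ids = paddle.zeros_like(input_ids)
role_ids = paddle.to_tensor([[0, 1, 0]])   # e.g. alternating speaker roles

# Sum of word, position and token-type embeddings, as in forward() above,
# plus the optional role term.
emb = word(input_ids) + pos(position_ids) + tok_type(token_type_ids)
emb += role(role_ids)
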
@@ -221,25 +252,25 @@ class UnifiedTransformerModel(UnifiedTransformerPretrainedModel):
             The id of special token `mask_token`. Defaults to 30000.
     """

-    def __init__(
-            self,
-            vocab_size,
-            hidden_size=768,
-            num_hidden_layers=12,
-            num_attention_heads=12,
-            intermediate_size=3072,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            normalize_before=True,
-            max_position_embeddings=512,
-            type_vocab_size=2,
-            initializer_range=0.02,
-            unk_token_id=0,
-            pad_token_id=0,
-            bos_token_id=1,
-            eos_token_id=2,
-            mask_token_id=30000, ):
+    def __init__(self,
+                 vocab_size,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 normalize_before=True,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 unk_token_id=0,
+                 pad_token_id=0,
+                 bos_token_id=1,
+                 eos_token_id=2,
+                 mask_token_id=30000,
+                 role_type_size=None):
         super(UnifiedTransformerModel, self).__init__()
         self.unk_token_id = unk_token_id
         self.pad_token_id = pad_token_id
@@ -250,7 +281,7 @@ def __init__(

         self.embeddings = UnifiedTransformerEmbeddings(
             vocab_size, hidden_size, hidden_dropout_prob,
-            max_position_embeddings, type_vocab_size)
+            max_position_embeddings, type_vocab_size, role_type_size)
         encoder_layer = nn.TransformerEncoderLayer(
             hidden_size,
             num_attention_heads,
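
Taken together, the new role_type_size argument simply flows from UnifiedTransformerModel into the embedding layer, which creates role_embeddings only when it is set. A toy construction sketch (sizes are illustrative; only role_type_size matches the plato-xl entry):

from paddlenlp.transformers import UnifiedTransformerModel

model = UnifiedTransformerModel(
    vocab_size=8001,
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=256,
    role_type_size=128)  # embeddings.role_embeddings becomes nn.Embedding(128, 64)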