@@ -52,14 +52,15 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
         # task tower
         self._param_tower = []
         self._param_tower_out = []
+        task_init = [pow(10, -i) for i in range(1, self.task_num + 1)]
         for i in range(0, self.task_num):
             linear = self.add_sublayer(
                 name='tower_' + str(i),
                 sublayer=nn.Linear(
                     expert_size,
                     tower_size,
-                    #initialize the weight randly
-                    weight_attr=nn.initializer.XavierUniform(),
+                    # initialize each task tower with a distinct constant
+                    weight_attr=nn.initializer.Constant(value=task_init[i]),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     #bias_attr=paddle.ParamAttr(learning_rate=1.0),
                     name='tower_' + str(i)))
@@ -70,8 +71,8 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
                 sublayer=nn.Linear(
                     tower_size,
                     2,
-                    #initialize the weight randly
-                    weight_attr=nn.initializer.XavierUniform(),
+                    # initialize each tower output with a distinct constant
+                    weight_attr=nn.initializer.Constant(value=task_init[i]),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     name='tower_out_' + str(i)))
             self._param_tower_out.append(linear)
@@ -113,43 +114,49 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
 
         self._param_expert = []
         # task-specific expert part
+        step = self.exp_per_task
         for i in range(0, self.task_num):
+            exp_init = [
+                pow(10, -k) for k in range(1 + i * step, step * (i + 1) + 1)
+            ]
             for j in range(0, self.exp_per_task):
                 linear = self.add_sublayer(
                     name=level_name + "_exp_" + str(i) + "_" + str(j),
                     sublayer=nn.Linear(
                         input_feature_size,
                         expert_size,
-                        #initialize the weight randly
-                        weight_attr=nn.initializer.XavierUniform(),
+                        # initialize each expert with a distinct constant
+                        weight_attr=nn.initializer.Constant(value=exp_init[j]),
                         bias_attr=nn.initializer.Constant(value=0.1),
                         name=level_name + "_exp_" + str(i) + "_" + str(j)))
                 self._param_expert.append(linear)
-
+        shared_exp_init = [pow(10, -i) for i in range(1, self.shared_num + 1)]
         # shared expert part
         for i in range(0, self.shared_num):
             linear = self.add_sublayer(
                 name=level_name + "_exp_shared_" + str(i),
                 sublayer=nn.Linear(
                     input_feature_size,
                     expert_size,
-                    #initialize the weight randly
-                    weight_attr=nn.initializer.XavierUniform(),
+                    # initialize each shared expert with a distinct constant
+                    weight_attr=nn.initializer.Constant(
+                        value=shared_exp_init[i]),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     name=level_name + "_exp_shared_" + str(i)))
             self._param_expert.append(linear)
 
         # task gate part
         self._param_gate = []
         cur_expert_num = self.exp_per_task + self.shared_num
+        gate_init = [pow(10, -i) for i in range(1, self.task_num + 1)]
         for i in range(0, self.task_num):
             linear = self.add_sublayer(
                 name=level_name + "_gate_" + str(i),
                 sublayer=nn.Linear(
                     input_feature_size,
                     cur_expert_num,
-                    #initialize the weight randly
-                    weight_attr=nn.initializer.XavierUniform(),
+                    # initialize each gate with a distinct constant
+                    weight_attr=nn.initializer.Constant(value=gate_init[i]),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     name=level_name + "_gate_" + str(i)))
             self._param_gate.append(linear)
@@ -162,8 +169,7 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
             sublayer=nn.Linear(
                 input_feature_size,
                 cur_expert_num,
-                #initialize the weight randly
-                weight_attr=nn.initializer.XavierUniform(),
+                weight_attr=nn.initializer.Constant(value=0.1),
                 bias_attr=nn.initializer.Constant(value=0.1),
                 name=level_name + "_gate_shared_"))
         self._param_gate_shared = linear
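
For review context: the change swaps every `XavierUniform` random initializer for `nn.initializer.Constant`, so each tower, task-specific expert, shared expert, and gate starts from its own power-of-ten value. Fixed constants like these make runs reproducible and let you tell the groups apart when inspecting parameters. The sketch below is illustrative only and not part of the commit; the sizes are invented, and it simply evaluates the same comprehensions the diff adds to show which value each sublayer receives.

```python
# Illustrative sketch of the constant-initialization scheme above.
# task_num, exp_per_task and shared_num are made-up example sizes,
# not values taken from the repo's configs.
task_num, exp_per_task, shared_num = 2, 3, 1

# Each task tower (and each task gate) gets 10^-1, 10^-2, ...
task_init = [pow(10, -i) for i in range(1, task_num + 1)]
print(task_init)  # [0.1, 0.01]

# Task i's experts get the next run of consecutive powers of ten,
# so no two task-specific experts share a starting value.
step = exp_per_task
for i in range(task_num):
    exp_init = [pow(10, -k) for k in range(1 + i * step, step * (i + 1) + 1)]
    print(i, exp_init)
# 0 [0.1, 0.01, 0.001]
# 1 [0.0001, 1e-05, 1e-06]

# Shared experts restart from 10^-1, so their values are distinct only
# within the shared group, not across all sublayers.
shared_exp_init = [pow(10, -i) for i in range(1, shared_num + 1)]
print(shared_exp_init)  # [0.1]
```

One point worth flagging: the sequences restart at 10^-1 for towers, gates, and shared experts, so values are unique only within each group, and with many experts the later constants (1e-06 and below) are nearly a zero initialization.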