@@ -58,7 +58,8 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
                 sublayer=nn.Linear(
                     expert_size,
                     tower_size,
-                    weight_attr=nn.initializer.Constant(value=0.1),
+                    # initialize the weight randomly
+                    weight_attr=nn.initializer.XavierUniform(),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     #bias_attr=paddle.ParamAttr(learning_rate=1.0),
                     name='tower_' + str(i)))
@@ -69,7 +70,8 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
                 sublayer=nn.Linear(
                     tower_size,
                     2,
-                    weight_attr=nn.initializer.Constant(value=0.1),
+                    # initialize the weight randomly
+                    weight_attr=nn.initializer.XavierUniform(),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     name='tower_out_' + str(i)))
             self._param_tower_out.append(linear)
@@ -118,7 +120,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
                     sublayer=nn.Linear(
                         input_feature_size,
                         expert_size,
-                        weight_attr=nn.initializer.Constant(value=0.1),
+                        # initialize the weight randomly
+                        weight_attr=nn.initializer.XavierUniform(),
                         bias_attr=nn.initializer.Constant(value=0.1),
                         name=level_name + "_exp_" + str(i) + "_" + str(j)))
                 self._param_expert.append(linear)
@@ -130,7 +133,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
                 sublayer=nn.Linear(
                     input_feature_size,
                     expert_size,
-                    weight_attr=nn.initializer.Constant(value=0.1),
+                    # initialize the weight randomly
+                    weight_attr=nn.initializer.XavierUniform(),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     name=level_name + "_exp_shared_" + str(i)))
             self._param_expert.append(linear)
@@ -144,7 +148,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
                 sublayer=nn.Linear(
                     input_feature_size,
                     cur_expert_num,
-                    weight_attr=nn.initializer.Constant(value=0.1),
+                    # initialize the weight randomly
+                    weight_attr=nn.initializer.XavierUniform(),
                     bias_attr=nn.initializer.Constant(value=0.1),
                     name=level_name + "_gate_" + str(i)))
             self._param_gate.append(linear)
@@ -157,7 +162,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
             sublayer=nn.Linear(
                 input_feature_size,
                 cur_expert_num,
-                weight_attr=nn.initializer.Constant(value=0.1),
+                # initialize the weight randomly
+                weight_attr=nn.initializer.XavierUniform(),
                 bias_attr=nn.initializer.Constant(value=0.1),
                 name=level_name + "_gate_shared_"))
         self._param_gate_shared = linear
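
Every hunk in this patch makes the same one-line change: the tower, expert, and gate Linear layers previously initialized their weights to the constant 0.1, and they now use Xavier (Glorot) uniform initialization instead; the biases keep the constant 0.1 initializer. With a constant start, parallel experts in a level compute identical outputs and receive identical gradients, so they cannot specialize. Below is a minimal sketch of the difference on a standalone layer, assuming Paddle 2.x; the 16x8 layer is illustrative and not part of the model.

    import paddle
    import paddle.nn as nn

    paddle.seed(42)

    # Old behavior: every weight starts at the same constant, so the
    # layer's rows are indistinguishable at step 0.
    constant_fc = nn.Linear(
        16, 8, weight_attr=nn.initializer.Constant(value=0.1))

    # New behavior: Xavier/Glorot uniform draws each weight from U(-b, b)
    # with b = sqrt(6 / (fan_in + fan_out)), breaking the symmetry while
    # keeping activation variance roughly stable across layers.
    xavier_fc = nn.Linear(
        16, 8, weight_attr=nn.initializer.XavierUniform())

    print(constant_fc.weight[0])  # a row of 0.1s
    print(xavier_fc.weight[0])    # small random values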