Skip to content

Commit 5e733be

Browse files
authored
Merge pull request #724 from wangzhen38/mmoe_ple_fix
fix the weight initializer style of MMOE and PLE
2 parents f33c832 + 226ea56 commit 5e733be

File tree

2 files changed

+20
-10
lines changed

2 files changed

+20
-10
lines changed

models/multitask/mmoe/net.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
3434
sublayer=nn.Linear(
3535
feature_size,
3636
expert_size,
37-
weight_attr=nn.initializer.Constant(value=0.1),
37+
# initialize the weight randomly
38+
weight_attr=nn.initializer.XavierUniform(),
3839
bias_attr=nn.initializer.Constant(value=0.1),
3940
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
4041
name='expert_' + str(i)))
@@ -49,7 +50,8 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
4950
sublayer=nn.Linear(
5051
feature_size,
5152
expert_num,
52-
weight_attr=nn.initializer.Constant(value=0.1),
53+
# initialize the weight randomly
54+
weight_attr=nn.initializer.XavierUniform(),
5355
bias_attr=nn.initializer.Constant(value=0.1),
5456
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
5557
name='gate_' + str(i)))
@@ -60,7 +62,8 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
6062
sublayer=nn.Linear(
6163
expert_size,
6264
tower_size,
63-
weight_attr=nn.initializer.Constant(value=0.1),
65+
# initialize the weight randomly
66+
weight_attr=nn.initializer.XavierUniform(),
6467
bias_attr=nn.initializer.Constant(value=0.1),
6568
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
6669
name='tower_' + str(i)))
@@ -71,7 +74,8 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
7174
sublayer=nn.Linear(
7275
tower_size,
7376
2,
74-
weight_attr=nn.initializer.Constant(value=0.1),
77+
# initialize the weight randomly
78+
weight_attr=nn.initializer.XavierUniform(),
7579
bias_attr=nn.initializer.Constant(value=0.1),
7680
name='tower_out_' + str(i)))
7781
self._param_tower_out.append(linear)

models/multitask/ple/net.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
5858
sublayer=nn.Linear(
5959
expert_size,
6060
tower_size,
61-
weight_attr=nn.initializer.Constant(value=0.1),
61+
# initialize the weight randomly
62+
weight_attr=nn.initializer.XavierUniform(),
6263
bias_attr=nn.initializer.Constant(value=0.1),
6364
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
6465
name='tower_' + str(i)))
@@ -69,7 +70,8 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
6970
sublayer=nn.Linear(
7071
tower_size,
7172
2,
72-
weight_attr=nn.initializer.Constant(value=0.1),
73+
# initialize the weight randomly
74+
weight_attr=nn.initializer.XavierUniform(),
7375
bias_attr=nn.initializer.Constant(value=0.1),
7476
name='tower_out_' + str(i)))
7577
self._param_tower_out.append(linear)
@@ -118,7 +120,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
118120
sublayer=nn.Linear(
119121
input_feature_size,
120122
expert_size,
121-
weight_attr=nn.initializer.Constant(value=0.1),
123+
# initialize the weight randomly
124+
weight_attr=nn.initializer.XavierUniform(),
122125
bias_attr=nn.initializer.Constant(value=0.1),
123126
name=level_name + "_exp_" + str(i) + "_" + str(j)))
124127
self._param_expert.append(linear)
@@ -130,7 +133,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
130133
sublayer=nn.Linear(
131134
input_feature_size,
132135
expert_size,
133-
weight_attr=nn.initializer.Constant(value=0.1),
136+
# initialize the weight randomly
137+
weight_attr=nn.initializer.XavierUniform(),
134138
bias_attr=nn.initializer.Constant(value=0.1),
135139
name=level_name + "_exp_shared_" + str(i)))
136140
self._param_expert.append(linear)
@@ -144,7 +148,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
144148
sublayer=nn.Linear(
145149
input_feature_size,
146150
cur_expert_num,
147-
weight_attr=nn.initializer.Constant(value=0.1),
151+
# initialize the weight randomly
152+
weight_attr=nn.initializer.XavierUniform(),
148153
bias_attr=nn.initializer.Constant(value=0.1),
149154
name=level_name + "_gate_" + str(i)))
150155
self._param_gate.append(linear)
@@ -157,7 +162,8 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
157162
sublayer=nn.Linear(
158163
input_feature_size,
159164
cur_expert_num,
160-
weight_attr=nn.initializer.Constant(value=0.1),
165+
# initialize the weight randomly
166+
weight_attr=nn.initializer.XavierUniform(),
161167
bias_attr=nn.initializer.Constant(value=0.1),
162168
name=level_name + "_gate_shared_"))
163169
self._param_gate_shared = linear

0 commit comments

Comments
 (0)