Skip to content

Commit 99925e9

Browse files
authored
Merge pull request #775 from wangzhen38/multi-task-update
update para init
2 parents 3c2d20b + 54b6a96 commit 99925e9

File tree

2 files changed

+30
-22
lines changed

2 files changed

+30
-22
lines changed

models/multitask/mmoe/net.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,15 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
2828
self.gate_num = gate_num
2929

3030
self._param_expert = []
31+
expert_init = [pow(10, -i) for i in range(1, self.expert_num + 1)]
3132
for i in range(0, self.expert_num):
3233
linear = self.add_sublayer(
3334
name='expert_' + str(i),
3435
sublayer=nn.Linear(
3536
feature_size,
3637
expert_size,
37-
#initialize the weight randly
38-
weight_attr=nn.initializer.XavierUniform(),
38+
#initialize each expert respectively
39+
weight_attr=nn.initializer.Constant(value=expert_init[i]),
3940
bias_attr=nn.initializer.Constant(value=0.1),
4041
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
4142
name='expert_' + str(i)))
@@ -44,14 +45,15 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
4445
self._param_gate = []
4546
self._param_tower = []
4647
self._param_tower_out = []
48+
gate_init = [pow(10, -i) for i in range(1, self.gate_num + 1)]
4749
for i in range(0, self.gate_num):
4850
linear = self.add_sublayer(
4951
name='gate_' + str(i),
5052
sublayer=nn.Linear(
5153
feature_size,
5254
expert_num,
53-
#initialize the weight randly
54-
weight_attr=nn.initializer.XavierUniform(),
55+
#initialize every gate respectively
56+
weight_attr=nn.initializer.Constant(value=gate_init[i]),
5557
bias_attr=nn.initializer.Constant(value=0.1),
5658
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
5759
name='gate_' + str(i)))
@@ -62,8 +64,8 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
6264
sublayer=nn.Linear(
6365
expert_size,
6466
tower_size,
65-
#initialize the weight randly
66-
weight_attr=nn.initializer.XavierUniform(),
67+
#initialize each tower respectively
68+
weight_attr=nn.initializer.Constant(value=gate_init[i]),
6769
bias_attr=nn.initializer.Constant(value=0.1),
6870
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
6971
name='tower_' + str(i)))
@@ -74,8 +76,8 @@ def __init__(self, feature_size, expert_num, expert_size, tower_size,
7476
sublayer=nn.Linear(
7577
tower_size,
7678
2,
77-
#initialize the weight randly
78-
weight_attr=nn.initializer.XavierUniform(),
79+
#initialize each tower output respectively
80+
weight_attr=nn.initializer.Constant(value=gate_init[i]),
7981
bias_attr=nn.initializer.Constant(value=0.1),
8082
name='tower_out_' + str(i)))
8183
self._param_tower_out.append(linear)

models/multitask/ple/net.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -52,14 +52,15 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
5252
# task tower
5353
self._param_tower = []
5454
self._param_tower_out = []
55+
task_init = [pow(10, -i) for i in range(1, self.task_num + 1)]
5556
for i in range(0, self.task_num):
5657
linear = self.add_sublayer(
5758
name='tower_' + str(i),
5859
sublayer=nn.Linear(
5960
expert_size,
6061
tower_size,
61-
#initialize the weight randly
62-
weight_attr=nn.initializer.XavierUniform(),
62+
#initialize each task respectively
63+
weight_attr=nn.initializer.Constant(value=task_init[i]),
6364
bias_attr=nn.initializer.Constant(value=0.1),
6465
#bias_attr=paddle.ParamAttr(learning_rate=1.0),
6566
name='tower_' + str(i)))
@@ -70,8 +71,8 @@ def __init__(self, feature_size, task_num, exp_per_task, shared_num,
7071
sublayer=nn.Linear(
7172
tower_size,
7273
2,
73-
#initialize the weight randly
74-
weight_attr=nn.initializer.XavierUniform(),
74+
#initialize each task respectively
75+
weight_attr=nn.initializer.Constant(value=task_init[i]),
7576
bias_attr=nn.initializer.Constant(value=0.1),
7677
name='tower_out_' + str(i)))
7778
self._param_tower_out.append(linear)
@@ -113,43 +114,49 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
113114

114115
self._param_expert = []
115116
# task-specific expert part
117+
step = self.exp_per_task
116118
for i in range(0, self.task_num):
119+
exp_init = [
120+
pow(10, -k) for k in range(1 + i * step, step * (i + 1) + 1)
121+
]
117122
for j in range(0, self.exp_per_task):
118123
linear = self.add_sublayer(
119124
name=level_name + "_exp_" + str(i) + "_" + str(j),
120125
sublayer=nn.Linear(
121126
input_feature_size,
122127
expert_size,
123-
#initialize the weight randly
124-
weight_attr=nn.initializer.XavierUniform(),
128+
#initialize each expert respectively
129+
weight_attr=nn.initializer.Constant(value=exp_init[j]),
125130
bias_attr=nn.initializer.Constant(value=0.1),
126131
name=level_name + "_exp_" + str(i) + "_" + str(j)))
127132
self._param_expert.append(linear)
128-
133+
shared_exp_init = [pow(10, -i) for i in range(1, self.shared_num + 1)]
129134
# shared expert part
130135
for i in range(0, self.shared_num):
131136
linear = self.add_sublayer(
132137
name=level_name + "_exp_shared_" + str(i),
133138
sublayer=nn.Linear(
134139
input_feature_size,
135140
expert_size,
136-
#initialize the weight randly
137-
weight_attr=nn.initializer.XavierUniform(),
141+
#initialize each shared expert respectively
142+
weight_attr=nn.initializer.Constant(
143+
value=shared_exp_init[i]),
138144
bias_attr=nn.initializer.Constant(value=0.1),
139145
name=level_name + "_exp_shared_" + str(i)))
140146
self._param_expert.append(linear)
141147

142148
# task gate part
143149
self._param_gate = []
144150
cur_expert_num = self.exp_per_task + self.shared_num
151+
gate_init = [pow(10, -i) for i in range(1, self.task_num + 1)]
145152
for i in range(0, self.task_num):
146153
linear = self.add_sublayer(
147154
name=level_name + "_gate_" + str(i),
148155
sublayer=nn.Linear(
149156
input_feature_size,
150157
cur_expert_num,
151-
#initialize the weight randly
152-
weight_attr=nn.initializer.XavierUniform(),
158+
#initialize each gate respectively
159+
weight_attr=nn.initializer.Constant(value=gate_init[i]),
153160
bias_attr=nn.initializer.Constant(value=0.1),
154161
name=level_name + "_gate_" + str(i)))
155162
self._param_gate.append(linear)
@@ -162,8 +169,7 @@ def __init__(self, input_feature_size, task_num, exp_per_task, shared_num,
162169
sublayer=nn.Linear(
163170
input_feature_size,
164171
cur_expert_num,
165-
#initialize the weight randly
166-
weight_attr=nn.initializer.XavierUniform(),
172+
weight_attr=nn.initializer.Constant(value=0.1),
167173
bias_attr=nn.initializer.Constant(value=0.1),
168174
name=level_name + "_gate_shared_"))
169175
self._param_gate_shared = linear

0 commit comments

Comments
 (0)