 class MyDataset(Dataset):
+
     def __init__(self, num_samples):
         super(MyDataset, self).__init__()
         self.num_samples = num_samples
@@ -61,6 +62,7 @@ def __len__(self):


 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -69,43 +71,45 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None

-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")

     def forward(self, input):
-        out = auto.shard_op(
-            self.norm, dist_attr={"process_mesh": PP_MESH_0})(input)[0]
-        out = self.linear0(input)
+        out = auto.shard_op(self.norm, dist_attr={"process_mesh":
+                                                  PP_MESH_0})(input)
+        out = self.linear0(out)
         out = F.gelu(out, approximate=True)
-        out = auto.shard_op(
-            self.linear1, dist_attr={"process_mesh": PP_MESH_1})(out)[0]
+        out = auto.shard_op(self.linear1, dist_attr={"process_mesh":
+                                                     PP_MESH_1})(out)
         out = self.dropout(out)
         out = self.linear2(out)
         return out


 def train():
-    mlp = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02)
+    mlp = MLPLayer(hidden_size=hidden_size,
+                   intermediate_size=4 * hidden_size,
+                   dropout_ratio=0.1,
+                   initializer_range=0.02)
     loss = paddle.nn.CrossEntropyLoss()
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None)
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
+                                                     grad_clip=None)

     dataset = MyDataset(batch_num * batch_size)
     inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x')
@@ -119,11 +123,10 @@ def train():
     dist_strategy.semi_auto = True
     fleet.init(is_collective=True, strategy=dist_strategy)

-    engine = Engine(
-        mlp,
-        inputs_spec=inputs_spec,
-        labels_spec=labels_spec,
-        strategy=dist_strategy)
+    engine = Engine(mlp,
+                    inputs_spec=inputs_spec,
+                    labels_spec=labels_spec,
+                    strategy=dist_strategy)
     engine.prepare(optimizer, loss)
     engine.fit(dataset,
                batch_size=batch_size,
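For orientation, here is a minimal standalone sketch of the module-level names the `train()` function in this diff relies on (`batch_size`, `batch_num`, `hidden_size`, `PP_MESH_0`, `PP_MESH_1`, and the imports). The concrete values, the two-stage mesh layout, the Engine import path, and the launch command are illustrative assumptions, not part of the patch.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset
from paddle.static import InputSpec
from paddle.distributed import fleet
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.engine import Engine  # assumed import path

# Illustrative sizes; the real script may use different values.
batch_size = 4
batch_num = 30
hidden_size = 1024

# Assumed two-stage pipeline layout: one process per stage, matching the
# PP_MESH_0 / PP_MESH_1 names used in forward() above.
PP_MESH_0 = auto.ProcessMesh([0])
PP_MESH_1 = auto.ProcessMesh([1])

if __name__ == "__main__":
    # Typically launched on two devices, e.g.:
    #   python -m paddle.distributed.launch --gpus "0,1" <this file>
    train()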