21
21
import sys
22
22
import numpy
23
23
import unittest
24
+ import os
24
25
25
26
26
27
def resnet_cifar10 (input , depth = 32 ):
@@ -92,7 +93,7 @@ def conv_block(input, num_filter, groups, dropouts):
92
93
return fc2
93
94
94
95
95
- def train (net_type , use_cuda , save_dirname ):
96
+ def train (net_type , use_cuda , save_dirname , is_local ):
96
97
classdim = 10
97
98
data_shape = [3 , 32 , 32 ]
98
99
@@ -117,7 +118,7 @@ def train(net_type, use_cuda, save_dirname):
117
118
test_program = fluid .default_main_program ().clone ()
118
119
119
120
optimizer = fluid .optimizer .Adam (learning_rate = 0.001 )
120
- optimizer .minimize (avg_cost )
121
+ optimize_ops , params_grads = optimizer .minimize (avg_cost )
121
122
122
123
BATCH_SIZE = 128
123
124
PASS_NUM = 1
@@ -133,38 +134,68 @@ def train(net_type, use_cuda, save_dirname):
133
134
place = fluid .CUDAPlace (0 ) if use_cuda else fluid .CPUPlace ()
134
135
exe = fluid .Executor (place )
135
136
feeder = fluid .DataFeeder (place = place , feed_list = [images , label ])
136
- exe .run (fluid .default_startup_program ())
137
-
138
- loss = 0.0
139
- for pass_id in range (PASS_NUM ):
140
- for batch_id , data in enumerate (train_reader ()):
141
- exe .run (feed = feeder .feed (data ))
142
-
143
- if (batch_id % 10 ) == 0 :
144
- acc_list = []
145
- avg_loss_list = []
146
- for tid , test_data in enumerate (test_reader ()):
147
- loss_t , acc_t = exe .run (program = test_program ,
148
- feed = feeder .feed (test_data ),
149
- fetch_list = [avg_cost , acc ])
150
- if math .isnan (float (loss_t )):
151
- sys .exit ("got NaN loss, training failed." )
152
- acc_list .append (float (acc_t ))
153
- avg_loss_list .append (float (loss_t ))
154
- break # Use 1 segment for speeding up CI
155
-
156
- acc_value = numpy .array (acc_list ).mean ()
157
- avg_loss_value = numpy .array (avg_loss_list ).mean ()
158
-
159
- print (
160
- 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}' .
161
- format (pass_id , batch_id + 1 ,
162
- float (avg_loss_value ), float (acc_value )))
163
-
164
- if acc_value > 0.01 : # Low threshold for speeding up CI
165
- fluid .io .save_inference_model (save_dirname , ["pixel" ],
166
- [predict ], exe )
167
- return
137
+
138
def train_loop(main_program):
    """Run the training loop on `main_program` until test accuracy is acceptable.

    Uses names from the enclosing `train` scope (`exe`, `feeder`,
    `train_reader`, `test_reader`, `test_program`, `avg_cost`, `acc`,
    `predict`, `save_dirname`, `PASS_NUM`).

    The startup program is executed once, then every training batch is fed
    to `main_program`.  On every 10th batch the first test batch is run
    through `test_program`; once the measured accuracy exceeds 0.01 the
    inference model is saved to `save_dirname` and the function returns.

    Args:
        main_program: the program passed to `exe.run` for each training
            batch.

    Raises:
        SystemExit: via `sys.exit` when the fetched test loss is NaN.
    """
    exe.run(fluid.default_startup_program())

    for pass_id in range(PASS_NUM):
        for batch_id, data in enumerate(train_reader()):
            exe.run(main_program, feed=feeder.feed(data))

            # Only evaluate on every 10th training batch.
            if (batch_id % 10) != 0:
                continue

            acc_list = []
            avg_loss_list = []
            for test_data in test_reader():
                loss_t, acc_t = exe.run(program=test_program,
                                        feed=feeder.feed(test_data),
                                        fetch_list=[avg_cost, acc])
                if math.isnan(float(loss_t)):
                    sys.exit("got NaN loss, training failed.")
                acc_list.append(float(acc_t))
                avg_loss_list.append(float(loss_t))
                break  # Use 1 segment for speeding up CI

            acc_value = numpy.array(acc_list).mean()
            avg_loss_value = numpy.array(avg_loss_list).mean()

            print(
                'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                format(pass_id, batch_id + 1,
                       float(avg_loss_value), float(acc_value)))

            if acc_value > 0.01:  # Low threshold for speeding up CI
                fluid.io.save_inference_model(save_dirname, ["pixel"],
                                              [predict], exe)
                return
170
+
171
+ if is_local :
172
+ train_loop (fluid .default_main_program ())
173
+ else :
174
+ port = os .getenv ("PADDLE_INIT_PORT" , "6174" )
175
+ pserver_ips = os .getenv ("PADDLE_INIT_PSERVERS" ) # ip,ip...
176
+ eplist = []
177
+ for ip in pserver_ips .split ("," ):
178
+ eplist .append (':' .join ([ip , port ]))
179
+ pserver_endpoints = "," .join (eplist ) # ip:port,ip:port...
180
+ trainers = int (os .getenv ("TRAINERS" ))
181
+ current_endpoint = os .getenv ("POD_IP" ) + ":" + port
182
+ trainer_id = int (os .getenv ("PADDLE_INIT_TRAINER_ID" ))
183
+ training_role = os .getenv ("TRAINING_ROLE" , "TRAINER" )
184
+ t = fluid .DistributeTranspiler ()
185
+ t .transpile (
186
+ optimize_ops ,
187
+ params_grads ,
188
+ trainer_id ,
189
+ pservers = pserver_endpoints ,
190
+ trainers = trainers )
191
+ if training_role == "PSERVER" :
192
+ pserver_prog = t .get_pserver_program (current_endpoint )
193
+ pserver_startup = t .get_startup_program (current_endpoint ,
194
+ pserver_prog )
195
+ exe .run (pserver_startup )
196
+ exe .run (pserver_prog )
197
+ elif training_role == "TRAINER" :
198
+ train_loop (t .get_trainer_program ())
168
199
169
200
170
201
def infer (use_cuda , save_dirname = None ):
@@ -196,14 +227,14 @@ def infer(use_cuda, save_dirname=None):
196
227
print ("infer results: " , results [0 ])
197
228
198
229
199
def main(net_type, use_cuda, is_local=True):
    """Train a model of the given type, then run inference on the saved model.

    Args:
        net_type: network name; also embedded in the model directory name.
        use_cuda: whether CUDA is requested.  When it is requested but
            `fluid.core.is_compiled_with_cuda()` is false, nothing is run.
        is_local: forwarded unchanged to `train`.
    """
    cuda_requested_but_missing = (
        use_cuda and not fluid.core.is_compiled_with_cuda())
    if cuda_requested_but_missing:
        return

    # Directory for saving the trained model
    save_dirname = "".join(
        ("image_classification_", net_type, ".inference.model"))

    train(net_type, use_cuda, save_dirname, is_local)
    infer(use_cuda, save_dirname)
208
239
209
240
0 commit comments