@@ -124,6 +124,9 @@ func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
124
124
125
125
// LoadCheckpoint loads checkpoint from file.
126
126
func LoadCheckpoint (e * EtcdClient , idx int ) (Checkpoint , error ) {
127
+ log .Info ("Loading checkpoint" , "pserver index" , idx )
128
+ defer traceTime (time .Now (), "load checkpoint" )
129
+
127
130
cpMeta , err := loadMeta (e , idx )
128
131
if err != nil {
129
132
return nil , err
@@ -178,6 +181,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient
178
181
func (s * Service ) InitParam (paramWithConfigs ParameterWithConfig , _ * int ) error {
179
182
select {
180
183
case <- s .initialized :
184
+ log .Warn ("init param called but parameters already initialized." )
181
185
return errors .New (AlreadyInitialized )
182
186
default :
183
187
}
@@ -191,6 +195,13 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error
191
195
// properly memory aligned, if not, make copy to a memory
192
196
// aligned region.
193
197
s .optMap [paramWithConfigs .Param .Name ] = newOptimizer (paramWithConfigs , nil )
198
+ log .Info (
199
+ "init parameter" ,
200
+ "name" , paramWithConfigs .Param .Name ,
201
+ "config len" , len (paramWithConfigs .Config ),
202
+ "param len" , len (paramWithConfigs .Param .Content ),
203
+ "type" , paramWithConfigs .Param .ElementType ,
204
+ )
194
205
return nil
195
206
}
196
207
@@ -199,6 +210,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error
199
210
func (s * Service ) FinishInitParams (_ int , _ * int ) error {
200
211
select {
201
212
case <- s .initialized :
213
+ log .Warn ("finished init param called but parameters already initialized." )
202
214
return errors .New (AlreadyInitialized )
203
215
default :
204
216
}
@@ -213,6 +225,8 @@ func (s *Service) FinishInitParams(_ int, _ *int) error {
213
225
}
214
226
}
215
227
}()
228
+
229
+ log .Info ("init parameter finished." )
216
230
return nil
217
231
}
218
232
@@ -222,6 +236,7 @@ func (s *Service) SendGrad(g Gradient, _ *int) error {
222
236
select {
223
237
case <- s .initialized :
224
238
default :
239
+ log .Warn ("received gradient before initialization." , "name" , g .Name , "size" , len (g .Content ), "type" , g .ElementType )
225
240
return errors .New (Uninitialized )
226
241
}
227
242
@@ -233,6 +248,7 @@ func (s *Service) SendGrad(g Gradient, _ *int) error {
233
248
return fmt .Errorf ("parameter: %s does not exist" , g .Name )
234
249
}
235
250
251
+ log .Info ("received gradient from trainer, updating gradient." , "name" , g .Name , "size" , len (g .Content ), "type" , g .ElementType )
236
252
return o .UpdateParameter (g )
237
253
}
238
254
@@ -244,6 +260,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
244
260
245
261
opt , ok := s .optMap [name ]
246
262
if ! ok {
263
+ log .Warn ("trainer wants to get a parameter that does not exist." , "name" , name )
247
264
return fmt .Errorf ("parameter: %s does not exist" , name )
248
265
}
249
266
@@ -257,6 +274,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
257
274
parameter .Name = name
258
275
parameter .ElementType = opt .elementType
259
276
parameter .Content = opt .GetWeights ()
277
+ log .Info ("sending parameter to the trainer" , "name" , parameter .Name , "size" , len (parameter .Content ), "type" , parameter .ElementType )
260
278
return nil
261
279
}
262
280
0 commit comments