Skip to content

Commit 2711bd9

Browse files
authored
Merge pull request #5068 from helinwang/pserver_log
add detailed log for the pserver
2 parents 288ffdd + fc57c09 commit 2711bd9

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

go/pserver/service.go

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,9 @@ func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
124124

125125
// LoadCheckpoint loads checkpoint from file.
126126
func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
127+
log.Info("Loading checkpoint", "pserver index", idx)
128+
defer traceTime(time.Now(), "load checkpoint")
129+
127130
cpMeta, err := loadMeta(e, idx)
128131
if err != nil {
129132
return nil, err
@@ -178,6 +181,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient
178181
func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
179182
select {
180183
case <-s.initialized:
184+
log.Warn("init param called but parameters already initialized.")
181185
return errors.New(AlreadyInitialized)
182186
default:
183187
}
@@ -191,6 +195,13 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error
191195
// properly memory aligned, if not, make copy to a memory
192196
// aligned region.
193197
s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
198+
log.Info(
199+
"init parameter",
200+
"name", paramWithConfigs.Param.Name,
201+
"config len", len(paramWithConfigs.Config),
202+
"param len", len(paramWithConfigs.Param.Content),
203+
"type", paramWithConfigs.Param.ElementType,
204+
)
194205
return nil
195206
}
196207

@@ -199,6 +210,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error
199210
func (s *Service) FinishInitParams(_ int, _ *int) error {
200211
select {
201212
case <-s.initialized:
213+
log.Warn("finished init param called but parameters already initialized.")
202214
return errors.New(AlreadyInitialized)
203215
default:
204216
}
@@ -213,6 +225,8 @@ func (s *Service) FinishInitParams(_ int, _ *int) error {
213225
}
214226
}
215227
}()
228+
229+
log.Info("init parameter finished.")
216230
return nil
217231
}
218232

@@ -222,6 +236,7 @@ func (s *Service) SendGrad(g Gradient, _ *int) error {
222236
select {
223237
case <-s.initialized:
224238
default:
239+
log.Warn("received gradient before initialization.", "name", g.Name, "size", len(g.Content), "type", g.ElementType)
225240
return errors.New(Uninitialized)
226241
}
227242

@@ -233,6 +248,7 @@ func (s *Service) SendGrad(g Gradient, _ *int) error {
233248
return fmt.Errorf("parameter: %s does not exist", g.Name)
234249
}
235250

251+
log.Info("received gradient from trainer, updating gradient.", "name", g.Name, "size", len(g.Content), "type", g.ElementType)
236252
return o.UpdateParameter(g)
237253
}
238254

@@ -244,6 +260,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
244260

245261
opt, ok := s.optMap[name]
246262
if !ok {
263+
log.Warn("trainer wants to get a parameter that does not exist.", "name", name)
247264
return fmt.Errorf("parameter: %s does not exist", name)
248265
}
249266

@@ -257,6 +274,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
257274
parameter.Name = name
258275
parameter.ElementType = opt.elementType
259276
parameter.Content = opt.GetWeights()
277+
log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType)
260278
return nil
261279
}
262280

0 commit comments

Comments
 (0)