@@ -136,7 +136,6 @@ def __init__(self,
         # config for checkpoint
         # only chief worker will save variables
         self.trainer_id = 0
-        self.chief = True
         self.checkpoint_cfg = checkpoint_config
         if self.checkpoint_cfg:
             assert isinstance(self.checkpoint_cfg, CheckpointConfig)
@@ -201,7 +200,6 @@ def _transpile_nccl2_dist(self):
             self.nccl_id_var = None
         else:
             self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-            self.chief = self.trainer_id == 0
             port = os.getenv("PADDLE_PSERVER_PORT")
             worker_ips = os.getenv("PADDLE_TRAINER_IPS")
             worker_endpoints = []
@@ -250,7 +248,7 @@ def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
         # the unique trainer id, starting from 0, needed by trainer
         # only
         self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        self.chief = self.trainer_id == 0
+
         # the role, should be either PSERVER or TRAINER
         training_role = os.getenv("PADDLE_TRAINING_ROLE")
         with self._prog_and_scope_guard():
@@ -456,7 +454,6 @@ def _save_checkpoint(self, epoch_id, step_id):
             executor=exe,
             checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
             trainer_id=self.trainer_id,
-            is_chief=self.chief,
             trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
             main_program=self.train_program,
             max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
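
Note: the removed `self.chief` flag was always derivable from `trainer_id` (trainer 0 is the chief worker that saves checkpoint variables), so callers such as the checkpoint saver no longer need an explicit `is_chief` argument. Below is a minimal sketch of that idea; the helper name `_is_chief` is illustrative only, not part of the Paddle API:

```python
import os


def _is_chief(trainer_id):
    # By convention, trainer 0 is the chief worker; only the chief
    # saves checkpoint variables, so a separate boolean flag on the
    # trainer object is redundant.
    return trainer_id == 0


# The id comes from the PADDLE_TRAINER_ID environment variable,
# defaulting to "0" for single-process (non-distributed) training,
# mirroring how _dist_transpile_if_necessary reads it above.
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

if _is_chief(trainer_id):
    print("chief worker: responsible for saving checkpoint variables")
else:
    print("worker %d: skips variable saving" % trainer_id)
```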