@@ -63,6 +63,8 @@ def init_horovod_if_needed(self):
63
63
time .sleep (RETRY_ALLREDUCE_INTERVAL_SECS )
64
64
else :
65
65
break
66
+ if rank_response .rank_id < 0 :
67
+ raise ValueError ("Invalid rank {}" .format (rank_response .rank_id ))
66
68
67
69
# If the rendezvous from master is unequal to self._rendezvous_id,
68
70
# the worker should rebuild the communication because the master
@@ -73,19 +75,22 @@ def init_horovod_if_needed(self):
73
75
rank_response .rank_id , rank_response .world_size
74
76
)
75
77
)
76
- os .environ [HorovodEnv .RENDEZVOUS_PORT ] = str (
77
- rank_response .rendezvous_port
78
- )
79
- os .environ [HorovodEnv .RANK ] = str (rank_response .rank_id )
80
- os .environ [HorovodEnv .SIZE ] = str (rank_response .world_size )
81
- # Not using Horovod elastic feature in init, but need it for
82
- # allreduce to call allreduce op when size=1.
83
- os .environ [HorovodEnv .ELASTIC ] = str (0 )
84
- hvd .shutdown ()
85
- hvd .init ()
86
- os .environ [HorovodEnv .ELASTIC ] = str (1 )
87
- self ._rendezvous_id = rank_response .rendezvous_id
88
- self .need_broadcast = True
78
+ self ._restart_hvd (rank_response )
79
+
80
def _restart_hvd(self, rank_response):
    """Shut Horovod down and bring it back up for a new rendezvous.

    The rendezvous port, rank and world size carried by ``rank_response``
    are exported to the process environment before Horovod is restarted.
    Afterwards the rendezvous id is recorded on the instance and
    ``need_broadcast`` is set to ``True``.

    Args:
        rank_response: Object exposing the ``rendezvous_port``,
            ``rank_id``, ``world_size`` and ``rendezvous_id`` attributes.
    """
    horovod_settings = (
        (HorovodEnv.RENDEZVOUS_PORT, rank_response.rendezvous_port),
        (HorovodEnv.RANK, rank_response.rank_id),
        (HorovodEnv.SIZE, rank_response.world_size),
        # Elastic mode is turned off only while Horovod initializes; it is
        # switched back on below because, per the original note, it is
        # needed for allreduce to call the allreduce op when size=1.
        (HorovodEnv.ELASTIC, 0),
    )
    for env_name, env_value in horovod_settings:
        os.environ[env_name] = str(env_value)

    hvd.shutdown()
    hvd.init()
    os.environ[HorovodEnv.ELASTIC] = str(1)

    self._rendezvous_id = rank_response.rendezvous_id
    self.need_broadcast = True
89
94
90
95
def _set_horovod_env (self ):
91
96
master_addr_port = os .getenv (WorkerEnv .MASTER_ADDR , None )
@@ -103,7 +108,7 @@ def notify_training_loop_status(self, status):
103
108
104
109
class AllReduceController (object ):
105
110
"""The controller initializes Horovod and calls the function with forward
106
- and backward computation using a mini-batch of data. If Horovod raise an
111
+ and backward computation using a mini-batch of data. If Horovod raises an
107
112
exception about AllReduce, Allgather and Broadcast, the controller will
108
113
catch the exception and re-initialize Horovod. Then, it will broadcast
109
114
the variables and retry to call those functions.
0 commit comments