14
14
import time
15
15
from threading import Lock
16
16
17
+ from elasticai_api .util .log_utils import default_logger as logger
18
+
17
19
try :
18
20
from horovod .runner .common .util .hosts import (
19
21
get_host_assignments ,
@@ -78,6 +80,11 @@ def start(self):
78
80
self ._rendezvous_port = self ._rendezvous_server .start ()
79
81
80
82
def _init_rendezvous_server (self ):
83
+ logger .info (
84
+ "Initialize rendezvous server with hosts {}" .format (
85
+ self ._next_rendezvous_hosts
86
+ )
87
+ )
81
88
self ._cur_rendezvous_hosts = self ._next_rendezvous_hosts
82
89
self ._next_rendezvous_hosts = None
83
90
host_alloc_plan = self ._get_host_plan ()
@@ -128,7 +135,12 @@ def get_rendezvous_id(self):
128
135
129
136
def add_worker (self , worker_host ):
130
137
with self ._lock :
131
- if worker_host and worker_host not in self ._cur_rendezvous_hosts :
138
+ logger .info (
139
+ "Add worker host {} into rendenzvous and cur hosts {}." .format (
140
+ worker_host , self ._cur_rendezvous_hosts
141
+ )
142
+ )
143
+ if worker_host :
132
144
if self ._next_rendezvous_hosts is None :
133
145
self ._next_rendezvous_hosts = copy .deepcopy (
134
146
self ._cur_rendezvous_hosts
@@ -137,6 +149,9 @@ def add_worker(self, worker_host):
137
149
138
150
def remove_worker (self , worker_host ):
139
151
with self ._lock :
152
+ logger .info (
153
+ "Remove worker host {} from rendenzvous." .format (worker_host )
154
+ )
140
155
if worker_host in self ._cur_rendezvous_hosts :
141
156
if self ._next_rendezvous_hosts is None :
142
157
self ._next_rendezvous_hosts = copy .deepcopy (
0 commit comments