Skip to content

Commit 6f8754c

Browse files
authored
Fix the bug if the host of a new worker is the same as an old worker (#2496)
* Fix the bug if the host of a new worker is the same as an old worker * Do not call on_pod_deleted if the status is from FAILED to DELETED
1 parent f7bee19 commit 6f8754c

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

elasticdl/python/master/pod_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,10 @@ def _event_cb(self, event):
586586
):
587587
self._worker_info[pod_id].inc_relaunch_count()
588588
should_relaunch = True
589-
elif matched_pod_state_flow.to_status == PodStatus.DELETED:
589+
elif (
590+
matched_pod_state_flow.from_status != PodStatus.FAILED
591+
and matched_pod_state_flow.to_status == PodStatus.DELETED
592+
):
590593
[
591594
callback.on_pod_deleted(pod_info, cluster_context)
592595
for callback in self._pod_event_callbacks

elasticdl/python/master/rendezvous_server.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import time
1515
from threading import Lock
1616

17+
from elasticai_api.util.log_utils import default_logger as logger
18+
1719
try:
1820
from horovod.runner.common.util.hosts import (
1921
get_host_assignments,
@@ -78,6 +80,11 @@ def start(self):
7880
self._rendezvous_port = self._rendezvous_server.start()
7981

8082
def _init_rendezvous_server(self):
83+
logger.info(
84+
"Initialize rendezvous server with hosts {}".format(
85+
self._next_rendezvous_hosts
86+
)
87+
)
8188
self._cur_rendezvous_hosts = self._next_rendezvous_hosts
8289
self._next_rendezvous_hosts = None
8390
host_alloc_plan = self._get_host_plan()
@@ -128,7 +135,12 @@ def get_rendezvous_id(self):
128135

129136
def add_worker(self, worker_host):
130137
with self._lock:
131-
if worker_host and worker_host not in self._cur_rendezvous_hosts:
138+
logger.info(
139+
"Add worker host {} into rendenzvous and cur hosts {}.".format(
140+
worker_host, self._cur_rendezvous_hosts
141+
)
142+
)
143+
if worker_host:
132144
if self._next_rendezvous_hosts is None:
133145
self._next_rendezvous_hosts = copy.deepcopy(
134146
self._cur_rendezvous_hosts
@@ -137,6 +149,9 @@ def add_worker(self, worker_host):
137149

138150
def remove_worker(self, worker_host):
139151
with self._lock:
152+
logger.info(
153+
"Remove worker host {} from rendenzvous.".format(worker_host)
154+
)
140155
if worker_host in self._cur_rendezvous_hosts:
141156
if self._next_rendezvous_hosts is None:
142157
self._next_rendezvous_hosts = copy.deepcopy(

0 commit comments

Comments
 (0)