Skip to content

Commit 620cbfa

Browse files
authored
Master does not delete and relaunch OOM workers. (#2107)
* Fix version * Retry to get table size to avoid ReadTimeout * fix extras in setup * Calculate logit using DNN output * Extend tf.keras.layers.Layer * Modify the title of building models with structured data using ElasticDL * Don't relaunch OOM pod * Don't relaunch OOM pod * Don't remove the timeout pod * Only remove timeout worker * Restore mistakes * Add an annotation * Format codes * Don't relaunch OOM pod * Recover tasks when the worker failed * Remove unused imports * Format codes * Print log for unit test * Recover task for failed workers * Remove log
1 parent a66035b commit 620cbfa

File tree

1 file changed

+27
-24
lines changed

1 file changed

+27
-24
lines changed

elasticdl/python/master/k8s_instance_manager.py

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -306,40 +306,43 @@ def _event_cb(self, event):
306306
if pod_name in self._failed_pods:
307307
return
308308

309-
# When a pod fails with exit_code == 137, it may be deleted,
310-
# preempted, or OOMkilled. Master will try to relaunch it.
311-
# For OOMkilled, the relaunch is a workaround for memory leak
312-
# issues in tf eager mode.
313309
relaunch_failed_pod = False
314-
if (
315-
evt_type == "MODIFIED"
316-
and phase == "Failed"
317-
and evt_obj.status.container_statuses
318-
and evt_obj.status.container_statuses[0].state.terminated
319-
and evt_obj.status.container_statuses[
320-
0
321-
].state.terminated.exit_code
322-
== 137
323-
):
310+
if evt_type == "MODIFIED" and phase == "Failed":
324311
self._failed_pods.append(pod_name)
325-
relaunch_failed_pod = True
326-
logger.info(
327-
"Pod %s is killed with reason %s."
328-
% (
329-
pod_name,
330-
evt_obj.status.container_statuses[
331-
0
332-
].state.terminated.reason,
312+
worker_id = self._worker_pod_name_to_id.get(pod_name, None)
313+
if worker_id is not None:
314+
# Recover tasks when the worker failed
315+
self._task_d.recover_tasks(worker_id)
316+
317+
if (
318+
evt_obj.status.container_statuses
319+
and evt_obj.status.container_statuses[0].state.terminated
320+
and evt_obj.status.container_statuses[
321+
0
322+
].state.terminated.exit_code
323+
== 137
324+
and evt_obj.status.container_statuses[
325+
0
326+
].state.terminated.reason
327+
!= "OOMKilled"
328+
):
329+
relaunch_failed_pod = True
330+
logger.info(
331+
"Pod %s is killed with reason %s."
332+
% (
333+
pod_name,
334+
evt_obj.status.container_statuses[
335+
0
336+
].state.terminated.reason,
337+
)
333338
)
334-
)
335339

336340
if pod_name in self._worker_pod_name_to_id:
337341
worker_id = self._worker_pod_name_to_id.get(pod_name)
338342
self._worker_pods_phase[worker_id] = (pod_name, phase)
339343
if evt_type == "DELETED" or relaunch_failed_pod:
340344
del self._worker_pods_phase[worker_id]
341345
del self._worker_pod_name_to_id[pod_name]
342-
self._task_d.recover_tasks(worker_id)
343346

344347
# If a deleted pod was not "Succeeded", relaunch a worker.
345348
relaunch_worker = (

0 commit comments

Comments
 (0)