
Commit 4475f64

Merge pull request #208 from Yelp/u/ilkinmammadzada/CLUSTERMAN-685_uncordon

Fix failed uncordon issue when draining expires

2 parents d2af373 + 7739e66

2 files changed, 11 insertions(+), 8 deletions(-)

clusterman/draining/queue.py
4 additions & 4 deletions

@@ -343,7 +343,6 @@ def process_drain_queue(
             # 0) Instance is orphan, it should be terminated
             # 1) threshold expired, it should be terminated since force_terminate is true
             # 2) threshold expired, it should be uncordoned since force_terminate is false
-            #    if it can't be uncordoned, then it should be returned to queue to try again
             # 3) threshold not expired, drain and terminate node
             # 4) threshold not expired, drain failed for any reason (API is unreachable, PDB doesn't allow eviction)
             #    then it should be returned to queue to try again
@@ -357,11 +356,12 @@ def process_drain_queue(
                     logger.info(f"Draining expired for: {host_to_process.instance_id}")
                     self.submit_host_for_termination(host_to_process, delay=0)
                     should_add_to_cache = True
-                elif not k8s_uncordon(kube_operator_client, host_to_process.agent_id):  # case 2
-                    # TODO: a message can stay in the queue up to the SQS retention period; a limit should be added
-                    should_resend_to_queue = True
+                else:  # case 2
+                    k8s_uncordon(kube_operator_client, host_to_process.agent_id)
             else:
                 if k8s_drain(kube_operator_client, host_to_process.agent_id, disable_eviction):  # case 3
+                    draining_time = arrow.now() - arrow.get(host_to_process.draining_start_time)
+                    logger.info(f"draining took {draining_time} seconds for {host_to_process.instance_id}")
                     self.submit_host_for_termination(host_to_process, delay=0)
                     should_add_to_cache = True
                 else:  # case 4
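The net effect of this change: when the draining threshold has expired and force_terminate is false (case 2), the host is now uncordoned unconditionally instead of being re-queued whenever k8s_uncordon reported failure, which could leave expired messages cycling in SQS; a successful drain (case 3) now also logs how long draining took. Below is a minimal sketch of the resulting control flow; the threshold check and the queue/termination helpers are passed in as hypothetical stand-ins for the real DrainingClient methods, which are not shown in the diff:

    import arrow  # clusterman uses arrow for timestamps

    def handle_drain_host(host, force_terminate, kube_client, disable_eviction,
                          threshold_expired, k8s_uncordon, k8s_drain, terminate):
        # Sketch only: mirrors the four cases described in the diff's comments.
        should_add_to_cache = False
        should_resend_to_queue = False
        if threshold_expired(host):
            if force_terminate:  # case 1: terminate even though draining never finished
                terminate(host, delay=0)
                should_add_to_cache = True
            else:  # case 2: hand the node back to the scheduler; no retry on failure
                k8s_uncordon(kube_client, host.agent_id)
        elif k8s_drain(kube_client, host.agent_id, disable_eviction):  # case 3
            # arrow.Arrow subtraction yields a datetime.timedelta
            draining_time = arrow.now() - arrow.get(host.draining_start_time)
            print(f"draining took {draining_time} for {host.instance_id}")
            terminate(host, delay=0)
            should_add_to_cache = True
        else:  # case 4: drain failed (API unreachable, PDB blocked eviction); retry
            should_resend_to_queue = True
        return should_add_to_cache, should_resend_to_queue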

clusterman/kubernetes/kubernetes_cluster_connector.py
7 additions & 4 deletions

@@ -57,6 +57,7 @@
 MIGRATION_CRD_VERSION = "v1"
 MIGRATION_CRD_PLURAL = "nodemigrations"
 MIGRATION_CRD_STATUS_LABEL = "clusterman.yelp.com/migration_status"
+NOT_FOUND_STATUS = 404
 # we don't want to block on eviction/deletion as we're potentially evicting/deleting a ton of pods
 # AND there's a delay before we go ahead and terminate
 # AND at Yelp we run a script on shutdown that will also try to drain one final time.
@@ -271,20 +272,22 @@ def create_node_migration_resource(

     def _evict_or_delete_pods(self, node_name: str, pods: List[KubernetesPod], disable_eviction: bool) -> bool:
         all_done = True
-        logger.info(f"{len(pods)} pods being evicted/deleted on {node_name}")
+        action_name = "deleted" if disable_eviction else "evicted"
+        logger.info(f"{len(pods)} pods being {action_name} on {node_name}")
         for pod in pods:
             try:
                 if disable_eviction:
                     self._delete_pod(pod)
                 else:
                     self._evict_pod(pod)
-                logger.info(f"{pod.metadata.name} ({pod.metadata.namespace}) was evicted/deleted on {node_name}")
+                logger.info(f"{pod.metadata.name} ({pod.metadata.namespace}) was {action_name} on {node_name}")
             except ApiException as e:
                 logger.warning(
-                    f"Failed to evict/delete {pod.metadata.name} ({pod.metadata.namespace}) on {node_name}"
+                    f"{pod.metadata.name} ({pod.metadata.namespace}) couldn't be {action_name} on {node_name}"
                     f":{e.status}-{e.reason}"
                 )
-                all_done = False
+                if e.status != NOT_FOUND_STATUS:
+                    all_done = False

         return all_done

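The NOT_FOUND_STATUS check means a pod that has already vanished (the API returns HTTP 404) no longer marks the batch as failed, so a drain isn't reported as incomplete just because some pods were deleted between listing and eviction. A minimal sketch of the pattern, assuming the standard kubernetes Python client; evict_fn is a hypothetical stand-in for the connector's _evict_pod/_delete_pod:

    from kubernetes.client.rest import ApiException

    NOT_FOUND_STATUS = 404

    def evict_or_delete_pods(pods, evict_fn, node_name):
        """Return True if every pod was removed or was already gone."""
        all_done = True
        for pod in pods:
            try:
                evict_fn(pod)  # stand-in for _evict_pod / _delete_pod
            except ApiException as e:
                print(f"{pod.metadata.name} ({pod.metadata.namespace}) couldn't be "
                      f"removed on {node_name}: {e.status}-{e.reason}")
                if e.status != NOT_FOUND_STATUS:  # 404 means the pod is already gone
                    all_done = False  # genuine failure: caller should retry the drain
        return all_done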