
Commit 4475f64

Merge pull request #208 from Yelp/u/ilkinmammadzada/CLUSTERMAN-685_uncordon

Fix failed uncordon issue when draining expires

2 parents d2af373 + 7739e66

2 files changed, 11 insertions(+), 8 deletions(-)

clusterman/draining/queue.py
4 additions & 4 deletions

@@ -343,7 +343,6 @@ def process_drain_queue(
             # 0) Instance is orphan, it should be terminated
             # 1) threshold expired, it should be terminated since force_terminate is true
             # 2) threshold expired, it should be uncordoned since force_terminate is false
-            #    if it can't be uncordoned, then it should be returned to queue to try again
             # 3) threshold not expired, drain and terminate node
             # 4) threshold not expired, drain failed for any reason (API is unreachable, PDB doesn't allow eviction)
             #    then it should be returned to queue to try again
@@ -357,11 +356,12 @@ def process_drain_queue(
                     logger.info(f"Draining expired for: {host_to_process.instance_id}")
                     self.submit_host_for_termination(host_to_process, delay=0)
                     should_add_to_cache = True
-                elif not k8s_uncordon(kube_operator_client, host_to_process.agent_id):  # case 2
-                    # TODO: a message can stay in the queue up to the SQS retention period; a limit should be added
-                    should_resend_to_queue = True
+                else:  # case 2
+                    k8s_uncordon(kube_operator_client, host_to_process.agent_id)
             else:
                 if k8s_drain(kube_operator_client, host_to_process.agent_id, disable_eviction):  # case 3
+                    draining_time = arrow.now() - arrow.get(host_to_process.draining_start_time)
+                    logger.info(f"draining took {draining_time} seconds for {host_to_process.instance_id}")
                     self.submit_host_for_termination(host_to_process, delay=0)
                     should_add_to_cache = True
                 else:  # case 4
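The net effect of this change: when the draining threshold has expired and force_terminate is false (case 2), the host is now uncordoned unconditionally instead of being re-queued whenever k8s_uncordon reported failure, which could leave expired messages cycling in SQS; a successful drain (case 3) now also logs how long draining took. Below is a minimal sketch of the resulting control flow; the threshold check and the queue/termination helpers are passed in as hypothetical stand-ins for the real DrainingClient methods, which are not shown in the diff:

    import arrow  # clusterman uses arrow for timestamps

    def handle_drain_host(host, force_terminate, kube_client, disable_eviction,
                          threshold_expired, k8s_uncordon, k8s_drain, terminate):
        # Sketch only: mirrors the four cases described in the diff's comments.
        should_add_to_cache = False
        should_resend_to_queue = False
        if threshold_expired(host):
            if force_terminate:  # case 1: terminate even though draining never finished
                terminate(host, delay=0)
                should_add_to_cache = True
            else:  # case 2: hand the node back to the scheduler; no retry on failure
                k8s_uncordon(kube_client, host.agent_id)
        elif k8s_drain(kube_client, host.agent_id, disable_eviction):  # case 3
            # arrow.Arrow subtraction yields a datetime.timedelta
            draining_time = arrow.now() - arrow.get(host.draining_start_time)
            print(f"draining took {draining_time} for {host.instance_id}")
            terminate(host, delay=0)
            should_add_to_cache = True
        else:  # case 4: drain failed (API unreachable, PDB blocked eviction); retry
            should_resend_to_queue = True
        return should_add_to_cache, should_resend_to_queue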

clusterman/kubernetes/kubernetes_cluster_connector.py
7 additions & 4 deletions

@@ -57,6 +57,7 @@
 MIGRATION_CRD_VERSION = "v1"
 MIGRATION_CRD_PLURAL = "nodemigrations"
 MIGRATION_CRD_STATUS_LABEL = "clusterman.yelp.com/migration_status"
+NOT_FOUND_STATUS = 404
 # we don't want to block on eviction/deletion as we're potentially evicting/deleting a ton of pods
 # AND there's a delay before we go ahead and terminate
 # AND at Yelp we run a script on shutdown that will also try to drain one final time.
@@ -271,20 +272,22 @@ def create_node_migration_resource(

     def _evict_or_delete_pods(self, node_name: str, pods: List[KubernetesPod], disable_eviction: bool) -> bool:
         all_done = True
-        logger.info(f"{len(pods)} pods being evicted/deleted on {node_name}")
+        action_name = "deleted" if disable_eviction else "evicted"
+        logger.info(f"{len(pods)} pods being {action_name} on {node_name}")
         for pod in pods:
             try:
                 if disable_eviction:
                     self._delete_pod(pod)
                 else:
                     self._evict_pod(pod)
-                logger.info(f"{pod.metadata.name} ({pod.metadata.namespace}) was evicted/deleted on {node_name}")
+                logger.info(f"{pod.metadata.name} ({pod.metadata.namespace}) was {action_name} on {node_name}")
             except ApiException as e:
                 logger.warning(
-                    f"Failed to evict/delete {pod.metadata.name} ({pod.metadata.namespace}) on {node_name}"
+                    f"{pod.metadata.name} ({pod.metadata.namespace}) couldn't be {action_name} on {node_name}"
                     f":{e.status}-{e.reason}"
                 )
-                all_done = False
+                if e.status != NOT_FOUND_STATUS:
+                    all_done = False

         return all_done

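The NOT_FOUND_STATUS check means a pod that has already vanished (the API returns HTTP 404) no longer marks the batch as failed, so a drain isn't reported as incomplete just because some pods were deleted between listing and eviction. A minimal sketch of the pattern, assuming the standard kubernetes Python client; evict_fn is a hypothetical stand-in for the connector's _evict_pod/_delete_pod:

    from kubernetes.client.rest import ApiException

    NOT_FOUND_STATUS = 404

    def evict_or_delete_pods(pods, evict_fn, node_name):
        """Return True if every pod was removed or was already gone."""
        all_done = True
        for pod in pods:
            try:
                evict_fn(pod)  # stand-in for _evict_pod / _delete_pod
            except ApiException as e:
                print(f"{pod.metadata.name} ({pod.metadata.namespace}) couldn't be "
                      f"removed on {node_name}: {e.status}-{e.reason}")
                if e.status != NOT_FOUND_STATUS:  # 404 means the pod is already gone
                    all_done = False  # genuine failure: caller should retry the drain
        return all_done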