From 38f07bad4573b36f69ad63286d3c83a2b1146027 Mon Sep 17 00:00:00 2001
From: dry923
Date: Fri, 29 Oct 2021 10:45:41 -0400
Subject: [PATCH 1/3] Update checking for worker nodes to disregard
 workload/infra/masters

---
 .../scale_openshift_wrapper/trigger_scale.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/snafu/scale_openshift_wrapper/trigger_scale.py b/snafu/scale_openshift_wrapper/trigger_scale.py
index e4ad40fa..39e183f3 100644
--- a/snafu/scale_openshift_wrapper/trigger_scale.py
+++ b/snafu/scale_openshift_wrapper/trigger_scale.py
@@ -287,10 +287,20 @@ def _run_scale(self):
         # Ensure all workers are not listed as unschedulable
         # If we don't do this it will auto-complete a scale-down even though the workers
         # have not been eliminated yet
-        new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
+        new_worker_list = nodes.get(
+            label_selector="node-role.kubernetes.io/worker,"
+            "!node-role.kubernetes.io/master,"
+            "!node-role.kubernetes.io/infra,"
+            "!node-role.kubernetes.io/workload"
+        ).attributes.items
         for i in range(len(new_worker_list)):
             while i < len(new_worker_list) and new_worker_list[i].spec.unschedulable:
-                new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
+                new_worker_list = nodes.get(
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
+                ).attributes.items
                 logger.debug(
                     "Number of ready workers: %d. Waiting %d seconds for next check..."
                     % (len(new_worker_list), self.poll_interval)
@@ -301,7 +311,10 @@ def _run_scale(self):
         worker_count = (
             len(
                 nodes.get(
-                    label_selector="node-role.kubernetes.io/worker,!node-role.kubernetes.io/master"
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
                 ).attributes.items
             )
             or 0

From ca0ee7f612302f5397757b3357e725f779b3ccc3 Mon Sep 17 00:00:00 2001
From: dry923
Date: Fri, 29 Oct 2021 13:52:00 -0400
Subject: [PATCH 2/3] Adding additional scale verification

---
 .../scale_openshift_wrapper/trigger_scale.py | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/snafu/scale_openshift_wrapper/trigger_scale.py b/snafu/scale_openshift_wrapper/trigger_scale.py
index 39e183f3..9b9b874e 100644
--- a/snafu/scale_openshift_wrapper/trigger_scale.py
+++ b/snafu/scale_openshift_wrapper/trigger_scale.py
@@ -34,6 +34,7 @@ def __init__(self, args):
         self.poll_interval = args.poll_interval
         self.kubeconfig = args.kubeconfig
         self.is_rosa = False
+        self.timeout = int(args.timeout * 60)
         if args.rosa_cluster is not None:
             logger.info("Identified ROSA for scaling process")
             if args.rosa_token is None:
@@ -248,6 +249,8 @@ def _run_scale(self):
             logger.info("New worker per machine set %s" % (machine_spread))

         logger.info("Starting Patching of machine sets")
+        start_time = time.time()
+        end_time = start_time + self.timeout
         # Patch the machinesets
         if not self.is_rosa:
             for i in range(len(machineset_workers)):
@@ -269,6 +272,12 @@ def _run_scale(self):
                 while new_machine_sets.status.readyReplicas != machine_spread[i]:
                     if new_machine_sets.status.readyReplicas is None and machine_spread[i] == 0:
                         break
+
+                    current_time = time.time()
+                    if current_time >= end_time:
+                        logger.error("Timeout %d minutes exceeded" % self.timeout)
+                        exit(1)
+
                     new_machine_sets = machinesets.get(
                         namespace="openshift-machine-api", name=machineset_worker_list[i].metadata.name
                     )
@@ -295,6 +304,10 @@ def _run_scale(self):
         ).attributes.items
         for i in range(len(new_worker_list)):
             while i < len(new_worker_list) and new_worker_list[i].spec.unschedulable:
+                current_time = time.time()
+                if current_time >= end_time:
+                    logger.error("Timeout %d minutes exceeded" % self.timeout)
+                    exit(1)
                 new_worker_list = nodes.get(
                     label_selector="node-role.kubernetes.io/worker,"
                     "!node-role.kubernetes.io/master,"
@@ -308,6 +321,27 @@ def _run_scale(self):
                 time.sleep(self.poll_interval)

         logger.info("All workers schedulable")
+        logger.info("Verifying correct worker count")
+        current_workers = len(nodes.get(
+            label_selector="node-role.kubernetes.io/worker,"
+            "!node-role.kubernetes.io/master,"
+            "!node-role.kubernetes.io/infra,"
+            "!node-role.kubernetes.io/workload"
+        ).attributes.items)
+        while current_workers != int(self.scale):
+            current_time = time.time()
+            if current_time >= end_time:
+                logger.error("Timeout %d minutes exceeded" % self.timeout)
+                exit(1)
+
+            logger.debug(
+                "Number of ready workers: %d. Waiting %d seconds for next check..."
+                % (len(new_worker_list), self.poll_interval)
+            )
+            time.sleep(self.poll_interval)
+
+        logger.info("Correct worker count verified")
+
         worker_count = (
             len(
                 nodes.get(
@@ -342,6 +376,7 @@ def emit_actions(self):
             workload_count,
             platform,
             action,
+            successful,
         ) = self._run_scale()
         end_time = time.time()
         elaspsed_time = end_time - start_time

From 4452967ecbb7e36bd3a6f5bc6e880fdd27cf3593 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 29 Oct 2021 18:10:38 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../scale_openshift_wrapper/trigger_scale.py | 40 ++++++++++---------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/snafu/scale_openshift_wrapper/trigger_scale.py b/snafu/scale_openshift_wrapper/trigger_scale.py
index 9b9b874e..455322c2 100644
--- a/snafu/scale_openshift_wrapper/trigger_scale.py
+++ b/snafu/scale_openshift_wrapper/trigger_scale.py
@@ -297,10 +297,10 @@ def _run_scale(self):
         # If we don't do this it will auto-complete a scale-down even though the workers
         # have not been eliminated yet
         new_worker_list = nodes.get(
-            label_selector="node-role.kubernetes.io/worker,"
-            "!node-role.kubernetes.io/master,"
-            "!node-role.kubernetes.io/infra,"
-            "!node-role.kubernetes.io/workload"
+            label_selector="node-role.kubernetes.io/worker,"
+            "!node-role.kubernetes.io/master,"
+            "!node-role.kubernetes.io/infra,"
+            "!node-role.kubernetes.io/workload"
         ).attributes.items
         for i in range(len(new_worker_list)):
             while i < len(new_worker_list) and new_worker_list[i].spec.unschedulable:
@@ -322,23 +322,25 @@ def _run_scale(self):

         logger.info("All workers schedulable")
         logger.info("Verifying correct worker count")
-        current_workers = len(nodes.get(
-            label_selector="node-role.kubernetes.io/worker,"
-            "!node-role.kubernetes.io/master,"
-            "!node-role.kubernetes.io/infra,"
-            "!node-role.kubernetes.io/workload"
-        ).attributes.items)
+        current_workers = len(
+            nodes.get(
+                label_selector="node-role.kubernetes.io/worker,"
+                "!node-role.kubernetes.io/master,"
+                "!node-role.kubernetes.io/infra,"
+                "!node-role.kubernetes.io/workload"
+            ).attributes.items
+        )
         while current_workers != int(self.scale):
-            current_time = time.time()
-            if current_time >= end_time:
-                logger.error("Timeout %d minutes exceeded" % self.timeout)
-                exit(1)
+            current_time = time.time()
+            if current_time >= end_time:
+                logger.error("Timeout %d minutes exceeded" % self.timeout)
+                exit(1)

-            logger.debug(
-                "Number of ready workers: %d. Waiting %d seconds for next check..."
-                % (len(new_worker_list), self.poll_interval)
-            )
-            time.sleep(self.poll_interval)
+            logger.debug(
+                "Number of ready workers: %d. Waiting %d seconds for next check..."
+                % (len(new_worker_list), self.poll_interval)
+            )
+            time.sleep(self.poll_interval)

         logger.info("Correct worker count verified")
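
A note on the verification loop added in PATCH 2: current_workers is computed once before the
"while current_workers != int(self.scale)" loop and is never re-fetched inside it, so the loop
can only exit through the timeout path unless the count already matches. The sketch below shows
the poll-until-count-or-timeout pattern the patch is aiming for. It is an illustrative,
stand-alone example rather than code from the series: the wait_for_worker_count name and the
list_workers callable are assumptions standing in for the wrapper's
nodes.get(label_selector=...).attributes.items lookup and its self.scale, self.poll_interval,
and self.timeout settings.

import time

# Label selector used by the patches: workers that are not master/infra/workload nodes.
WORKER_SELECTOR = (
    "node-role.kubernetes.io/worker,"
    "!node-role.kubernetes.io/master,"
    "!node-role.kubernetes.io/infra,"
    "!node-role.kubernetes.io/workload"
)


def wait_for_worker_count(list_workers, target, poll_interval, timeout_seconds):
    """Poll until list_workers() returns exactly `target` nodes or the timeout expires.

    `list_workers` is a hypothetical zero-argument callable that returns the current
    list of worker nodes (in the wrapper this role is played by the
    nodes.get(label_selector=WORKER_SELECTOR).attributes.items lookup).
    """
    deadline = time.time() + timeout_seconds
    while True:
        workers = list_workers()  # re-fetch on every iteration so the count can actually change
        if len(workers) == target:
            return workers  # desired scale reached
        if time.time() >= deadline:
            raise TimeoutError(
                "Timeout of %d seconds exceeded; %d/%d workers ready"
                % (timeout_seconds, len(workers), target)
            )
        time.sleep(poll_interval)  # wait before the next check

Raising TimeoutError instead of calling exit(1) leaves the decision about process termination to
the caller; either choice fits the timeout pattern the patches introduce.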