
Commit eb019cb

fix: fix intermittent sync issues impacting confirm_network_healthy script execution
1 parent 921190e commit eb019cb

File tree: 5 files changed (+30, -5 lines)

examples/fscloud/main.tf

Lines changed: 1 addition & 1 deletion
@@ -238,7 +238,7 @@ module "ocp_fscloud" {
   cluster_name = var.prefix
   ibmcloud_api_key = var.ibmcloud_api_key
   resource_group_id = module.resource_group.resource_group_id
-  region = "us-south"
+  region = var.region
   force_delete_storage = true
   vpc_id = module.vpc.vpc_id
   vpc_subnets = local.cluster_vpc_subnets
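
This drops the hardcoded us-south so the example's region is configurable at plan time; the corresponding `region` variable declaration is presumably defined in examples/fscloud/variables.tf, outside this hunk.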

examples/fscloud/variables.tf

Lines changed: 2 additions & 2 deletions
@@ -44,12 +44,12 @@ variable "hpcs_instance_guid" {
 }

 variable "hpcs_key_crn_cluster" {
-  description = "CRN of the Hyper Protect Crypto service to use to encrypt the cluster boot volume"
+  description = "CRN of the Hyper Protect Crypto service key to use to encrypt the cluster boot volume"
   type = string
 }

 variable "hpcs_key_crn_worker_pool" {
-  description = "CRN of the Hyper Protect Crypto service to use to encrypt the worker pool boot volumes"
+  description = "CRN of the Hyper Protect Crypto service key to use to encrypt the worker pool boot volumes"
   type = string
 }

main.tf

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ data "ibm_container_cluster_config" "cluster_config" {
   count = var.verify_worker_network_readiness || lookup(local.addons_list, "cluster-autoscaler", null) != null ? 1 : 0
   cluster_name_id = local.cluster_id
   config_dir = "${path.module}/kubeconfig"
+  admin = true # workaround for https://github.com/terraform-ibm-modules/terraform-ibm-base-ocp-vpc/issues/374
   resource_group_id = var.resource_group_id
   endpoint_type = var.cluster_config_endpoint_type != "default" ? var.cluster_config_endpoint_type : null # null value represents default
 }
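
The new `admin = true` argument makes the data source fetch the cluster's admin kubeconfig rather than a per-user one, as a workaround for the intermittent sync failures tracked in issue #374. For orientation only (not part of this commit), the roughly equivalent manual step with the IBM Cloud CLI would be:

# Sketch: fetch the admin kubeconfig by hand; "my-cluster" is a placeholder.
ibmcloud ks cluster config --cluster my-cluster --admin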

scripts/confirm_network_healthy.sh

Lines changed: 24 additions & 2 deletions
@@ -7,9 +7,31 @@ function run_checks() {
   last_attempt=$1
   namespace=calico-system

-  # Get list of calico-node pods (There will be 1 pod per worker node)
+  MAX_ATTEMPTS=10
+  attempt=0
   PODS=()
-  while IFS='' read -r line; do PODS+=("$line"); done < <(kubectl get pods -n "${namespace}" | grep calico-node | cut -f1 -d ' ')
+  while [ $attempt -lt $MAX_ATTEMPTS ]; do
+    # Get list of calico-node pods (There will be 1 pod per worker node)
+    if while IFS='' read -r line; do PODS+=("$line"); done < <(kubectl get pods -n "${namespace}" | grep calico-node | cut -f1 -d ' '); then
+      if [ ${#PODS[@]} -eq 0 ]; then
+        echo "No calico-node pods found. Retrying in 10s. (Attempt $((attempt+1)) / $MAX_ATTEMPTS)"
+        sleep 10
+        ((attempt=attempt+1))
+      else
+        # Pods found, break out of loop
+        break
+      fi
+    else
+      echo "Error getting calico-node pods. Retrying in 10s. (Attempt $((attempt+1)) / $MAX_ATTEMPTS)"
+      sleep 10
+      ((attempt=attempt+1))
+    fi
+  done
+
+  if [ ${#PODS[@]} -eq 0 ]; then
+    echo "No calico-node pods found after $MAX_ATTEMPTS attempts. Exiting."
+    exit 1
+  fi

   # Iterate through pods to check health
   healthy=true
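
A note on the construct above: `if while ...; done < <(kubectl ...); then` uses the whole read loop as the `if` condition, so its exit status reflects the loop body rather than kubectl itself. A minimal standalone sketch of the same bounded-retry idea, with kubectl's exit status checked directly (placeholder names, not from this commit):

#!/bin/bash
# Sketch of a bounded retry around a kubectl query (assumptions: kubectl is
# configured and the pods live in calico-system, as in the script above).
MAX_ATTEMPTS=10
attempt=0
PODS=()
while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do
  # Capture kubectl's own exit status before filtering for calico-node rows.
  if output=$(kubectl get pods -n calico-system --no-headers 2>/dev/null); then
    mapfile -t PODS < <(printf '%s\n' "$output" | awk '/calico-node/ {print $1}')
    [ "${#PODS[@]}" -gt 0 ] && break
  fi
  attempt=$((attempt + 1))
  echo "calico-node pods not listed yet (attempt $attempt/$MAX_ATTEMPTS); retrying in 10s"
  sleep 10
done
if [ "${#PODS[@]}" -eq 0 ]; then
  echo "No calico-node pods found after $MAX_ATTEMPTS attempts."
  exit 1
fi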

scripts/get_config_map_status.sh

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,8 @@ done

 if [[ $COUNTER -eq $MAX_ATTEMPTS ]]; then
   echo "ConfigMap '$CONFIGMAP_NAME' did not become available within $MAX_ATTEMPTS attempts."
+  # Output for debugging
+  kubectl get configmaps -n $NAMESPACE
   exit 1
 else
   echo "ConfigMap '$CONFIGMAP_NAME' is now available." >&2
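
A minor hardening observation (not part of the commit): `$NAMESPACE` is expanded unquoted in the new debug line; quoting it is slightly safer if the variable is ever empty or contains whitespace:

# Defensive variant of the added debug line (assumption: NAMESPACE is set earlier in the script).
kubectl get configmaps -n "$NAMESPACE"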
