
Commit 1836246

Wait for deployed resources to become ready
This waits for:

- pods to become running
- deployments to complete rollouts
- ingresses to become published by the controller
- LoadBalancer services to get an external IP

Other types of resources or services are ignored and immediately return Ok.

Signed-off-by: Robert Detjens <[email protected]>
1 parent ae02b62 commit 1836246

File tree

2 files changed: +186 -9 lines changed


src/clients.rs

Lines changed: 112 additions & 2 deletions
@@ -3,11 +3,17 @@
 use anyhow::{anyhow, bail, Context, Error, Result};
 use bollard;
 use futures::TryFutureExt;
+use k8s_openapi::api::{
+    apps::v1::Deployment,
+    core::v1::{Pod, Service},
+    networking::v1::Ingress,
+};
 use kube::{
     self,
-    api::{DynamicObject, GroupVersionKind, Patch, PatchParams, TypeMeta},
+    api::{DynamicObject, GroupVersionKind, Patch, PatchParams},
     core::ResourceExt,
-    discovery::{ApiCapabilities, ApiResource, Discovery, Scope},
+    discovery::{ApiCapabilities, ApiResource},
+    runtime::{conditions, wait::await_condition},
 };
 use s3;
 use simplelog::*;
@@ -240,3 +246,107 @@ fn multidoc_deserialize(data: &str) -> Result<Vec<serde_yml::Value>> {
     //     .map(|r| r.map_err(|e| e.into()))
     //     .collect()
 }
+
+/// Check the status of the passed object and wait for it to become ready.
+///
+/// This function does not provide a timeout. Callers will need to wrap this with a timeout instead.
+pub async fn wait_for_status(client: &kube::Client, object: &DynamicObject) -> Result<()> {
+    debug!(
+        "waiting for ok status for {} {}",
+        object.types.clone().unwrap_or_default().kind,
+        object.name_any()
+    );
+
+    // handle each separate object type differently
+    match object.types.clone().unwrap_or_default().kind.as_str() {
+        // wait for Pod to become running
+        "Pod" => {
+            let api = kube::Api::namespaced(client.clone(), &object.namespace().unwrap());
+            await_condition(api, &object.name_any(), conditions::is_pod_running()).await?;
+        }
+
+        // wait for Deployment to complete rollout
+        "Deployment" => {
+            let api = kube::Api::namespaced(client.clone(), &object.namespace().unwrap());
+            await_condition(api, &object.name_any(), |d: Option<&Deployment>| {
+                // Use a nested function so that we can use Option `?` returns (the outer closure returns `bool`).
+                // TODO: switch to try { } when that is stabilized
+                /// Replicate the upstream deployment-complete check:
+                /// https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#complete-deployment
+                fn depl_complete(d: Option<&Deployment>) -> Option<bool> {
+                    Some(d?.status.as_ref()?.conditions.as_ref()?.iter().any(|c| {
+                        c.reason == Some("NewReplicaSetAvailable".to_string()) && c.status == "True"
+                    }))
+                }
+                depl_complete(d).unwrap_or(false)
+            })
+            .await?;
+        }
+
+        // wait for Ingress to get IP from ingress controller
+        "Ingress" => {
+            let api = kube::Api::namespaced(client.clone(), &object.namespace().unwrap());
+            await_condition(api, &object.name_any(), |i: Option<&Ingress>| {
+                // Use a nested function for Option `?`, like above.
+                /// Wait for the ingress controller to update this with its external IP
+                fn ingress_ip(i: Option<&Ingress>) -> Option<bool> {
+                    Some(
+                        // bleh, this as_ref stuff is unavoidable
+                        i?.status
+                            .as_ref()?
+                            .load_balancer
+                            .as_ref()?
+                            .ingress
+                            .as_ref()?
+                            .iter()
+                            // TODO: should this be any()? all controllers I've seen only add .ip here
+                            .all(|ip| ip.hostname.is_some() || ip.ip.is_some()),
+                    )
+                }
+                ingress_ip(i).unwrap_or(false)
+            })
+            .await?;
+        }
+
+        // wait for LoadBalancer service to get IP
+        "Service" => {
+            let api = kube::Api::namespaced(client.clone(), &object.namespace().unwrap());
+            let svc: Service = api.get(&object.name_any()).await?;
+
+            // we only care about checking LoadBalancer-type services; return Ok
+            // for any non-LB services
+            //
+            // TODO: do we care about NodePorts? don't need to check any atm
+            if svc.spec.unwrap_or_default().type_ != Some("LoadBalancer".to_string()) {
+                trace!(
+                    "not checking status for internal service {}",
+                    object.name_any()
+                );
+                return Ok(());
+            }
+
+            await_condition(api, &object.name_any(), |s: Option<&Service>| {
+                /// Wait for the LoadBalancer to get an external IP
+                fn lb_ip(s: Option<&Service>) -> Option<bool> {
+                    Some(
+                        // bleh, this as_ref stuff is unavoidable
+                        s?.status
+                            .as_ref()?
+                            .load_balancer
+                            .as_ref()?
+                            .ingress
+                            .as_ref()?
+                            .iter()
+                            .all(|ip| ip.hostname.is_some() || ip.ip.is_some()),
+                    )
+                }
+                lb_ip(s).unwrap_or(false)
+            })
+            .await?;
+        }
+
+        other => trace!("not checking status for resource type {other}"),
+    };
+
+    Ok(())
+}
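As the doc comment notes, wait_for_status deliberately has no internal timeout. A minimal sketch of the intended call pattern (the wrapper name and 5-minute budget are illustrative, mirroring the deploy code below):

// Illustrative wrapper (not part of this commit): enforce a deadline around
// wait_for_status. The outer Result comes from tokio's timeout, the inner
// one from wait_for_status itself.
use std::time::Duration;
use anyhow::{Context, Result};
use kube::api::DynamicObject;
use tokio::time::timeout;

async fn wait_with_deadline(client: &kube::Client, object: &DynamicObject) -> Result<()> {
    timeout(Duration::from_secs(5 * 60), wait_for_status(client, object))
        .await
        // Err here means the deadline elapsed
        .context("timed out waiting for resource to become ready")?
        // Err here means the underlying watch failed
        .context("failed to get resource status")
}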
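The nested functions (depl_complete, ingress_ip, lb_ip) exist because `?` on an Option only works inside something that itself returns Option, while await_condition's closure must return a plain bool. A hypothetical standalone reduction of the pattern:

// Hypothetical reduction of the nested-fn pattern used above: the inner fn
// gives `?` an Option-returning scope; the caller collapses None to false.
struct Status {
    ready: Option<bool>,
}

fn is_ready(obj: Option<&Status>) -> bool {
    fn inner(o: Option<&Status>) -> Option<bool> {
        // each `?` short-circuits to None when a field is missing
        o?.ready
    }
    inner(obj).unwrap_or(false)
}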

src/deploy/kubernetes/mod.rs

Lines changed: 74 additions & 7 deletions
@@ -1,12 +1,14 @@
 use std::path::PathBuf;
+use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Error, Ok, Result};
 use itertools::Itertools;
 use minijinja;
 use simplelog::*;
+use tokio::time::timeout;

 use crate::builder::BuildResult;
-use crate::clients::{apply_manifest_yaml, kube_client};
+use crate::clients::{apply_manifest_yaml, kube_client, wait_for_status};
 use crate::configparser::challenge::ExposeType;
 use crate::configparser::config::ProfileConfig;
 use crate::configparser::{get_config, get_profile_config, ChallengeConfig};
@@ -78,9 +80,17 @@ async fn deploy_single_challenge(
     trace!("NAMESPACE:\n{}", ns_manifest);

     debug!("applying namespace for chal {:?}", chal.directory);
-    apply_manifest_yaml(&kube, &ns_manifest).await?;

-    let expose_results = DeployResult { exposed: vec![] };
+    // apply namespace manifest
+    apply_manifest_yaml(&kube, &ns_manifest)
+        .await?
+        .iter()
+        // and then wait for it to be ready
+        .map(|object| wait_for_status(&kube, object))
+        .try_join_all()
+        .await?;
+
+    let results = DeployResult { exposed: vec![] };

     for pod in &chal.pods {
         let pod_image = chal.container_tag_for_pod(profile_name, &pod.name)?;
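An aside on the namespace hunk above: the method-style `.try_join_all()` presumably comes from an extension trait elsewhere in this crate. With the stock futures crate, the same concurrent wait inside deploy_single_challenge could be written with the free function, as in this sketch (assuming apply_manifest_yaml returns Vec<DynamicObject>):

// Sketch using futures::future::try_join_all directly: awaits every
// wait_for_status future concurrently and fails fast on the first error.
use futures::future::try_join_all;

let objects = apply_manifest_yaml(&kube, &ns_manifest).await?;
try_join_all(objects.iter().map(|object| wait_for_status(&kube, object))).await?;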
@@ -94,7 +104,26 @@
             "applying deployment for chal {:?} pod {:?}",
             chal.directory, pod.name
         );
-        apply_manifest_yaml(&kube, &depl_manifest).await?;
+        let depl = apply_manifest_yaml(&kube, &depl_manifest).await?;
+        for object in depl {
+            // wait for objects to be ready, with 5m timeout
+            timeout(Duration::from_secs(5 * 60), wait_for_status(&kube, &object))
+                .await
+                // timeout wraps with another Result
+                .with_context(|| {
+                    format!(
+                        "timed out waiting for chal {:?} pod {:?} deployment to become ready",
+                        chal.directory, pod.name
+                    )
+                })?
+                // inner result from wait_for_status
+                .with_context(|| {
+                    format!(
+                        "failed to get status for chal {:?} pod {:?} deployment",
+                        chal.directory, pod.name
+                    )
+                })?;
+        }

         // tcp and http exposes need to be handled separately, so separate them by type
         let (tcp_ports, http_ports): (Vec<_>, Vec<_>) = pod
@@ -113,7 +142,26 @@
                 "applying tcp service for chal {:?} pod {:?}",
                 chal.directory, pod.name
             );
-            apply_manifest_yaml(&kube, &tcp_manifest).await?;
+            let tcp = apply_manifest_yaml(&kube, &tcp_manifest).await?;
+            for object in tcp {
+                // wait for objects to be ready, with 5m timeout
+                timeout(Duration::from_secs(5 * 60), wait_for_status(&kube, &object))
+                    .await
+                    // timeout wraps with another Result
+                    .with_context(|| {
+                        format!(
+                            "timed out waiting for chal {:?} pod {:?} exposed TCP service to become ready",
+                            chal.directory, pod.name
+                        )
+                    })?
+                    // inner result from wait_for_status
+                    .with_context(|| {
+                        format!(
+                            "failed to get status for chal {:?} pod {:?} exposed TCP service",
+                            chal.directory, pod.name
+                        )
+                    })?;
+            }

             // TODO:
             // expose_results.exposed.push(PodDeployResult::Tcp { port: tcp_ports[0]. });
@@ -130,11 +178,30 @@
                 "applying http service and ingress for chal {:?} pod {:?}",
                 chal.directory, pod.name
             );
-            apply_manifest_yaml(&kube, &http_manifest).await?;
+            let ingress = apply_manifest_yaml(&kube, &http_manifest).await?;
+            for object in ingress {
+                // wait for objects to be ready, with 5m timeout
+                timeout(Duration::from_secs(5 * 60), wait_for_status(&kube, &object))
+                    .await
+                    // timeout wraps with another Result
+                    .with_context(|| {
+                        format!(
+                            "timed out waiting for chal {:?} pod {:?} ingress to become ready",
+                            chal.directory, pod.name
+                        )
+                    })?
+                    // inner result from wait_for_status
+                    .with_context(|| {
+                        format!(
+                            "failed to get status for chal {:?} pod {:?} ingress",
+                            chal.directory, pod.name
+                        )
+                    })?;
+            }
         }
     }

-    Ok(expose_results)
+    Ok(results)
 }

 // Updates the current ingress controller chart with the current set of TCP
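One editorial observation: the three per-resource wait blocks above are identical except for their error strings. A hypothetical helper (not part of this commit) could factor out the timeout-plus-context pattern:

// Hypothetical helper (not in this commit): wait for every applied object
// with the same 5-minute deadline, using `what` (e.g. "chal foo pod bar
// deployment") to label both failure modes.
async fn wait_all_ready(kube: &kube::Client, objects: &[DynamicObject], what: &str) -> Result<()> {
    for object in objects {
        timeout(Duration::from_secs(5 * 60), wait_for_status(kube, object))
            .await
            .with_context(|| format!("timed out waiting for {what} to become ready"))?
            .with_context(|| format!("failed to get status for {what}"))?;
    }
    Ok(())
}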
