Increase server-side healthcheck timeout, improve probe failure logging

njhill · njhill · commit d4d96cef5ec0 · 2024-01-12T14:47:47.000-08:00
diff --git a/router/src/health.rs b/router/src/health.rs
@@ -1,6 +1,7 @@
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use tokenizers::Tokenizer;
+use tokio::time::Instant;
 use text_generation_client::{Batch, NextTokenChooserParameters, Request, ShardedClient};
 
 const TEST_INPUT: &str = "liveness";
@@ -25,9 +26,15 @@ impl Health {
     }
 
     pub(crate) async fn check(&mut self) -> bool {
-        if self.generation_health.load(Ordering::SeqCst) {
+        let generation_healthy = self.generation_health.load(Ordering::SeqCst);
+
+        let mut guard = Guard{ prefill: !generation_healthy, start_time: Some(Instant::now()) };
+
+        let ok = if generation_healthy {
             // Generation is healthy, we only check that the shards are answering gRPC calls
-            self.client.health().await.is_ok()
+            self.client.health().await
+                .map_err(|err| tracing::error!("Basic shard healthcheck error: {err}"))
+                .is_ok()
         } else {
             // Generation is unhealthy or have not sent any generation request yet
 
@@ -51,13 +58,32 @@ impl Health {
                 requests: vec![liveness_request],
                 total_tokens: 1,
             };
-            // Skips the queue
+            // Skips the queue, but will still be serialized behind in-flight prefill/next_token requests
             let value = self.client.prefill(batch, vec![]).await
-                .map_err(|err| tracing::error!("Healthcheck error: {err}"))
+                .map_err(|err| tracing::error!("Prefill healthcheck error: {err}"))
                 .is_ok();
             // Update generation health
             self.generation_health.store(value, Ordering::SeqCst);
             value
+        };
+        guard.start_time = None;
+        ok
+    }
+}
+
+struct Guard { // drop-time reporter for a health check that never reaches its completion point
+    prefill: bool, // picks the "prefill" vs "basic shard" label used in the drop warning
+    start_time: Option<Instant>, // when the check began; None once completed (suppresses the warning)
+}
+
+impl Drop for Guard { // warns when the guard is dropped before check() has cleared start_time
+    fn drop(&mut self) {
+        if let Some(start_time) = self.start_time { // still set: the check did not run to completion
+            tracing::warn!(
+                "Healthcheck request cancelled during {} check after {}ms",
+                if self.prefill { "prefill" } else { "basic shard" },
+                start_time.elapsed().as_millis(), // milliseconds elapsed since the stored start_time
+            )
         }
     }
 }
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -38,24 +38,24 @@ pub(crate) struct ServerState {
     pub(crate) seq2seq: bool,
 }
 
+// This is a safety-net timeout; the client (e.g. kubelet) is expected to be
+// configured with a shorter one
+const PROBE_TIMEOUT_SECS: u64 = 60;
+
 /// Health check method
 #[instrument(skip(health))]
 async fn health(mut health: Extension<Health>) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
-    match timeout(Duration::from_secs(5), health.check()).await {
+    match timeout(Duration::from_secs(PROBE_TIMEOUT_SECS), health.check()).await {
         Ok(true) => Ok(()),
         Ok(false) => Err((
             StatusCode::SERVICE_UNAVAILABLE,
-            Json(ErrorResponse {
-                error: "unhealthy".to_string(),
-            }),
+            Json(ErrorResponse { error: "unhealthy".to_string() }),
         )),
         Err(_) => {
-            tracing::error!("Healthcheck request timed-out");
+            tracing::error!("Aborting health-check request after {PROBE_TIMEOUT_SECS}s time-out");
             Err((
-                StatusCode::REQUEST_TIMEOUT,
-                Json(ErrorResponse {
-                    error: "Healthcheck timed-out".to_string(),
-                }),
+                StatusCode::SERVICE_UNAVAILABLE,
+                Json(ErrorResponse { error: "Healthcheck timed-out".to_string() }),
             ))
         }
     }