Skip to content

Commit 3264533

Browse files
authored
Merge pull request #209 from apollographql/GT-121
Health Check Support
2 parents dc32d0b + bcd892c commit 3264533

File tree

11 files changed

+342
-4
lines changed

11 files changed

+342
-4
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### Health Check Support - @DaleSeo PR #209
2+
3+
Health reporting functionality has been added to make the MCP server ready for production deployment with proper health monitoring and Kubernetes integration.

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/apollo-mcp-server/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ bon = "3.6.3"
1717
clap = { version = "4.5.36", features = ["derive", "env"] }
1818
figment = { version = "0.10.19", features = ["env", "yaml"] }
1919
futures.workspace = true
20+
humantime-serde = "1.1.1"
2021
lz-str = "0.2.1"
2122
regex = "1.11.1"
2223
reqwest.workspace = true
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
//! Health Check module for Apollo MCP Server
2+
//!
3+
//! Provides liveness and readiness checks for the MCP server, inspired by Apollo Router's health check implementation.
4+
//!
5+
//! The health check is exposed via HTTP endpoints and can be used by load balancers, container orchestrators, and monitoring systems to determine server health.
6+
7+
use std::{
8+
sync::{
9+
Arc,
10+
atomic::{AtomicBool, AtomicUsize, Ordering},
11+
},
12+
time::Duration,
13+
};
14+
15+
use axum::http::StatusCode;
16+
use schemars::JsonSchema;
17+
use serde::{Deserialize, Serialize};
18+
use tokio::time::Instant;
19+
use tracing::debug;
20+
21+
/// Health status enumeration
22+
#[derive(Debug, Serialize)]
23+
#[serde(rename_all = "UPPERCASE")]
24+
pub enum HealthStatus {
25+
Up,
26+
Down,
27+
}
28+
29+
/// Health response structure
30+
#[derive(Debug, Serialize)]
31+
pub struct Health {
32+
status: HealthStatus,
33+
}
34+
35+
/// Configuration options for the readiness health interval sub-component.
36+
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
37+
#[serde(deny_unknown_fields)]
38+
#[serde(default)]
39+
pub struct ReadinessIntervalConfig {
40+
#[serde(deserialize_with = "humantime_serde::deserialize", default)]
41+
#[serde(serialize_with = "humantime_serde::serialize")]
42+
#[schemars(with = "Option<String>", default)]
43+
/// The sampling interval (default: 5s)
44+
pub sampling: Duration,
45+
46+
#[serde(deserialize_with = "humantime_serde::deserialize")]
47+
#[serde(serialize_with = "humantime_serde::serialize")]
48+
#[schemars(with = "Option<String>")]
49+
/// The unready interval (default: 2 * sampling interval)
50+
pub unready: Option<Duration>,
51+
}
52+
53+
impl Default for ReadinessIntervalConfig {
54+
fn default() -> Self {
55+
Self {
56+
sampling: Duration::from_secs(5),
57+
unready: None,
58+
}
59+
}
60+
}
61+
62+
/// Configuration options for the readiness health sub-component.
63+
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
64+
#[serde(deny_unknown_fields)]
65+
#[serde(default)]
66+
pub struct ReadinessConfig {
67+
/// The readiness interval configuration
68+
pub interval: ReadinessIntervalConfig,
69+
70+
/// How many rejections are allowed in an interval (default: 100)
71+
/// If this number is exceeded, the server will start to report unready.
72+
pub allowed: usize,
73+
}
74+
75+
impl Default for ReadinessConfig {
76+
fn default() -> Self {
77+
Self {
78+
interval: Default::default(),
79+
allowed: 100,
80+
}
81+
}
82+
}
83+
84+
/// Configuration options for the health check component.
85+
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
86+
#[serde(deny_unknown_fields)]
87+
#[serde(default)]
88+
pub struct HealthCheckConfig {
89+
/// Set to false to disable the health check
90+
pub enabled: bool,
91+
92+
/// Optionally set a custom healthcheck path
93+
/// Defaults to /health
94+
pub path: String,
95+
96+
/// Optionally specify readiness configuration
97+
pub readiness: ReadinessConfig,
98+
}
99+
100+
impl Default for HealthCheckConfig {
101+
fn default() -> Self {
102+
Self {
103+
enabled: false,
104+
path: "/health".to_string(),
105+
readiness: Default::default(),
106+
}
107+
}
108+
}
109+
110+
#[derive(Clone)]
111+
pub struct HealthCheck {
112+
config: HealthCheckConfig,
113+
live: Arc<AtomicBool>,
114+
ready: Arc<AtomicBool>,
115+
rejected: Arc<AtomicUsize>,
116+
ticker: Arc<tokio::task::JoinHandle<()>>,
117+
}
118+
119+
impl HealthCheck {
120+
pub fn new(config: HealthCheckConfig) -> Self {
121+
let live = Arc::new(AtomicBool::new(true)); // Start as live
122+
let ready = Arc::new(AtomicBool::new(true)); // Start as ready
123+
let rejected = Arc::new(AtomicUsize::new(0));
124+
125+
let allowed = config.readiness.allowed;
126+
let sampling_interval = config.readiness.interval.sampling;
127+
let recovery_interval = config
128+
.readiness
129+
.interval
130+
.unready
131+
.unwrap_or(2 * sampling_interval);
132+
133+
let my_rejected = rejected.clone();
134+
let my_ready = ready.clone();
135+
136+
let ticker = tokio::spawn(async move {
137+
loop {
138+
let start = Instant::now() + sampling_interval;
139+
let mut interval = tokio::time::interval_at(start, sampling_interval);
140+
loop {
141+
interval.tick().await;
142+
if my_rejected.load(Ordering::Relaxed) > allowed {
143+
debug!("Health check readiness threshold exceeded, marking as unready");
144+
my_ready.store(false, Ordering::SeqCst);
145+
tokio::time::sleep(recovery_interval).await;
146+
my_rejected.store(0, Ordering::Relaxed);
147+
my_ready.store(true, Ordering::SeqCst);
148+
debug!("Health check readiness restored");
149+
break;
150+
}
151+
}
152+
}
153+
});
154+
155+
Self {
156+
config,
157+
live,
158+
ready,
159+
rejected,
160+
ticker: Arc::new(ticker),
161+
}
162+
}
163+
164+
pub fn record_rejection(&self) {
165+
self.rejected.fetch_add(1, Ordering::Relaxed);
166+
}
167+
168+
pub fn config(&self) -> &HealthCheckConfig {
169+
&self.config
170+
}
171+
172+
pub fn get_health_state(&self, query: Option<&str>) -> (Health, StatusCode) {
173+
let mut status_code = StatusCode::OK;
174+
175+
let health = if let Some(query) = query {
176+
let query_upper = query.to_ascii_uppercase();
177+
178+
if query_upper.starts_with("READY") {
179+
let status = if self.ready.load(Ordering::SeqCst) {
180+
HealthStatus::Up
181+
} else {
182+
status_code = StatusCode::SERVICE_UNAVAILABLE;
183+
HealthStatus::Down
184+
};
185+
Health { status }
186+
} else if query_upper.starts_with("LIVE") {
187+
let status = if self.live.load(Ordering::SeqCst) {
188+
HealthStatus::Up
189+
} else {
190+
status_code = StatusCode::SERVICE_UNAVAILABLE;
191+
HealthStatus::Down
192+
};
193+
Health { status }
194+
} else {
195+
Health {
196+
status: HealthStatus::Up,
197+
}
198+
}
199+
} else {
200+
Health {
201+
status: HealthStatus::Up,
202+
}
203+
};
204+
205+
(health, status_code)
206+
}
207+
}
208+
209+
impl Drop for HealthCheck {
210+
fn drop(&mut self) {
211+
self.ticker.abort();
212+
}
213+
}
214+
215+
#[cfg(test)]
mod tests {
    use super::*;
    use tokio::time::{Duration, sleep};

    /// The default configuration is disabled and carries the documented default values.
    #[test]
    fn test_health_check_default_config() {
        let HealthCheckConfig {
            enabled,
            path,
            readiness,
        } = HealthCheckConfig::default();

        assert!(!enabled);
        assert_eq!(path, "/health");
        assert_eq!(readiness.allowed, 100);
        assert_eq!(readiness.interval.sampling, Duration::from_secs(5));
        assert!(readiness.interval.unready.is_none());
    }

    /// Exceeding the rejection budget flips readiness off while liveness stays on.
    #[tokio::test]
    async fn test_health_check_rejection_tracking() {
        let config = HealthCheckConfig {
            readiness: ReadinessConfig {
                allowed: 2,
                interval: ReadinessIntervalConfig {
                    sampling: Duration::from_millis(50),
                    unready: Some(Duration::from_millis(100)),
                },
            },
            ..HealthCheckConfig::default()
        };

        let health_check = HealthCheck::new(config);

        // Should be live and ready initially
        assert!(health_check.live.load(Ordering::SeqCst));
        assert!(health_check.ready.load(Ordering::SeqCst));

        // Record rejections beyond threshold
        (0..5).for_each(|_| health_check.record_rejection());

        // Wait for the ticker to process: it samples at 50ms and then stays unready for 100ms,
        // so at 100ms the server is still inside the unready window.
        sleep(Duration::from_millis(100)).await;

        // Should be still live but unready now
        assert!(health_check.live.load(Ordering::SeqCst));
        assert!(!health_check.ready.load(Ordering::SeqCst));
    }
}

crates/apollo-mcp-server/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ pub mod errors;
33
pub mod event;
44
mod explorer;
55
mod graphql;
6+
pub mod health;
67
mod introspection;
78
pub mod json_schema;
89
pub mod operations;

crates/apollo-mcp-server/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ async fn main() -> anyhow::Result<()> {
155155
)
156156
.search_leaf_depth(config.introspection.search.leaf_depth)
157157
.index_memory_bytes(config.introspection.search.index_memory_bytes)
158+
.health_check(config.health_check)
158159
.build()
159160
.start()
160161
.await?)

crates/apollo-mcp-server/src/runtime/config.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::path::PathBuf;
22

3-
use apollo_mcp_server::server::Transport;
3+
use apollo_mcp_server::{health::HealthCheckConfig, server::Transport};
44
use reqwest::header::HeaderMap;
55
use schemars::JsonSchema;
66
use serde::Deserialize;
@@ -30,6 +30,10 @@ pub struct Config {
3030
#[schemars(schema_with = "super::schemas::header_map")]
3131
pub headers: HeaderMap,
3232

33+
/// Health check configuration
34+
#[serde(default)]
35+
pub health_check: HealthCheckConfig,
36+
3337
/// Introspection configuration
3438
pub introspection: Introspection,
3539

crates/apollo-mcp-server/src/server.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use url::Url;
1010
use crate::custom_scalar_map::CustomScalarMap;
1111
use crate::errors::ServerError;
1212
use crate::event::Event as ServerEvent;
13+
use crate::health::HealthCheckConfig;
1314
use crate::operations::{MutationMode, OperationSource};
1415

1516
mod states;
@@ -36,6 +37,7 @@ pub struct Server {
3637
disable_schema_description: bool,
3738
search_leaf_depth: usize,
3839
index_memory_bytes: usize,
40+
health_check: HealthCheckConfig,
3941
}
4042

4143
#[derive(Debug, Clone, Deserialize, Default, JsonSchema)]
@@ -103,6 +105,7 @@ impl Server {
103105
disable_schema_description: bool,
104106
search_leaf_depth: usize,
105107
index_memory_bytes: usize,
108+
health_check: HealthCheckConfig,
106109
) -> Self {
107110
let headers = {
108111
let mut headers = headers.clone();
@@ -128,6 +131,7 @@ impl Server {
128131
disable_schema_description,
129132
search_leaf_depth,
130133
index_memory_bytes,
134+
health_check,
131135
}
132136
}
133137

crates/apollo-mcp-server/src/server/states.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use url::Url;
88
use crate::{
99
custom_scalar_map::CustomScalarMap,
1010
errors::{OperationError, ServerError},
11+
health::HealthCheckConfig,
1112
operations::MutationMode,
1213
};
1314

@@ -45,6 +46,7 @@ struct Config {
4546
disable_schema_description: bool,
4647
search_leaf_depth: usize,
4748
index_memory_bytes: usize,
49+
health_check: HealthCheckConfig,
4850
}
4951

5052
impl StateMachine {
@@ -76,6 +78,7 @@ impl StateMachine {
7678
disable_schema_description: server.disable_schema_description,
7779
search_leaf_depth: server.search_leaf_depth,
7880
index_memory_bytes: server.index_memory_bytes,
81+
health_check: server.health_check,
7982
},
8083
});
8184

0 commit comments

Comments
 (0)