Skip to content

Commit cd6ef2a

Browse files
authored
Add Prometheus metrics export endpoint for monitoring integration (#80)
1 parent 17c6d42 commit cd6ef2a

File tree

10 files changed

+620
-4
lines changed

10 files changed

+620
-4
lines changed

apps/mcp-server/README.md

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,139 @@ The server meets the following performance requirements:
167167
- **Mock Operations**: < 500ms per operation
168168
- **Memory Usage**: < 50MB
169169

170+
## 📈 Prometheus Metrics Endpoint
171+
172+
The server exposes a `/metrics` endpoint in Prometheus text format for integration with monitoring stacks like Prometheus, Grafana, and Datadog.
173+
174+
### Enabling Metrics
175+
176+
The `/metrics` endpoint is served by the health check server and is enabled by default whenever that server is running. Configure it with the following environment variables:
177+
178+
```bash
179+
# Enable health check server (required for /metrics)
180+
HEALTH_CHECK_ENABLED=true
181+
182+
# Optionally set a custom port (default: 8080)
183+
HEALTH_CHECK_PORT=8080
184+
185+
# Disable Prometheus metrics (enabled by default; only the exact value "false" disables them)
186+
PROMETHEUS_METRICS_ENABLED=false
187+
```
188+
189+
### Available Metrics
190+
191+
#### Authentication Metrics
192+
193+
| Metric | Type | Description |
194+
| ---------------------------------- | --------- | -------------------------------------------------------------------- |
195+
| `lighthouse_auth_total{status}` | Counter | Total authentication attempts by status (success, failure, fallback) |
196+
| `lighthouse_auth_duration_seconds` | Histogram | Authentication duration distribution |
197+
| `lighthouse_unique_api_keys` | Gauge | Number of unique API keys seen |
198+
199+
#### Cache Metrics
200+
201+
| Metric | Type | Description |
202+
| ------------------------------- | ------- | ---------------------------- |
203+
| `lighthouse_cache_hits_total` | Counter | Total cache hits |
204+
| `lighthouse_cache_misses_total` | Counter | Total cache misses |
205+
| `lighthouse_cache_size` | Gauge | Current cache size (entries) |
206+
| `lighthouse_cache_max_size` | Gauge | Maximum cache capacity |
207+
208+
#### Tool Metrics
209+
210+
| Metric | Type | Description |
211+
| ------------------------------------------------ | --------- | ----------------------------------- |
212+
| `lighthouse_tool_calls_total{tool}` | Counter | Total tool invocations by tool name |
213+
| `lighthouse_tools_registered` | Gauge | Number of registered tools |
214+
| `lighthouse_request_duration_seconds{operation}` | Histogram | Request duration by operation |
215+
216+
#### Security Metrics
217+
218+
| Metric | Type | Description |
219+
| ---------------------------------------- | ------- | --------------------------------------------------------------------------- |
220+
| `lighthouse_security_events_total{type}` | Counter | Security events by type (AUTHENTICATION_FAILURE, RATE_LIMIT_EXCEEDED, etc.) |
221+
222+
#### Storage Metrics
223+
224+
| Metric | Type | Description |
225+
| -------------------------------- | ----- | ------------------------------- |
226+
| `lighthouse_storage_files` | Gauge | Number of files in storage |
227+
| `lighthouse_storage_bytes` | Gauge | Total storage usage in bytes |
228+
| `lighthouse_storage_max_bytes` | Gauge | Maximum storage capacity |
229+
| `lighthouse_storage_utilization` | Gauge | Storage utilization ratio (0-1) |
230+
231+
#### Service Pool Metrics
232+
233+
| Metric | Type | Description |
234+
| ---------------------------------- | ----- | ----------------------------- |
235+
| `lighthouse_service_pool_size` | Gauge | Current service pool size |
236+
| `lighthouse_service_pool_max_size` | Gauge | Maximum service pool capacity |
237+
238+
#### Process Metrics (Auto-collected)
239+
240+
| Metric | Type | Description |
241+
| ------------------------------------------ | ------- | ----------------------- |
242+
| `lighthouse_process_cpu_seconds_total` | Counter | Total CPU time consumed |
243+
| `lighthouse_process_resident_memory_bytes` | Gauge | Resident memory size |
244+
| `lighthouse_nodejs_eventloop_lag_seconds` | Gauge | Node.js event loop lag |
245+
| `lighthouse_nodejs_heap_size_total_bytes` | Gauge | Total heap size |
246+
| `lighthouse_nodejs_heap_size_used_bytes` | Gauge | Used heap size |
247+
248+
### Example Output
249+
250+
```prometheus
251+
# HELP lighthouse_auth_total Total authentication attempts
252+
# TYPE lighthouse_auth_total counter
253+
lighthouse_auth_total{status="success"} 1542
254+
lighthouse_auth_total{status="failure"} 23
255+
lighthouse_auth_total{status="fallback"} 156
256+
257+
# HELP lighthouse_cache_hits_total Total cache hits
258+
# TYPE lighthouse_cache_hits_total counter
259+
lighthouse_cache_hits_total 12453
260+
261+
# HELP lighthouse_cache_misses_total Total cache misses
262+
# TYPE lighthouse_cache_misses_total counter
263+
lighthouse_cache_misses_total 1847
264+
265+
# HELP lighthouse_request_duration_seconds Request duration in seconds
266+
# TYPE lighthouse_request_duration_seconds histogram
267+
lighthouse_request_duration_seconds_bucket{operation="lighthouse_upload_file",le="0.1"} 234
268+
lighthouse_request_duration_seconds_bucket{operation="lighthouse_upload_file",le="0.5"} 892
269+
lighthouse_request_duration_seconds_bucket{operation="lighthouse_upload_file",le="1"} 1023
270+
lighthouse_request_duration_seconds_bucket{operation="lighthouse_upload_file",le="+Inf"} 1024
271+
lighthouse_request_duration_seconds_sum{operation="lighthouse_upload_file"} 342.87
272+
lighthouse_request_duration_seconds_count{operation="lighthouse_upload_file"} 1024
273+
274+
# HELP lighthouse_security_events_total Total security events by type
275+
# TYPE lighthouse_security_events_total counter
276+
lighthouse_security_events_total{type="AUTHENTICATION_FAILURE"} 23
277+
lighthouse_security_events_total{type="RATE_LIMIT_EXCEEDED"} 5
278+
```
279+
280+
### Prometheus Configuration
281+
282+
Add this scrape configuration to your `prometheus.yml`:
283+
284+
```yaml
285+
scrape_configs:
286+
  - job_name: "lighthouse-mcp-server"
287+
    static_configs:
288+
      - targets: ["localhost:8080"]
289+
    metrics_path: /metrics
290+
    scrape_interval: 15s
291+
```
292+
293+
### Grafana Dashboard
294+
295+
Add your Prometheus server as a data source in Grafana, then build dashboards to visualize:
296+
297+
- Authentication success/failure rates
298+
- Cache hit rate over time
299+
- Tool usage patterns
300+
- Storage utilization trends
301+
- Security event alerts
302+
170303
## 🧪 Testing
171304
172305
```bash

apps/mcp-server/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
"@lighthouse-tooling/shared": "workspace:*",
2222
"@modelcontextprotocol/sdk": "^1.0.4",
2323
"better-sqlite3": "^11.7.0",
24-
"dotenv": "^16.4.5"
24+
"dotenv": "^16.4.5",
25+
"prom-client": "^15.1.0"
2526
},
2627
"devDependencies": {
2728
"@types/better-sqlite3": "^7.6.12",

apps/mcp-server/src/auth/AuthManager.ts

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,19 @@ import { AuthConfig, ValidationResult, AuthenticationResult } from "./types.js";
66
import { KeyValidationCache } from "./KeyValidationCache.js";
77
import { RateLimiter } from "./RateLimiter.js";
88
import { SecureKeyHandler } from "./SecureKeyHandler.js";
9+
import { MetricsCollector } from "./MetricsCollector.js";
910

1011
export class AuthManager {
1112
private config: AuthConfig;
1213
private cache: KeyValidationCache;
1314
private rateLimiter: RateLimiter;
15+
private metricsCollector: MetricsCollector;
1416

1517
constructor(config: AuthConfig) {
1618
this.config = config;
1719
this.cache = new KeyValidationCache(config.keyValidationCache);
1820
this.rateLimiter = new RateLimiter(config.rateLimiting);
21+
this.metricsCollector = new MetricsCollector();
1922
}
2023

2124
/**
@@ -38,8 +41,10 @@ export class AuthManager {
3841
// Check cache first
3942
const cached = this.cache.get(keyHash);
4043
if (cached) {
44+
this.metricsCollector.recordCacheAccess(true);
4145
return cached;
4246
}
47+
this.metricsCollector.recordCacheAccess(false);
4348

4449
// Check rate limiting
4550
const rateLimitResult = this.rateLimiter.isAllowed(keyHash);
@@ -115,23 +120,33 @@ export class AuthManager {
115120

116121
const validation = await this.validateApiKey(effectiveKey);
117122

118-
return {
123+
const result: AuthenticationResult = {
119124
success: validation.isValid,
120125
keyHash: validation.keyHash,
121126
usedFallback,
122127
rateLimited: validation.rateLimitInfo?.remaining === 0 || false,
123128
authTime: Date.now() - startTime,
124129
errorMessage: validation.errorMessage,
125130
};
131+
132+
// Record metrics
133+
this.metricsCollector.recordAuthentication(result);
134+
135+
return result;
126136
} catch (error) {
127-
return {
137+
const result: AuthenticationResult = {
128138
success: false,
129139
keyHash: "unknown",
130140
usedFallback: false,
131141
rateLimited: false,
132142
authTime: Date.now() - startTime,
133143
errorMessage: error instanceof Error ? error.message : "Authentication failed",
134144
};
145+
146+
// Record failed authentication
147+
this.metricsCollector.recordAuthentication(result);
148+
149+
return result;
135150
}
136151
}
137152

@@ -166,6 +181,13 @@ export class AuthManager {
166181
return this.cache.getStats();
167182
}
168183

184+
/**
185+
* Get metrics collector for Prometheus export
186+
*/
187+
getMetricsCollector(): MetricsCollector {
188+
return this.metricsCollector;
189+
}
190+
169191
/**
170192
* Get rate limiter status for a key
171193
*/
@@ -204,5 +226,6 @@ export class AuthManager {
204226
destroy(): void {
205227
this.cache.destroy();
206228
this.rateLimiter.destroy();
229+
this.metricsCollector.destroy();
207230
}
208231
}

apps/mcp-server/src/auth/MetricsCollector.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,16 @@ export class MetricsCollector {
215215
};
216216
}
217217

218+
/**
219+
* Get raw cache counters for Prometheus export
220+
*/
221+
getCacheCounters(): { hits: number; misses: number } {
222+
return {
223+
hits: this.cacheHits,
224+
misses: this.cacheMisses,
225+
};
226+
}
227+
218228
/**
219229
* Get security events within time range
220230
*/

apps/mcp-server/src/config/server-config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ export const DEFAULT_HEALTH_CHECK_CONFIG: HealthCheckConfig = {
9696
lighthouseApiUrl: process.env.LIGHTHOUSE_API_URL || "https://api.lighthouse.storage",
9797
connectivityCheckInterval: 30000,
9898
connectivityTimeout: 5000,
99+
metricsEnabled: process.env.PROMETHEUS_METRICS_ENABLED !== "false", // Enabled by default
99100
};
100101

101102
export const DEFAULT_ORGANIZATION_SETTINGS: OrganizationSettings = {

apps/mcp-server/src/health/HealthCheckServer.ts

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
22
* Health Check HTTP Server
33
*
4-
* Provides /health (liveness) and /ready (readiness) endpoints
4+
* Provides /health (liveness), /ready (readiness), and /metrics (Prometheus) endpoints
55
* on a configurable port, separate from the MCP stdio transport.
66
*/
77

@@ -14,6 +14,7 @@ import { ILighthouseService } from "../services/ILighthouseService.js";
1414
import { ToolRegistry } from "../registry/ToolRegistry.js";
1515
import { ServerConfig } from "../config/server-config.js";
1616
import { HealthCheckConfig, HealthStatus, ReadinessCheck, ReadinessStatus } from "./types.js";
17+
import { PrometheusExporter } from "./PrometheusExporter.js";
1718

1819
export interface HealthCheckDependencies {
1920
authManager: AuthManager;
@@ -30,6 +31,7 @@ export class HealthCheckServer {
3031
private deps: HealthCheckDependencies;
3132
private healthConfig: HealthCheckConfig;
3233
private logger: Logger;
34+
private prometheusExporter: PrometheusExporter | null = null;
3335

3436
private lastConnectivityCheck: {
3537
up: boolean;
@@ -41,6 +43,16 @@ export class HealthCheckServer {
4143
this.deps = deps;
4244
this.healthConfig = healthConfig;
4345
this.logger = deps.logger;
46+
47+
// Initialize Prometheus exporter if metrics are enabled
48+
if (healthConfig.metricsEnabled !== false) {
49+
this.prometheusExporter = new PrometheusExporter({
50+
metricsCollector: deps.authManager.getMetricsCollector(),
51+
registry: deps.registry,
52+
serviceFactory: deps.serviceFactory,
53+
lighthouseService: deps.lighthouseService,
54+
});
55+
}
4456
}
4557

4658
async start(): Promise<void> {
@@ -109,6 +121,12 @@ export class HealthCheckServer {
109121
this.sendJSON(res, 500, { error: "Internal server error" });
110122
});
111123
break;
124+
case "/metrics":
125+
this.handleMetrics(res).catch((err) => {
126+
this.logger.error("Metrics export failed", err);
127+
this.sendJSON(res, 500, { error: "Internal server error" });
128+
});
129+
break;
112130
default:
113131
this.sendJSON(res, 404, { error: "Not found" });
114132
break;
@@ -154,6 +172,27 @@ export class HealthCheckServer {
154172
this.sendJSON(res, allUp ? 200 : 503, status);
155173
}
156174

175+
private async handleMetrics(res: http.ServerResponse): Promise<void> {
176+
if (!this.prometheusExporter) {
177+
this.sendJSON(res, 404, { error: "Metrics endpoint not enabled" });
178+
return;
179+
}
180+
181+
try {
182+
const metrics = await this.prometheusExporter.getMetrics();
183+
const contentType = this.prometheusExporter.getContentType();
184+
185+
res.writeHead(200, {
186+
"Content-Type": contentType,
187+
"Cache-Control": "no-cache, no-store",
188+
});
189+
res.end(metrics);
190+
} catch (err) {
191+
this.logger.error("Failed to generate metrics", err as Error);
192+
this.sendJSON(res, 500, { error: "Failed to generate metrics" });
193+
}
194+
}
195+
157196
private checkSDK(): ReadinessCheck {
158197
try {
159198
const stats = this.deps.lighthouseService.getStorageStats();

0 commit comments

Comments
 (0)