Skip to content

Commit 77ff4f2

Browse files
authored
Merge pull request #9 from Western-1/feature/add-monitoring-stack
feat: Add comprehensive monitoring stack with Prometheus and Grafana
2 parents 23a7073 + fd96215 commit 77ff4f2

File tree

16 files changed

+1575
-234
lines changed

16 files changed

+1575
-234
lines changed

images/Grafana_full_load_test.png

549 KB
Loading

images/Grafana_load_test.png

596 KB
Loading
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
---
# Alertmanager configuration: a single top-level route that sends every alert
# to one webhook receiver.
route:
  # Alerts that share the same `alertname` label are batched together.
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group.
  group_wait: 30s
  # How long to wait before notifying about new alerts added to a group.
  group_interval: 5m
  # How long to wait before re-sending a notification that is still firing.
  repeat_interval: 1h
  # Must match the `name` of an entry under `receivers` below.
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # NOTE(review): assumes the churn-inference service accepts Alertmanager
      # webhook POSTs at /webhook on port 8000 — confirm against that service.
      - url: 'http://churn-inference:8000/webhook'

monitoring/alerts.yml

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
---
# Prometheus alerting rules for the churn model, its input data, the
# inference API and the host it runs on.
groups:
  - name: churn_model_alerts
    # Evaluation interval applied to every rule in this group.
    interval: 30s
    rules:
      # Model Performance Alerts

      # Fires when reported accuracy stays under 0.75 for 10 minutes.
      - alert: ModelAccuracyDegraded
        expr: churn_model_accuracy < 0.75
        for: 10m
        labels:
          severity: warning
          component: model
        annotations:
          summary: "Model accuracy has degraded"
          description: "Model accuracy is {{ $value }}, below threshold of 0.75"

      # Fires when reported AUC-ROC stays under 0.80 for 10 minutes.
      - alert: ModelAUCDegraded
        expr: churn_model_auc_roc < 0.80
        for: 10m
        labels:
          severity: warning
          component: model
        annotations:
          summary: "Model AUC-ROC has degraded"
          description: "Model AUC-ROC is {{ $value }}, below threshold of 0.80"

      # Data Quality Alerts

      # Fires when the drift flag has been 1 for 5 minutes.
      - alert: DataDriftDetected
        expr: churn_data_drift_detected == 1
        for: 5m
        labels:
          severity: critical
          component: data
        annotations:
          summary: "Data drift has been detected"
          description: "Significant data drift detected in production data"

      # Fires when the quality score stays under 0.85 for 15 minutes.
      - alert: DataQualityLow
        expr: churn_data_quality_score < 0.85
        for: 15m
        labels:
          severity: warning
          component: data
        annotations:
          summary: "Data quality score is low"
          description: "Data quality score is {{ $value }}, below threshold of 0.85"

      # API Performance Alerts

      # p95 latency, estimated from the histogram buckets' 5-minute rate,
      # above 1 second for 5 minutes.
      - alert: HighPredictionLatency
        expr: histogram_quantile(0.95, rate(churn_prediction_latency_seconds_bucket[5m])) > 1.0
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "High prediction latency detected"
          description: "95th percentile latency is {{ $value }}s, above 1.0s threshold"

      # Threshold is an absolute rate (errors per second), not a ratio of
      # failed to total predictions.
      - alert: PredictionErrorRateHigh
        expr: rate(churn_prediction_errors_total[5m]) > 0.01
        for: 5m
        labels:
          severity: critical
          component: api
        annotations:
          summary: "High prediction error rate"
          description: "Error rate is {{ $value }} errors/sec"

      # `up` is 0 when Prometheus fails to scrape the target; the job label
      # must match the scrape job name configured in prometheus.yml.
      - alert: APIDown
        expr: up{job="churn-inference"} == 0
        for: 2m
        labels:
          severity: critical
          component: api
        annotations:
          summary: "Churn Inference API is down"
          description: "API has been down for more than 2 minutes"

      # System Alerts

      # Used-memory fraction (total minus available, over total) above 0.9.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 90%"

      # CPU busy percentage per instance, derived as 100 minus the average
      # idle-mode rate expressed as a percentage.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is above 80%"
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
---
# Docker Compose definition of the monitoring stack: Prometheus, Grafana,
# node-exporter and Alertmanager, all attached to a pre-existing network.
#
# NOTE(review): Compose v2 treats the top-level `version` key as obsolete and
# ignores it; it is kept here for older tooling.
version: '3.8'

services:
  # Prometheus
  prometheus:
    image: prom/prometheus:latest
    container_name: churn-prometheus
    # Setting `command` replaces the image's default arguments, so the config
    # file and storage path are passed explicitly.
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
      - '--storage.tsdb.retention.time=30d'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - churn-pipeline_churn-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Grafana
  grafana:
    image: grafana/grafana:latest
    container_name: churn-grafana
    environment:
      # Credentials can be overridden from the shell or an .env file; the
      # defaults preserve the previous hard-coded admin/admin login.
      # Override them for anything other than local development.
      - GF_SECURITY_ADMIN_USER=${GF_SECURITY_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=http://localhost:3000
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    ports:
      - "3000:3000"
    networks:
      - churn-pipeline_churn-network
    # Short-form depends_on only orders container start; it does not wait for
    # the Prometheus healthcheck to pass.
    depends_on:
      - prometheus
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Node Exporter
  node-exporter:
    image: prom/node-exporter:latest
    container_name: churn-node-exporter
    # Host /proc, /sys and / are mounted read-only under /host (see volumes)
    # and the exporter is pointed at those paths.
    command:
      - '--path.rootfs=/host'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--no-collector.netstat'
      - '--no-collector.softnet'
      - '--no-collector.filesystem'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host:ro
    ports:
      - "9100:9100"
    networks:
      - churn-pipeline_churn-network
    restart: unless-stopped

  # Alert Manager
  alertmanager:
    image: prom/alertmanager:latest
    container_name: churn-alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    ports:
      - "9093:9093"
    networks:
      - churn-pipeline_churn-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

# Named volumes that persist each service's data across container restarts.
volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

# The network is declared external, so Compose expects it to exist already
# and does not create it.
networks:
  churn-pipeline_churn-network:
    external: true
    name: churn-pipeline_churn-network

0 commit comments

Comments
 (0)