Western-1
diff --git a/‎images/Grafana_full_load_test.png‎
549 KB b/‎images/Grafana_full_load_test.png‎
549 KB
diff --git a/‎images/Grafana_load_test.png‎
596 KB b/‎images/Grafana_load_test.png‎
596 KB
diff --git a/‎monitoring/alertmanager/alertmanager.yml‎
Lines changed: 11 additions & 0 deletions b/‎monitoring/alertmanager/alertmanager.yml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎monitoring/alerts.yml‎
Lines changed: 97 additions & 0 deletions b/‎monitoring/alerts.yml‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎monitoring/docker-compose.monitoring.yml‎
Lines changed: 107 additions & 0 deletions b/‎monitoring/docker-compose.monitoring.yml‎
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,11 @@
+route:
+  receiver: web.hook  # default receiver; must match a `name` under `receivers`
+  group_by: [alertname]  # alerts sharing an alertname are batched together
+  group_wait: 30s  # wait before sending the first notification for a new group
+  group_interval: 5m  # wait before notifying about new alerts in an existing group
+  repeat_interval: 1h  # re-send interval while the group is still firing
+
+receivers:
+  - name: web.hook
+    webhook_configs:
+      - url: http://churn-inference:8000/webhook  # notifications are sent to this HTTP endpoint
@@ -0,0 +1,97 @@
+groups:
+  - name: 'churn_model_alerts'  # rule group containing the alerts listed under `rules`
+    interval: '30s'  # how often the rules of this group are evaluated
+    rules:
+      # Model Performance Alerts
+      - alert: ModelAccuracyDegraded
+        expr: 'churn_model_accuracy < 0.75'  # true while the accuracy metric is under 0.75
+        for: 10m  # must stay true for 10 minutes before the alert fires
+        annotations:
+          summary: 'Model accuracy has degraded'
+          description: 'Model accuracy is {{ $value }}, below threshold of 0.75'
+        labels:
+          component: model
+          severity: warning
+
+      - alert: ModelAUCDegraded
+        expr: 'churn_model_auc_roc < 0.80'  # true while the AUC-ROC metric is under 0.80
+        for: 10m  # must stay true for 10 minutes before the alert fires
+        annotations:
+          summary: 'Model AUC-ROC has degraded'
+          description: 'Model AUC-ROC is {{ $value }}, below threshold of 0.80'
+        labels:
+          component: model
+          severity: warning
+
+      # Data Quality Alerts
+      - alert: DataDriftDetected
+        expr: 'churn_data_drift_detected == 1'  # true while the drift metric equals 1
+        for: 5m  # must stay true for 5 minutes before the alert fires
+        annotations:
+          summary: 'Data drift has been detected'
+          description: 'Significant data drift detected in production data'
+        labels:
+          component: data
+          severity: critical
+
+      - alert: DataQualityLow
+        expr: 'churn_data_quality_score < 0.85'  # true while the quality score is under 0.85
+        for: 15m  # must stay true for 15 minutes before the alert fires
+        annotations:
+          summary: 'Data quality score is low'
+          description: 'Data quality score is {{ $value }}, below threshold of 0.85'
+        labels:
+          component: data
+          severity: warning
+
+      # API Performance Alerts
+      - alert: HighPredictionLatency
+        # p95 over a 5m window, computed per series (no aggregation across label sets); threshold is 1.0 seconds
+        expr: 'histogram_quantile(0.95, rate(churn_prediction_latency_seconds_bucket[5m])) > 1.0'
+        for: 5m
+        annotations:
+          summary: 'High prediction latency detected'
+          description: '95th percentile latency is {{ $value }}s, above 1.0s threshold'
+        labels: {component: api, severity: warning}
+
+      - alert: PredictionErrorRateHigh
+        expr: 'rate(churn_prediction_errors_total[5m]) > 0.01'  # absolute errors per second, not a ratio
+        for: 5m  # must stay true for 5 minutes before the alert fires
+        annotations:
+          summary: 'High prediction error rate'
+          description: 'Error rate is {{ $value }} errors/sec'
+        labels:
+          component: api
+          severity: critical
+
+      - alert: APIDown
+        expr: 'up{job="churn-inference"} == 0'  # scrape of the churn-inference job is failing
+        for: 2m  # must stay true for 2 minutes before the alert fires
+        annotations:
+          summary: 'Churn Inference API is down'
+          description: 'API has been down for more than 2 minutes'
+        labels:
+          component: api
+          severity: critical
+
+      # System Alerts
+      - alert: HighMemoryUsage
+        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9  # used fraction, 0-1
+        for: 5m  # must stay true for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          component: system
+        annotations:
+          summary: "High memory usage"
+          description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanizePercentage }}, above the 90% threshold'
+
+      - alert: HighCPUUsage
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80  # non-idle CPU percent per instance
+        for: 5m  # must stay true for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          component: system
+        annotations:
+          summary: "High CPU usage"
+          description: 'CPU usage on {{ $labels.instance }} is {{ $value | printf "%.1f" }}%, above the 80% threshold'
@@ -0,0 +1,107 @@
+version: '3.8'  # Compose file format version; current Docker Compose releases treat this key as obsolete and ignore it
+
+services:
+  # Prometheus server (UI/API published on host port 9090)
+  prometheus:
+    container_name: churn-prometheus
+    image: prom/prometheus:latest  # floating tag: resolves to whatever is newest at pull time
+    restart: unless-stopped
+    networks:
+      - churn-pipeline_churn-network
+    ports:
+      - '9090:9090'
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
+      - prometheus_data:/prometheus  # named volume backing --storage.tsdb.path
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --web.console.libraries=/usr/share/prometheus/console_libraries
+      - --web.console.templates=/usr/share/prometheus/consoles
+      - --web.enable-lifecycle  # turns on the HTTP reload/quit endpoints
+      - --storage.tsdb.retention.time=30d  # keep 30 days of samples
+    healthcheck:
+      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://localhost:9090/-/healthy']
+      retries: 3
+      timeout: 10s
+      interval: 30s
+
+  # Grafana (UI on host port 3000); admin credentials come from GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD
+  grafana:
+    image: grafana/grafana:latest
+    container_name: churn-grafana
+    environment:
+      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}  # falls back to `admin` when unset
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}  # set in the shell or .env; `admin` is a dev-only fallback
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_SERVER_ROOT_URL=http://localhost:3000
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning:ro
+      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
+    ports:
+      - "3000:3000"
+    networks:
+      - churn-pipeline_churn-network
+    depends_on:
+      - prometheus  # start order only; does not wait for Prometheus to be healthy
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # Node Exporter (host metrics, published on host port 9100)
+  node-exporter:
+    container_name: churn-node-exporter
+    image: prom/node-exporter:latest
+    restart: unless-stopped
+    networks:
+      - churn-pipeline_churn-network
+    ports:
+      - '9100:9100'
+    volumes:
+      - /proc:/host/proc:ro  # host paths are mounted read-only under /host
+      - /sys:/host/sys:ro
+      - /:/host:ro
+    command:
+      - --path.rootfs=/host
+      - --path.procfs=/host/proc
+      - --path.sysfs=/host/sys
+      - --no-collector.netstat  # netstat, softnet and filesystem collectors are disabled
+      - --no-collector.softnet
+      - --no-collector.filesystem
+
+  # Alertmanager (UI/API published on host port 9093)
+  alertmanager:
+    container_name: churn-alertmanager
+    image: prom/alertmanager:latest
+    restart: unless-stopped
+    networks:
+      - churn-pipeline_churn-network
+    ports:
+      - '9093:9093'
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager_data:/alertmanager  # named volume backing --storage.path
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+      - --web.external-url=http://localhost:9093
+    healthcheck:
+      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://localhost:9093/-/healthy']
+      retries: 3
+      timeout: 10s
+      interval: 30s
+
+volumes:
+  prometheus_data: {}  # mounted at /prometheus in the prometheus service
+  grafana_data: {}  # mounted at /var/lib/grafana in the grafana service
+  alertmanager_data: {}  # mounted at /alertmanager in the alertmanager service
+
+networks:
+  churn-pipeline_churn-network:
+    name: churn-pipeline_churn-network
+    external: true  # network must already exist; this file does not create it