diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 3ba9d01..306b9cc 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -5,13 +5,15 @@ on: workflows: [CI Pipeline] types: - completed + conclusion: success + workflow_dispatch: jobs: # Check if we should deploy based on the workflow_run event evaluate_deployment: name: Evaluate Deployment Conditions runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} + #if: ${{ github.event.workflow_run.conclusion == 'success' }} outputs: deploy_environment: ${{ steps.set-env.outputs.environment }} should_deploy: ${{ steps.set-env.outputs.should_deploy }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0339476..3713f3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -289,9 +289,14 @@ jobs: id: build-args run: | if [[ "${{ matrix.service }}" == "client" ]]; then - if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then + # Detect target branch for push or pull_request + TARGET_BRANCH="${{ github.ref }}" + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + TARGET_BRANCH="${{ github.base_ref }}" + fi + if [[ "$TARGET_BRANCH" == "refs/heads/main" || "$TARGET_BRANCH" == "main" ]]; then echo "build_env=production" >> $GITHUB_OUTPUT - elif [[ "${{ github.ref }}" == "refs/heads/dev" ]]; then + elif [[ "$TARGET_BRANCH" == "refs/heads/dev" || "$TARGET_BRANCH" == "dev" ]]; then echo "build_env=staging" >> $GITHUB_OUTPUT else echo "build_env=development" >> $GITHUB_OUTPUT diff --git a/gateway/src/main/java/de/tum/aet/devops25/JwtAuthenticationFilter.java b/gateway/src/main/java/de/tum/aet/devops25/JwtAuthenticationFilter.java index efe6758..41c1b1f 100644 --- a/gateway/src/main/java/de/tum/aet/devops25/JwtAuthenticationFilter.java +++ b/gateway/src/main/java/de/tum/aet/devops25/JwtAuthenticationFilter.java @@ -25,9 +25,15 @@ public Mono filter(ServerWebExchange exchange, WebFilterChain chain) { String path = 
exchange.getRequest().getURI().getPath(); String method = exchange.getRequest().getMethod().toString(); - // Skip authentication for actuator endpoints - if (path.startsWith("/actuator/")) { - //System.out.println("[AUTH_DEBUG] Skipping authentication for actuator endpoint: " + path); + // Skip authentication for endpoints that don't require it + if (path.startsWith("/actuator/") || + path.equals("/health") || + path.equals("/api/health") || + path.equals("/auth/register") || + path.equals("/api/auth/register") || + path.equals("/auth/login") || + path.equals("/api/auth/login")) { + System.out.println("[AUTH_DEBUG] Skipping authentication for public endpoint: " + path); return chain.filter(exchange); } diff --git a/genai-svc/requirements.txt b/genai-svc/requirements.txt index 39fb0ba..0e00bbf 100644 --- a/genai-svc/requirements.txt +++ b/genai-svc/requirements.txt @@ -40,3 +40,6 @@ boto3 >= 1.29.0 botocore >= 1.32.0 pypdf >= 3.15.1 python-docx >= 0.8.11 + +# Monitoring and metrics +prometheus_flask_exporter >= 0.20.0 diff --git a/helm/ai-event-concepter/templates/client-service.yaml b/helm/ai-event-concepter/templates/client-service.yaml index e4f711f..8e7f1e0 100644 --- a/helm/ai-event-concepter/templates/client-service.yaml +++ b/helm/ai-event-concepter/templates/client-service.yaml @@ -12,4 +12,5 @@ spec: - port: {{ .Values.client.service.port }} targetPort: {{ .Values.client.service.targetPort }} protocol: TCP + name: http type: {{ .Values.client.service.type }} diff --git a/helm/ai-event-concepter/templates/concept-svc-service.yaml b/helm/ai-event-concepter/templates/concept-svc-service.yaml index 71f1c52..157c6ce 100644 --- a/helm/ai-event-concepter/templates/concept-svc-service.yaml +++ b/helm/ai-event-concepter/templates/concept-svc-service.yaml @@ -12,4 +12,5 @@ spec: - port: {{ .Values.conceptsvc.service.port }} targetPort: {{ .Values.conceptsvc.service.targetPort }} protocol: TCP + name: http type: {{ .Values.conceptsvc.service.type }} diff --git 
a/helm/ai-event-concepter/templates/gateway-service.yaml b/helm/ai-event-concepter/templates/gateway-service.yaml index 687253e..e32d595 100644 --- a/helm/ai-event-concepter/templates/gateway-service.yaml +++ b/helm/ai-event-concepter/templates/gateway-service.yaml @@ -12,4 +12,5 @@ spec: - port: {{ .Values.gateway.service.port }} targetPort: {{ .Values.gateway.service.targetPort }} protocol: TCP + name: http type: {{ .Values.gateway.service.type }} diff --git a/helm/ai-event-concepter/templates/genai-svc-minio-service.yaml b/helm/ai-event-concepter/templates/genai-svc-minio-service.yaml index 472f6ee..88cd479 100644 --- a/helm/ai-event-concepter/templates/genai-svc-minio-service.yaml +++ b/helm/ai-event-concepter/templates/genai-svc-minio-service.yaml @@ -2,6 +2,9 @@ apiVersion: v1 kind: Service metadata: name: genai-svc-minio + labels: + app: genai-svc-minio + monitoring: "true" spec: selector: app: ai-event-concepter-genai-svc-minio-selector diff --git a/helm/ai-event-concepter/templates/genai-svc-service.yaml b/helm/ai-event-concepter/templates/genai-svc-service.yaml index e546ad8..7eace8d 100644 --- a/helm/ai-event-concepter/templates/genai-svc-service.yaml +++ b/helm/ai-event-concepter/templates/genai-svc-service.yaml @@ -12,4 +12,5 @@ spec: - port: {{ .Values.genaisvc.service.port }} targetPort: {{ .Values.genaisvc.service.targetPort }} protocol: TCP + name: http type: {{ .Values.genaisvc.service.type }} diff --git a/helm/ai-event-concepter/templates/genai-svc-t2v-transformers-service.yaml b/helm/ai-event-concepter/templates/genai-svc-t2v-transformers-service.yaml index e00ad24..6647ee3 100644 --- a/helm/ai-event-concepter/templates/genai-svc-t2v-transformers-service.yaml +++ b/helm/ai-event-concepter/templates/genai-svc-t2v-transformers-service.yaml @@ -2,10 +2,14 @@ apiVersion: v1 kind: Service metadata: name: genai-svc-t2v-transformers + labels: + app: genai-svc-t2v-transformers + monitoring: "true" spec: selector: app: 
ai-event-concepter-genai-svc-t2v-transformers-selector ports: - port: {{ .Values.genaisvct2vtransformers.service.port }} targetPort: {{ .Values.genaisvct2vtransformers.service.targetPort }} + name: http type: {{ .Values.genaisvct2vtransformers.service.type }} \ No newline at end of file diff --git a/helm/ai-event-concepter/templates/genai-svc-weaviate-service.yaml b/helm/ai-event-concepter/templates/genai-svc-weaviate-service.yaml index fc28950..f7e7c2c 100644 --- a/helm/ai-event-concepter/templates/genai-svc-weaviate-service.yaml +++ b/helm/ai-event-concepter/templates/genai-svc-weaviate-service.yaml @@ -2,6 +2,9 @@ apiVersion: v1 kind: Service metadata: name: genai-svc-weaviate + labels: + app: genai-svc-weaviate + monitoring: "true" spec: selector: app: ai-event-concepter-genai-svc-weaviate-selector diff --git a/helm/ai-event-concepter/templates/network-policies.yaml b/helm/ai-event-concepter/templates/network-policies.yaml index 201d49a..eeb346c 100644 --- a/helm/ai-event-concepter/templates/network-policies.yaml +++ b/helm/ai-event-concepter/templates/network-policies.yaml @@ -9,21 +9,31 @@ spec: matchLabels: app: ai-event-concepter-client-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: [] # Allow ingress traffic from anywhere (for user access) + - from: [] # Allow ingress traffic from anywhere (for user access) + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - - to: - - podSelector: - matchLabels: - app: ai-event-concepter-gateway-selector - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-gateway-selector + ports: + - port: {{ .Values.gateway.service.targetPort }} + protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP 
--- # Network policy for gateway apiVersion: networking.k8s.io/v1 @@ -35,30 +45,52 @@ spec: matchLabels: app: ai-event-concepter-gateway-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-client-selector + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-client-selector + - from: [] # Allow access from ingress controller/external sources + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - - to: - - podSelector: - matchLabels: - app: ai-event-concepter-user-svc-selector - - podSelector: - matchLabels: - app: ai-event-concepter-concept-svc-selector - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-selector - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + # Allow communication with user-svc + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-user-svc-selector + ports: + - port: {{ .Values.usersvc.service.targetPort }} + protocol: TCP + # Allow communication with concept-svc + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-concept-svc-selector + ports: + - port: {{ .Values.conceptsvc.service.targetPort }} + protocol: TCP + # Allow communication with genai-svc + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-selector + ports: + - port: {{ .Values.genaisvc.service.targetPort }} + protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP --- # Network policy for genai-svc apiVersion: networking.k8s.io/v1 @@ -70,37 +102,52 @@ spec: matchLabels: app: ai-event-concepter-genai-svc-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-gateway-selector + - from: + - 
podSelector: + matchLabels: + app: ai-event-concepter-gateway-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - - to: - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-weaviate-selector - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-minio-selector - - podSelector: - matchLabels: - app: ai-event-concepter-concept-svc-selector - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP - # Allow outbound internet access for Hugging Face models - - to: [] - ports: - - port: 443 - protocol: TCP - - port: 80 - protocol: TCP + # Allow communication with weaviate + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-weaviate-selector + ports: + - port: {{ .Values.genaisvcweaviate.service.targetPort }} + protocol: TCP + - port: {{ .Values.genaisvcweaviate.service.grpcTargetPort }} + protocol: TCP + # Allow communication with minio + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-minio-selector + ports: + - port: {{ .Values.genaisvcminio.service.targetPort }} + protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + # Allow outbound internet access for Hugging Face models + - to: [] + ports: + - port: 443 + protocol: TCP + - port: 80 + protocol: TCP --- # Network policy for weaviate apiVersion: networking.k8s.io/v1 @@ -112,41 +159,35 @@ spec: matchLabels: app: ai-event-concepter-genai-svc-weaviate-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-selector + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: 
team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP - ports: - - port: 8080 - protocol: TCP - - port: 50051 - protocol: TCP - ports: - - port: 8080 - protocol: TCP - - port: 50051 - protocol: TCP - egress: - - to: - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-t2v-transformers-selector - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + # Allow communication with t2v-transformers + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-t2v-transformers-selector + ports: + - port: {{ .Values.genaisvct2vtransformers.service.targetPort }} + protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP --- # Network policy for t2v-transformers apiVersion: networking.k8s.io/v1 @@ -158,27 +199,27 @@ spec: matchLabels: app: ai-event-concepter-genai-svc-t2v-transformers-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-weaviate-selector + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-weaviate-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP - # Allow outbound internet access for Hugging Face models - - to: [] - ports: - - port: 443 - protocol: TCP - - port: 80 - protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP --- # Network policy for minio apiVersion: networking.k8s.io/v1 @@ -190,128 +231,151 @@ spec: matchLabels: app: ai-event-concepter-genai-svc-minio-selector 
policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-selector + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-genai-svc-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus + egress: + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP --- -# Network policy for concept-svc +# Network policy for user-svc apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: - name: concept-svc-network-policy + name: user-svc-network-policy spec: podSelector: matchLabels: - app: ai-event-concepter-concept-svc-selector + app: ai-event-concepter-user-svc-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-gateway-selector - - podSelector: - matchLabels: - app: ai-event-concepter-genai-svc-selector + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-gateway-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - - to: - - podSelector: - matchLabels: - app: ai-event-concepter-concept-svc-db-selector - - podSelector: - matchLabels: - app: ai-event-concepter-user-svc-db-selector - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-user-svc-db-selector + ports: + - port: {{ .Values.userdb.service.targetPort }} + protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP --- -# Network policy for concept-svc-db +# Network policy for concept-svc apiVersion: networking.k8s.io/v1 kind: NetworkPolicy 
metadata: - name: concept-svc-db-network-policy + name: concept-svc-network-policy spec: podSelector: matchLabels: - app: ai-event-concepter-concept-svc-db-selector + app: ai-event-concepter-concept-svc-selector policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-concept-svc-selector + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-gateway-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus egress: - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + - to: + - podSelector: + matchLabels: + app: ai-event-concepter-concept-svc-db-selector + ports: + - port: {{ .Values.conceptdb.service.targetPort }} + protocol: TCP + # Allow DNS resolution + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP --- -# Network policy for user-svc +# Network policy for user-svc-db apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: - name: user-svc-network-policy + name: user-svc-db-network-policy spec: podSelector: matchLabels: - app: ai-event-concepter-user-svc-selector + app: ai-event-concepter-user-svc-db-selector policyTypes: - - Ingress - - Egress + - Ingress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-gateway-selector - egress: - - to: - - podSelector: - matchLabels: - app: ai-event-concepter-user-svc-db-selector - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-user-svc-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus --- -# Network policy for user-svc-db +# Network policy for concept-svc-db 
apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: - name: user-svc-db-network-policy + name: concept-svc-db-network-policy spec: podSelector: matchLabels: - app: ai-event-concepter-user-svc-db-selector + app: ai-event-concepter-concept-svc-db-selector policyTypes: - - Ingress - - Egress + - Ingress ingress: - - from: - - podSelector: - matchLabels: - app: ai-event-concepter-user-svc-selector - - podSelector: - matchLabels: - app: ai-event-concepter-concept-svc-selector - egress: - # Allow DNS resolution - - ports: - - port: 53 - protocol: UDP - - port: 53 - protocol: TCP + - from: + - podSelector: + matchLabels: + app: ai-event-concepter-concept-svc-selector + - from: # Allow Prometheus monitoring + - namespaceSelector: + matchLabels: + name: team-git-push-force-monitor + podSelector: + matchLabels: + app.kubernetes.io/name: prometheus {{- end }} diff --git a/helm/ai-event-concepter/templates/user-svc-service.yaml b/helm/ai-event-concepter/templates/user-svc-service.yaml index 7845193..15f5414 100644 --- a/helm/ai-event-concepter/templates/user-svc-service.yaml +++ b/helm/ai-event-concepter/templates/user-svc-service.yaml @@ -12,4 +12,5 @@ spec: - port: {{ .Values.usersvc.service.port }} targetPort: {{ .Values.usersvc.service.targetPort }} protocol: TCP + name: http type: {{ .Values.usersvc.service.type }} diff --git a/helm/ai-event-concepter/values.yaml b/helm/ai-event-concepter/values.yaml index 8587ab5..2b43109 100644 --- a/helm/ai-event-concepter/values.yaml +++ b/helm/ai-event-concepter/values.yaml @@ -165,6 +165,8 @@ genaisvcweaviate: value: "http://genai-svc-t2v-transformers:8080" - name: MODULES_TRANSFORMERS_INFERENCE_API_TIMEOUT_SECONDS value: "300" + - name: PROMETHEUS_MONITORING_ENABLED + value: "true" persistence: enabled: false size: 1Gi @@ -224,6 +226,8 @@ genaisvcminio: value: "minioadmin" - name: MINIO_ROOT_PASSWORD value: "minioadmin" + - name: MINIO_PROMETHEUS_AUTH_TYPE + value: "public" persistence: enabled: false size: 10Gi diff 
--git a/helm/monitor/dashboards/sample-dashboard.json b/helm/monitor/dashboards/sample-dashboard.json index 2c1eb74..4c5ff6e 100644 --- a/helm/monitor/dashboards/sample-dashboard.json +++ b/helm/monitor/dashboards/sample-dashboard.json @@ -1,60 +1,117 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": "PBFA97CFB590B2093", + "enable": true, + "expr": "changes(kube_deployment_status_replicas_updated_total[1h])", + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Deployments", + "type": "query" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 20, + "links": [], + "liveNow": false, + "panels": [ { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "8.0.0" + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "title": "Service Health", + "type": "row" }, { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - } - ], - "id": null, - "title": "AI Event Concepter - Application Overview", - "uid": "ai-event-concepter", - "tags": [ - "ai-event-concepter", - "microservices", - "monitoring" - ], - "style": "dark", - "timezone": "browser", - "panels": [ - { + "datasource": "PBFA97CFB590B2093", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 1 + }, 
"id": 1, - "title": "Service Health Overview", - "type": "stat", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.5", "targets": [ { - "expr": "up", - "legendFormat": "{{job}}:{{instance}}", - "datasource": "Prometheus" + "expr": "min by(job)(up)", + "legendFormat": "{{job}}", + "refId": "A" } ], + "title": "Service Health Overview", + "type": "stat" + }, + { + "datasource": "PBFA97CFB590B2093", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, + "mappings": [], "thresholds": { + "mode": "absolute", "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "green", @@ -62,137 +119,1071 @@ } ] } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 5 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "expr": "min(up)", + "legendFormat": "healthy", + "refId": "A" } + ], + "title": "All Services Healthy?", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 }, + "id": 200, + "title": "Traffic & Errors", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 0 - } - }, - { + "y": 10 + }, + "hiddenSeries": false, "id": 2, - "title": "Request Rate", - "type": "graph", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": 
"null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum by (job, method, uri) (rate(http_server_requests_seconds_count[5m]))", - "legendFormat": "{{job}} - {{method}} {{uri}}", - "datasource": "Prometheus" + "expr": "sum by (job) (rate(http_server_requests_seconds_count[5m]))", + "legendFormat": "{{job}}", + "refId": "A" + }, + { + "expr": "sum by (job) (rate(flask_http_request_duration_seconds_count[5m]))", + "legendFormat": "{{job}}", + "refId": "B" + }, + { + "expr": "sum(rate(nginx_ingress_controller_requests{service=~\"client-svc.*\"}[5m]))", + "legendFormat": "client", + "refId": "C" } ], - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 + "thresholds": [], + "timeRegions": [], + "title": "Request Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } }, { - "id": 3, - "title": "Response Time (95th percentile)", - "type": "graph", + "datasource": "PBFA97CFB590B2093", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 99.9 + }, + { + "color": "green", + "value": 99.95 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 12, + "y": 10 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": 
false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.5", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le, job) (rate(http_server_requests_seconds_bucket[5m])))", - "legendFormat": "{{job}}", - "datasource": "Prometheus" + "expr": "(sum(rate(http_server_requests_seconds_count{status!~\"5..\"}[5m])) / sum(rate(http_server_requests_seconds_count[5m]))) * 100", + "legendFormat": "success_rate", + "refId": "A" } ], + "title": "HTTP Success Rate (%)", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, - "y": 8 - } - }, - { + "y": 18 + }, + "hiddenSeries": false, "id": 4, - "title": "Error Rate", - "type": "graph", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { "expr": "sum by (job) (rate(http_server_requests_seconds_count{status=~\"5..\"}[5m]))", "legendFormat": "{{job}} - 5xx errors", - "datasource": "Prometheus" + "refId": "A" }, { "expr": "sum by (job) (rate(http_server_requests_seconds_count{status=~\"4..\"}[5m]))", "legendFormat": "{{job}} - 4xx errors", - "datasource": "Prometheus" + "refId": "B" + }, + { + "expr": "sum by (job) (rate(flask_http_request_duration_seconds_count{status=~\"5..\"}[5m]))", + "legendFormat": "{{job}} - 5xx errors", + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + 
"show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true } ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 300, + "title": "Latency", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 8 + "w": 24, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, job) (rate(http_server_requests_seconds_bucket[5m])))", + "legendFormat": "{{job}}", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le, job) (rate(flask_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "{{job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Latency: Response Time (95th percentile)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } }, { - "id": 5, - "title": "Memory Usage", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 35 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum by (le, job) (rate(http_server_requests_seconds_bucket[5m])))", + "legendFormat": "P50 - {{job}}", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum by (le, job) (rate(http_server_requests_seconds_bucket[5m])))", + "legendFormat": "P99 - {{job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Latency: P50 & P99", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": "PBFA97CFB590B2093", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 43 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + 
"textMode": "auto" + }, + "pluginVersion": "10.1.5", "targets": [ { - "expr": "jvm_memory_used_bytes / jvm_memory_max_bytes * 100", - "legendFormat": "{{job}} - {{area}}", - "datasource": "Prometheus" + "expr": "max_over_time(histogram_quantile(0.99, sum by (le, job) (rate(http_server_requests_seconds_bucket[1m])))[5m:1m])", + "legendFormat": "max_p99", + "refId": "A" } ], + "title": "Max Observed Latency", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 400, + "title": "Resource Usage", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 48 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (jvm_memory_used_bytes)", + "legendFormat": "{{job}}", + "refId": "A" + }, + { + "expr": "sum by (job) (python_memory_bytes_used)", + "legendFormat": "{{job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } }, { - "id": 6, - "title": "CPU Usage", - "type": "graph", + "datasource": 
"PBFA97CFB590B2093", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 12, + "y": 48 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.5", "targets": [ { - "expr": "rate(process_cpu_seconds_total[5m]) * 100", - "legendFormat": "{{job}}", - "datasource": "Prometheus" + "expr": "100 * sum(container_memory_usage_bytes{container!=\"POD\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{container!=\"POD\"}) by (pod)", + "legendFormat": "mem_saturation", + "refId": "A" } ], + "title": "Memory Saturation (% of Limit)", + "type": "stat" + }, + { + "datasource": "PBFA97CFB590B2093", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 12, + "y": 52 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "expr": "100 * sum(rate(container_cpu_usage_seconds_total{container!=\"POD\"}[5m])) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{container!=\"POD\"}) by (pod)", + "legendFormat": 
"cpu_saturation", + "refId": "A" + } + ], + "title": "CPU Saturation (% of Limit)", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 16 + "x": 0, + "y": 56 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total[5m]) * 100", + "legendFormat": "{{job}}", + "refId": "A" + }, + { + "expr": "rate(python_cpu_seconds_total[5m]) * 100", + "legendFormat": "{{job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } }, { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 500, + "title": "Database", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 65 + }, + "hiddenSeries": false, "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": 
true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(pg_stat_activity_count{datname!~\"postgres|template0|template1\"}) by (datname, state)", + "legendFormat": "{{datname}} - {{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], "title": "Database Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, "type": "graph", - "targets": [ + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ { - "expr": "hikaricp_connections_active", - "legendFormat": "{{job}} - Active", - "datasource": "Prometheus" + "format": "short", + "logBase": 1, + "show": true }, { - "expr": "hikaricp_connections_idle", - "legendFormat": "{{job}} - Idle", - "datasource": "Prometheus" + "format": "short", + "logBase": 1, + "show": true } ], + "yaxis": { + "align": false + } + }, + { + "datasource": "PBFA97CFB590B2093", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 12, + "y": 65 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "expr": "sum(pg_stat_activity_count) / 100 * 100", + "legendFormat": "db_conn_saturation", + "refId": "A" + } + ], + "title": "DB Connection Saturation", + "type": 
"stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PBFA97CFB590B2093", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, - "y": 24 + "y": 73 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (datname) (rate(pg_stat_database_xact_commit{datname!~\"postgres|template0|template1\"}[5m]))", + "legendFormat": "{{datname}} - commits", + "refId": "A" + }, + { + "expr": "sum by (datname) (rate(pg_stat_database_xact_rollback{datname!~\"postgres|template0|template1\"}[5m]))", + "legendFormat": "{{datname}} - rollbacks", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Database Transaction Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } }, { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 81 + }, + "id": 600, + "title": "Service Version & Deploys", + "type": "row" + }, + { + "datasource": "PBFA97CFB590B2093", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 82 + }, "id": 8, - "title": "Service Version Overview", - "type": "table", "targets": [ { "expr": "app_version_info", @@ -200,27 +1191,111 @@ "refId": "A" } ], + "title": "Service Version Overview", "transformations": 
[ { "id": "labelsToFields", "options": {} } ], + "type": "table" + }, + { + "datasource": "PBFA97CFB590B2093", "fieldConfig": { - "defaults": {}, + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, "overrides": [] }, "gridPos": { "h": 8, - "w": 24, - "x": 0, - "y": 32 + "w": 12, + "x": 12, + "y": 73 + }, + "id": 1001, + "options": { + "alertThreshold": true + }, + "pluginVersion": "10.1.5", + "targets": [ + { + "expr": "avg_over_time(scrape_duration_seconds{job=\"postgres-exporter-concept-db\"}[5m])", + "legendFormat": "scrape duration", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Postgres Exporter Scrape Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } } ], + "refresh": false, + "schemaVersion": 38, + "style": "dark", + "tags": [ + "ai-event-concepter", + "microservices", + "monitoring" + ], + "templating": { + "list": [] + }, "time": { - "from": "now-1h", - "to": "now" + "from": "2025-07-19T22:10:58.545Z", + "to": "2025-07-20T00:10:58.547Z" }, - "refresh": "30s" + "timepicker": {}, + "timezone": "browser", + "title": "AI Event Concepter - Application Overview (Working Example)", + "uid": "ai-event-concepter-example", + "version": 1, + "weekStart": "" } \ No newline at end of file diff --git a/helm/monitor/templates/alertmanager-ingress.yaml b/helm/monitor/templates/alertmanager-ingress.yaml deleted file mode 100644 index 0534fe9..0000000 --- a/helm/monitor/templates/alertmanager-ingress.yaml +++ /dev/null @@ -1,29 +0,0 @@ -{{- 
if .Values.alertmanager.ingress.enabled -}} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: alertmanager-ingress - labels: - app: alertmanager - annotations: - {{- toYaml .Values.alertmanager.ingress.annotations | nindent 4 }} -spec: - ingressClassName: {{ .Values.alertmanager.ingress.className }} - {{- if .Values.alertmanager.ingress.tls }} - tls: - - hosts: - - {{ .Values.alertmanager.ingress.host }} - secretName: alertmanager-tls - {{- end }} - rules: - - host: {{ .Values.alertmanager.ingress.host }} - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: alertmanager - port: - number: 9093 -{{- end }} \ No newline at end of file diff --git a/helm/monitor/templates/database-exporters.yaml b/helm/monitor/templates/database-exporters.yaml new file mode 100644 index 0000000..491d50a --- /dev/null +++ b/helm/monitor/templates/database-exporters.yaml @@ -0,0 +1,136 @@ +{{- if .Values.postgresExporter.userDb.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres-exporter-user-db + namespace: {{ .Values.namespace.name }} + labels: + app: postgres-exporter-user-db +spec: + replicas: 1 + selector: + matchLabels: + app: postgres-exporter-user-db + template: + metadata: + labels: + app: postgres-exporter-user-db + spec: + containers: + - name: postgres-exporter + image: "{{ .Values.postgresExporter.userDb.image.repository }}:{{ .Values.postgresExporter.userDb.image.tag }}" + env: + - name: DATA_SOURCE_NAME + value: "postgresql://{{ .Values.postgresExporter.userDb.database.user }}:{{ .Values.postgresExporter.userDb.database.password }}@{{ .Values.postgresExporter.userDb.database.host }}:{{ .Values.postgresExporter.userDb.database.port }}/{{ .Values.postgresExporter.userDb.database.name }}?sslmode=disable" + ports: + - containerPort: 9187 + name: http + resources: + {{- toYaml .Values.postgresExporter.userDb.resources | nindent 10 }} +--- +apiVersion: v1 +kind: Service +metadata: + name: 
postgres-exporter-user-db + namespace: {{ .Values.namespace.name }} + labels: + app: postgres-exporter-user-db +spec: + ports: + - name: http + port: 9187 + targetPort: http + selector: + app: postgres-exporter-user-db +{{- end }} +--- +{{- if .Values.postgresExporter.conceptDb.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres-exporter-concept-db + namespace: {{ .Values.namespace.name }} + labels: + app: postgres-exporter-concept-db +spec: + replicas: 1 + selector: + matchLabels: + app: postgres-exporter-concept-db + template: + metadata: + labels: + app: postgres-exporter-concept-db + spec: + containers: + - name: postgres-exporter + image: "{{ .Values.postgresExporter.conceptDb.image.repository }}:{{ .Values.postgresExporter.conceptDb.image.tag }}" + env: + - name: DATA_SOURCE_NAME + value: "postgresql://{{ .Values.postgresExporter.conceptDb.database.user }}:{{ .Values.postgresExporter.conceptDb.database.password }}@{{ .Values.postgresExporter.conceptDb.database.host }}:{{ .Values.postgresExporter.conceptDb.database.port }}/{{ .Values.postgresExporter.conceptDb.database.name }}?sslmode=disable" + ports: + - containerPort: 9187 + name: http + resources: + {{- toYaml .Values.postgresExporter.conceptDb.resources | nindent 10 }} +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-exporter-concept-db + namespace: {{ .Values.namespace.name }} + labels: + app: postgres-exporter-concept-db +spec: + ports: + - name: http + port: 9187 + targetPort: http + selector: + app: postgres-exporter-concept-db +{{- end }} +--- +{{- if .Values.redisExporter.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-exporter + namespace: {{ .Values.namespace.name }} + labels: + app: redis-exporter +spec: + replicas: 1 + selector: + matchLabels: + app: redis-exporter + template: + metadata: + labels: + app: redis-exporter + spec: + containers: + - name: redis-exporter + image: "{{ .Values.redisExporter.image.repository }}:{{ 
.Values.redisExporter.image.tag }}" + args: + - "--redis.addr={{ .Values.redisExporter.redisAddress }}" + ports: + - containerPort: 9121 + name: http-metrics + resources: + {{- toYaml .Values.redisExporter.resources | nindent 10 }} +--- +apiVersion: v1 +kind: Service +metadata: + name: redis-exporter + namespace: {{ .Values.namespace.name }} + labels: + app: redis-exporter +spec: + ports: + - name: http-metrics + port: 9121 + targetPort: http-metrics + selector: + app: redis-exporter +{{- end }} \ No newline at end of file diff --git a/helm/monitor/templates/database-servicemonitors.yaml b/helm/monitor/templates/database-servicemonitors.yaml new file mode 100644 index 0000000..521e2e6 --- /dev/null +++ b/helm/monitor/templates/database-servicemonitors.yaml @@ -0,0 +1,55 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: postgres-exporter-user-db-service-monitor + labels: + app: prometheus +spec: + endpoints: + - port: http + interval: 15s + namespaceSelector: + matchNames: + - team-git-push-force-monitor + selector: + matchLabels: + app: postgres-exporter-user-db +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: postgres-exporter-concept-db-service-monitor + labels: + app: prometheus +spec: + endpoints: + - port: http + interval: 15s + namespaceSelector: + matchNames: + - team-git-push-force-monitor + selector: + matchLabels: + app: postgres-exporter-concept-db +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: redis-exporter-service-monitor + labels: + app: prometheus +spec: + endpoints: + - port: http-metrics + interval: 15s + relabelings: + - sourceLabels: [__meta_kubernetes_service_label_app] + targetLabel: service_name + - sourceLabels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + targetLabel: managed_by + namespaceSelector: + matchNames: + - team-git-push-force-monitor + selector: + matchLabels: + app: redis-exporter \ No newline at end of 
file diff --git a/helm/monitor/templates/grafana-infrastructure-dashboard.yaml b/helm/monitor/templates/grafana-config.yaml similarity index 52% rename from helm/monitor/templates/grafana-infrastructure-dashboard.yaml rename to helm/monitor/templates/grafana-config.yaml index 795edba..84d795b 100644 --- a/helm/monitor/templates/grafana-infrastructure-dashboard.yaml +++ b/helm/monitor/templates/grafana-config.yaml @@ -1,3 +1,254 @@ +# Grafana Datasource Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + labels: + app: grafana +data: + datasources.yaml: | + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true +--- +# Grafana Dashboard Provider Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards + labels: + app: grafana +data: + dashboards.yaml: | + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +--- +# Grafana Application Dashboard +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-sample-dashboard + labels: + grafana_dashboard: "1" +data: + ai-event-concepter-dashboard.json: | + { + "id": null, + "title": "AI Event Concepter - Application Overview", + "uid": "ai-event-concepter", + "tags": [ + "ai-event-concepter", + "microservices", + "monitoring" + ], + "style": "dark", + "timezone": "browser", + "panels": [ + { + "id": 1, + "title": "Service Health Overview", + "type": "stat", + "targets": [ + { + "expr": "up", + "legendFormat": "{{"{{"}}service{{"}}"}} ({{"{{"}}job{{"}}"}})", + "datasource": "Prometheus" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, 
+ "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "Request Rate", + "type": "graph", + "targets": [ + { + "expr": "sum by (service, method, uri) (rate(http_server_requests_seconds_count[5m]))", + "legendFormat": "{{"{{"}}service{{"}}"}} - {{"{{"}}method{{"}}"}} {{"{{"}}uri{{"}}"}}", + "datasource": "Prometheus" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + }, + { + "id": 3, + "title": "Response Time (95th percentile)", + "type": "graph", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, service) (rate(http_server_requests_seconds_bucket[5m])))", + "legendFormat": "{{"{{"}}service{{"}}"}}", + "datasource": "Prometheus" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + } + }, + { + "id": 4, + "title": "Error Rate", + "type": "graph", + "targets": [ + { + "expr": "sum by (service) (rate(http_server_requests_seconds_count{status=~\"5..\"}[5m]))", + "legendFormat": "{{"{{"}}service{{"}}"}} - 5xx errors", + "datasource": "Prometheus" + }, + { + "expr": "sum by (service) (rate(http_server_requests_seconds_count{status=~\"4..\"}[5m]))", + "legendFormat": "{{"{{"}}service{{"}}"}} - 4xx errors", + "datasource": "Prometheus" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + } + }, + { + "id": 5, + "title": "Memory Usage", + "type": "graph", + "targets": [ + { + "expr": "jvm_memory_used_bytes / jvm_memory_max_bytes * 100", + "legendFormat": "{{"{{"}}service{{"}}"}} - {{"{{"}}area{{"}}"}}", + "datasource": "Prometheus" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + } + }, + { + "id": 6, + "title": "CPU Usage", + "type": "graph", + "targets": [ + { + "expr": "rate(process_cpu_seconds_total[5m]) * 100", + "legendFormat": "{{"{{"}}service{{"}}"}}", + "datasource": "Prometheus" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + } + }, + { + "id": 7, + "title": "Database Connections", + "type": "graph", + "targets": [ + { + "expr": "hikaricp_connections_active", + 
"legendFormat": "{{"{{"}}service{{"}}"}} - Active", + "datasource": "Prometheus" + }, + { + "expr": "hikaricp_connections_idle", + "legendFormat": "{{"{{"}}service{{"}}"}} - Idle", + "datasource": "Prometheus" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + } + }, + { + "id": 8, + "title": "Service Version Overview", + "type": "table", + "targets": [ + { + "expr": "app_version_info", + "format": "table", + "refId": "A" + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + } + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "30s" + } +--- +# Grafana Infrastructure Dashboard apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/monitor/templates/grafana-dashboards-configmap.yaml b/helm/monitor/templates/grafana-dashboards-configmap.yaml deleted file mode 100644 index 1b046d3..0000000 --- a/helm/monitor/templates/grafana-dashboards-configmap.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboards - labels: - app: grafana -data: - dashboards.yaml: | - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/helm/monitor/templates/grafana-datasource-configmap.yaml b/helm/monitor/templates/grafana-datasource-configmap.yaml deleted file mode 100644 index 6e13dc5..0000000 --- a/helm/monitor/templates/grafana-datasource-configmap.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-datasources - labels: - app: grafana -data: - datasources.yaml: | - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true \ No newline at end 
of file diff --git a/helm/monitor/templates/grafana-ingress.yaml b/helm/monitor/templates/grafana-ingress.yaml deleted file mode 100644 index 6b6c373..0000000 --- a/helm/monitor/templates/grafana-ingress.yaml +++ /dev/null @@ -1,32 +0,0 @@ -{{- if .Values.ingress.enabled -}} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: grafana-ingress - {{- $annotations := .Values.ingress.annotations | default dict }} - {{- if $annotations }} - annotations: - {{- toYaml $annotations | nindent 4 }} - {{- end }} -spec: - {{- if .Values.ingress.className }} - ingressClassName: {{ .Values.ingress.className }} - {{- end }} - {{- if .Values.ingress.tls }} - tls: - - hosts: - - "{{ .Values.ingress.host }}" - secretName: "grafana-tls" - {{- end }} - rules: - - host: "{{ .Values.ingress.host }}" - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: grafana - port: - number: 3000 -{{- end }} \ No newline at end of file diff --git a/helm/monitor/templates/grafana-rbac.yaml b/helm/monitor/templates/grafana-rbac.yaml deleted file mode 100644 index 5ed08ef..0000000 --- a/helm/monitor/templates/grafana-rbac.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: grafana - labels: - app: grafana \ No newline at end of file diff --git a/helm/monitor/templates/grafana-sample-dashboard-configmap.yaml b/helm/monitor/templates/grafana-sample-dashboard-configmap.yaml deleted file mode 100644 index 65b867c..0000000 --- a/helm/monitor/templates/grafana-sample-dashboard-configmap.yaml +++ /dev/null @@ -1,210 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-sample-dashboard - labels: - grafana_dashboard: "1" -data: - ai-event-concepter-dashboard.json: | - { - "id": null, - "title": "AI Event Concepter - Application Overview", - "uid": "ai-event-concepter", - "tags": [ - "ai-event-concepter", - "microservices", - "monitoring" - ], - "style": "dark", - "timezone": "browser", - "panels": [ - { - "id": 1, - 
"title": "Service Health Overview", - "type": "stat", - "targets": [ - { - "expr": "up", - "legendFormat": "{{"{{"}}job{{"}}"}}:{{"{{"}}instance{{"}}"}}", - "datasource": "Prometheus" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "red", - "value": 0 - }, - { - "color": "green", - "value": 1 - } - ] - } - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - } - }, - { - "id": 2, - "title": "Request Rate", - "type": "graph", - "targets": [ - { - "expr": "sum by (job, method, uri) (rate(http_server_requests_seconds_count[5m]))", - "legendFormat": "{{"{{"}}job{{"}}"}} - {{"{{"}}method{{"}}"}} {{"{{"}}uri{{"}}"}}", - "datasource": "Prometheus" - } - ], - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - } - }, - { - "id": 3, - "title": "Response Time (95th percentile)", - "type": "graph", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, job) (rate(http_server_requests_seconds_bucket[5m])))", - "legendFormat": "{{"{{"}}job{{"}}"}}", - "datasource": "Prometheus" - } - ], - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - } - }, - { - "id": 4, - "title": "Error Rate", - "type": "graph", - "targets": [ - { - "expr": "sum by (job) (rate(http_server_requests_seconds_count{status=~\"5..\"}[5m]))", - "legendFormat": "{{"{{"}}job{{"}}"}} - 5xx errors", - "datasource": "Prometheus" - }, - { - "expr": "sum by (job) (rate(http_server_requests_seconds_count{status=~\"4..\"}[5m]))", - "legendFormat": "{{"{{"}}job{{"}}"}} - 4xx errors", - "datasource": "Prometheus" - } - ], - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - } - }, - { - "id": 5, - "title": "Memory Usage", - "type": "graph", - "targets": [ - { - "expr": "jvm_memory_used_bytes / jvm_memory_max_bytes * 100", - "legendFormat": "{{"{{"}}job{{"}}"}} - {{"{{"}}area{{"}}"}}", - "datasource": "Prometheus" - } - ], - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - } - }, - { - "id": 6, - 
"title": "CPU Usage", - "type": "graph", - "targets": [ - { - "expr": "rate(process_cpu_seconds_total[5m]) * 100", - "legendFormat": "{{"{{"}}job{{"}}"}}", - "datasource": "Prometheus" - } - ], - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - } - }, - { - "id": 7, - "title": "Database Connections", - "type": "graph", - "targets": [ - { - "expr": "hikaricp_connections_active", - "legendFormat": "{{"{{"}}job{{"}}"}} - Active", - "datasource": "Prometheus" - }, - { - "expr": "hikaricp_connections_idle", - "legendFormat": "{{"{{"}}job{{"}}"}} - Idle", - "datasource": "Prometheus" - } - ], - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - } - }, - { - "id": 8, - "title": "Service Version Overview", - "type": "table", - "targets": [ - { - "expr": "app_version_info", - "format": "table", - "refId": "A" - } - ], - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ], - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 32 - } - } - ], - "time": { - "from": "now-1h", - "to": "now" - }, - "refresh": "30s" - } \ No newline at end of file diff --git a/helm/monitor/templates/grafana-service.yaml b/helm/monitor/templates/grafana-service.yaml deleted file mode 100644 index b7713a1..0000000 --- a/helm/monitor/templates/grafana-service.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: grafana - labels: - app: grafana - {{- include "monitoring.labels" . 
| nindent 4 }} -spec: - type: ClusterIP - ports: - - port: {{ .Values.grafana.service.port }} - targetPort: {{ .Values.grafana.service.port }} - protocol: TCP - name: http - selector: - app: grafana \ No newline at end of file diff --git a/helm/monitor/templates/grafana-deployment.yaml b/helm/monitor/templates/grafana.yaml similarity index 59% rename from helm/monitor/templates/grafana-deployment.yaml rename to helm/monitor/templates/grafana.yaml index 969c9d6..da6ea35 100644 --- a/helm/monitor/templates/grafana-deployment.yaml +++ b/helm/monitor/templates/grafana.yaml @@ -1,3 +1,13 @@ +# Grafana ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana + labels: + app: grafana + {{- include "monitoring.labels" . | nindent 4 }} +--- +# Grafana Deployment apiVersion: apps/v1 kind: Deployment metadata: @@ -54,6 +64,7 @@ spec: name: grafana-dashboards {{- if .Values.grafana.persistence.enabled }} --- +# Grafana PVC apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -69,4 +80,56 @@ spec: {{- if .Values.grafana.persistence.storageClassName }} storageClassName: {{ .Values.grafana.persistence.storageClassName }} {{- end }} +{{- end }} +--- +# Grafana Service +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app: grafana + {{- include "monitoring.labels" . 
| nindent 4 }} +spec: + type: ClusterIP + ports: + - port: {{ .Values.grafana.service.port }} + targetPort: {{ .Values.grafana.service.port }} + protocol: TCP + name: http + selector: + app: grafana +{{- if .Values.ingress.enabled }} +--- +# Grafana Ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana-ingress + {{- $annotations := .Values.ingress.annotations | default dict }} + {{- if $annotations }} + annotations: + {{- toYaml $annotations | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + - hosts: + - "{{ .Values.ingress.host }}" + secretName: "grafana-tls" + {{- end }} + rules: + - host: "{{ .Values.ingress.host }}" + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + number: 3000 {{- end }} \ No newline at end of file diff --git a/helm/monitor/templates/prometheus-ingress.yaml b/helm/monitor/templates/ingress.yaml similarity index 51% rename from helm/monitor/templates/prometheus-ingress.yaml rename to helm/monitor/templates/ingress.yaml index 355f88e..f79ef3d 100644 --- a/helm/monitor/templates/prometheus-ingress.yaml +++ b/helm/monitor/templates/ingress.yaml @@ -1,4 +1,6 @@ -{{- if .Values.prometheus.ingress.enabled -}} +{{- if .Values.prometheus.ingress.enabled }} +--- +# Prometheus Ingress apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -31,4 +33,35 @@ spec: name: prometheus port: number: 9090 -{{- end }} \ No newline at end of file +{{- end }} +{{- if .Values.alertmanager.ingress.enabled }} +--- +# Alertmanager Ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: alertmanager-ingress + labels: + app: alertmanager + annotations: + {{- toYaml .Values.alertmanager.ingress.annotations | nindent 4 }} +spec: + ingressClassName: {{ .Values.alertmanager.ingress.className }} + {{- if .Values.alertmanager.ingress.tls }} + tls: + - 
hosts: + - {{ .Values.alertmanager.ingress.host }} + secretName: alertmanager-tls + {{- end }} + rules: + - host: {{ .Values.alertmanager.ingress.host }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: alertmanager + port: + number: 9093 +{{- end }} \ No newline at end of file diff --git a/helm/monitor/templates/monitoring-network-policies.yaml b/helm/monitor/templates/monitoring-network-policies.yaml deleted file mode 100644 index 937d3e3..0000000 --- a/helm/monitor/templates/monitoring-network-policies.yaml +++ /dev/null @@ -1,51 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: allow-monitoring-access - namespace: team-git-push-force-dev -spec: - podSelector: {} # Apply to all pods in the namespace - policyTypes: - - Ingress - ingress: - - from: - - namespaceSelector: - matchLabels: - name: team-git-push-force-monitor - ports: - - protocol: TCP - port: 8080 # Spring Boot Actuator - - protocol: TCP - port: 8083 # GenAI service - - protocol: TCP - port: 5432 # PostgreSQL ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: allow-prometheus-egress - namespace: team-git-push-force-monitor -spec: - podSelector: - matchLabels: - app: prometheus - policyTypes: - - Egress - egress: - - to: - - namespaceSelector: - matchLabels: - name: team-git-push-force-dev - ports: - - protocol: TCP - port: 8080 # Spring Boot services - - protocol: TCP - port: 8083 # GenAI service - - protocol: TCP - port: 5432 # PostgreSQL - - to: [] # Allow DNS resolution and other necessary egress - ports: - - protocol: UDP - port: 53 - - protocol: TCP - port: 53 \ No newline at end of file diff --git a/helm/monitor/templates/node-exporter-service.yaml b/helm/monitor/templates/node-exporter-service.yaml new file mode 100644 index 0000000..ff1f3ab --- /dev/null +++ b/helm/monitor/templates/node-exporter-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: 
team-git-push-force-monitor + labels: + app: node-exporter + monitoring: "true" +spec: + selector: + app: node-exporter + ports: + - name: metrics + port: 9100 + targetPort: 9100 + type: ClusterIP \ No newline at end of file diff --git a/helm/monitor/templates/postgres-exporter.yaml b/helm/monitor/templates/postgres-exporter.yaml deleted file mode 100644 index 8130133..0000000 --- a/helm/monitor/templates/postgres-exporter.yaml +++ /dev/null @@ -1,70 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: postgres-exporter - labels: - app: postgres-exporter -spec: - type: ClusterIP - ports: - - port: 9187 - targetPort: 9187 - protocol: TCP - name: metrics - selector: - app: postgres-exporter ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: postgres-exporter -spec: - replicas: 1 - selector: - matchLabels: - app: postgres-exporter - template: - metadata: - labels: - app: postgres-exporter - spec: - containers: - - name: postgres-exporter - image: prometheuscommunity/postgres-exporter:v0.15.0 - args: - - "--extend.query-path=/etc/postgres_exporter/queries.yaml" - env: - - name: DATA_SOURCE_NAME - value: "postgresql://postgres:password@postgres:5432/postgres?sslmode=disable" - ports: - - containerPort: 9187 - volumeMounts: - - name: postgres-exporter-config - mountPath: /etc/postgres_exporter/ - resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "256Mi" - cpu: "200m" - volumes: - - name: postgres-exporter-config - configMap: - name: postgres-exporter-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: postgres-exporter-config - labels: - app: postgres-exporter -data: - queries.yaml: | - pg_replication: - query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag" - master: true - metrics: - - lag: - usage: "GAUGE" - description: "Replication lag behind master in seconds" \ No newline at end of file diff --git 
a/helm/monitor/templates/probe-client-svc.yaml b/helm/monitor/templates/probe-client-svc.yaml new file mode 100644 index 0000000..94de03b --- /dev/null +++ b/helm/monitor/templates/probe-client-svc.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + name: client-svc + namespace: team-git-push-force-monitor +spec: + jobName: client-svc + interval: 30s + module: http_2xx + prober: + url: blackbox-exporter-prometheus-blackbox-exporter.team-git-push-force-monitor.svc.cluster.local:9115 + targets: + staticConfig: + static: + - http://client-svc.team-git-push-force-dev.svc.cluster.local/auth/login \ No newline at end of file diff --git a/helm/monitor/templates/probe-genai-t2v.yaml b/helm/monitor/templates/probe-genai-t2v.yaml new file mode 100644 index 0000000..9fc4ceb --- /dev/null +++ b/helm/monitor/templates/probe-genai-t2v.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + name: genai-svc-t2v-transformers + namespace: team-git-push-force-monitor +spec: + jobName: genai-svc-t2v-transformers + interval: 30s + module: http_2xx + prober: + url: blackbox-exporter-prometheus-blackbox-exporter.team-git-push-force-monitor.svc.cluster.local:9115 + targets: + staticConfig: + static: + - http://genai-svc-t2v-transformers.team-git-push-force-dev.svc.cluster.local/ \ No newline at end of file diff --git a/helm/monitor/templates/probe-genai-weaviate.yaml b/helm/monitor/templates/probe-genai-weaviate.yaml new file mode 100644 index 0000000..758e308 --- /dev/null +++ b/helm/monitor/templates/probe-genai-weaviate.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + name: genai-svc-weaviate + namespace: team-git-push-force-monitor +spec: + jobName: genai-svc-weaviate + interval: 30s + module: http_2xx + prober: + url: blackbox-exporter-prometheus-blackbox-exporter.team-git-push-force-monitor.svc.cluster.local:9115 + targets: + staticConfig: + static: + - 
http://genai-svc-weaviate.team-git-push-force-dev.svc.cluster.local/v1/meta \ No newline at end of file diff --git a/helm/monitor/templates/prometheus-configmap.yaml b/helm/monitor/templates/prometheus-configmap.yaml index e3ceb99..113f442 100644 --- a/helm/monitor/templates/prometheus-configmap.yaml +++ b/helm/monitor/templates/prometheus-configmap.yaml @@ -16,73 +16,15 @@ data: rule_files: - "rules/*.yml" + # ServiceMonitor discovery is handled automatically by the Prometheus Operator + # Do not add manual scrape_configs for services here. scrape_configs: - # Prometheus itself - job_name: 'prometheus' static_configs: - targets: ['localhost:9090'] metrics_path: '/metrics' scrape_interval: 15s - # Spring Boot Services with Actuator endpoints - using FQDNs for cross-namespace access - - job_name: 'spring-boot-services' - metrics_path: '/actuator/prometheus' - scrape_interval: 15s - static_configs: - - targets: - - 'gateway.team-git-push-force-dev.svc.cluster.local:8080' - - 'user-svc.team-git-push-force-dev.svc.cluster.local:8080' - - 'concept-svc.team-git-push-force-dev.svc.cluster.local:8080' - relabel_configs: - - source_labels: [__address__] - target_label: instance - - source_labels: [__address__] - regex: '([^:]+)\..*' - target_label: service - - source_labels: [__address__] - regex: '([^:]+)\..*' - target_label: app - - target_label: monitoring - replacement: "true" - - target_label: environment - replacement: "development" - - # Node Exporter for system metrics - - job_name: 'node-exporter' - static_configs: - - targets: ['node-exporter:9100'] - relabel_configs: - - target_label: monitoring - replacement: "true" - - {{- if .Values.cadvisor.enabled }} - # cAdvisor for container metrics - - job_name: 'cadvisor' - static_configs: - - targets: ['cadvisor:8080'] - relabel_configs: - - target_label: monitoring - replacement: "true" - {{- end }} - - # PostgreSQL metrics (if using postgres_exporter) - - job_name: 'postgres' - static_configs: - - targets: 
['postgres-exporter:9187'] - relabel_configs: - - target_label: monitoring - replacement: "true" - - # GenAI Service (Flask) - using FQDN for cross-namespace access - - job_name: 'genai-svc' - metrics_path: /metrics - static_configs: - - targets: ['genai-svc.team-git-push-force-dev.svc.cluster.local:8083'] - relabel_configs: - - target_label: monitoring - replacement: "true" - - # Alertmanager configuration alerting: alertmanagers: - static_configs: diff --git a/helm/monitor/templates/prometheus-deployment.yaml b/helm/monitor/templates/prometheus-deployment.yaml deleted file mode 100644 index c7a7866..0000000 --- a/helm/monitor/templates/prometheus-deployment.yaml +++ /dev/null @@ -1,326 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: prometheus - labels: - app: prometheus -spec: - type: ClusterIP - ports: - - port: 9090 - targetPort: 9090 - protocol: TCP - name: web - selector: - app: prometheus ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app: prometheus - template: - metadata: - labels: - app: prometheus - spec: - securityContext: - runAsUser: 0 - runAsGroup: 0 - fsGroup: 0 - containers: - - name: prometheus - image: prom/prometheus:v2.52.0 - args: - - "--config.file=/etc/prometheus/prometheus.yml" - - "--storage.tsdb.path=/prometheus/" - - "--web.enable-lifecycle" - - "--storage.tsdb.retention.time=15d" - - "--storage.tsdb.retention.size=10GB" - ports: - - containerPort: 9090 - volumeMounts: - - name: prometheus-config - mountPath: /etc/prometheus/ - - name: prometheus-rules - mountPath: /etc/prometheus/rules/ - {{- if .Values.prometheus.persistence.enabled }} - - name: prometheus-storage - mountPath: /prometheus/ - {{- end }} - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "1Gi" - cpu: "500m" - volumes: - - name: prometheus-config - configMap: - name: prometheus-config - - name: prometheus-rules - configMap: - name: prometheus-rules - {{- if 
.Values.prometheus.persistence.enabled }} - - name: prometheus-storage - persistentVolumeClaim: - claimName: prometheus-pvc - {{- end }} -{{- if .Values.prometheus.persistence.enabled }} ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: prometheus-pvc -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: {{ .Values.prometheus.persistence.size }} - {{- if .Values.prometheus.persistence.storageClassName }} - storageClassName: {{ .Values.prometheus.persistence.storageClassName }} - {{- end }} -{{- end }} ---- -# Node Exporter for system metrics -apiVersion: v1 -kind: Service -metadata: - name: node-exporter - labels: - app: node-exporter -spec: - type: ClusterIP - ports: - - port: 9100 - targetPort: 9100 - protocol: TCP - name: metrics - selector: - app: node-exporter ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: node-exporter -spec: - selector: - matchLabels: - app: node-exporter - template: - metadata: - labels: - app: node-exporter - spec: - containers: - - name: node-exporter - image: prom/node-exporter:v1.6.1 - args: - - "--path.procfs=/host/proc" - - "--path.sysfs=/host/sys" - - "--path.rootfs=/host/root" - - "--web.listen-address=:9100" - ports: - - containerPort: 9100 - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: sys - mountPath: /host/sys - readOnly: true - - name: root - mountPath: /host/root - readOnly: true - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / -{{- if .Values.cadvisor.enabled }} ---- -# cAdvisor for container metrics -apiVersion: v1 -kind: Service -metadata: - name: cadvisor - labels: - app: cadvisor -spec: - type: ClusterIP - ports: - - port: 8080 - targetPort: 8080 - protocol: TCP - name: metrics - selector: - app: cadvisor ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: cadvisor -spec: - selector: - matchLabels: - app: cadvisor - template: - metadata: - 
labels: - app: cadvisor - spec: - containers: - - name: cadvisor - image: gcr.io/cadvisor/cadvisor:v0.47.2 - args: - - "--logtostderr" - - "--docker_only" - - "--metrics_only" - ports: - - containerPort: 8080 - volumeMounts: - - name: rootfs - mountPath: /rootfs - readOnly: true - - name: var-run - mountPath: /var/run - readOnly: true - - name: sys - mountPath: /sys - readOnly: true - - name: var-lib-docker - mountPath: /var/lib/docker - readOnly: true - - name: devicemapper - mountPath: /dev/disk - readOnly: true - volumes: - - name: rootfs - hostPath: - path: / - - name: var-run - hostPath: - path: /var/run - - name: sys - hostPath: - path: /sys - - name: var-lib-docker - hostPath: - path: /var/lib/docker - - name: devicemapper - hostPath: - path: /dev/disk -{{- end }} ---- -# Alertmanager -apiVersion: v1 -kind: Service -metadata: - name: alertmanager - labels: - app: alertmanager -spec: - type: ClusterIP - ports: - - port: 9093 - targetPort: 9093 - protocol: TCP - name: web - selector: - app: alertmanager ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: alertmanager -spec: - replicas: 1 - selector: - matchLabels: - app: alertmanager - template: - metadata: - labels: - app: alertmanager - spec: - containers: - - name: alertmanager - image: prom/alertmanager:v0.26.0 - args: - - "--config.file=/etc/alertmanager/alertmanager.yml" - - "--storage.path=/alertmanager/" - ports: - - containerPort: 9093 - volumeMounts: - - name: alertmanager-config - mountPath: /etc/alertmanager/ - {{- if .Values.alertmanager.persistence.enabled }} - - name: alertmanager-storage - mountPath: /alertmanager/ - {{- end }} - resources: - requests: - memory: "128Mi" - cpu: "50m" - limits: - memory: "512Mi" - cpu: "200m" - volumes: - - name: alertmanager-config - configMap: - name: alertmanager-config - {{- if .Values.alertmanager.persistence.enabled }} - - name: alertmanager-storage - persistentVolumeClaim: - claimName: alertmanager-pvc - {{- end }} -{{- if 
.Values.alertmanager.persistence.enabled }} ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: alertmanager-pvc -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: {{ .Values.alertmanager.persistence.size }} - {{- if .Values.alertmanager.persistence.storageClassName }} - storageClassName: {{ .Values.alertmanager.persistence.storageClassName }} - {{- end }} -{{- end }} ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: alertmanager-config - labels: - app: alertmanager -data: - alertmanager.yml: | - global: - resolve_timeout: 5m - - route: - group_by: ['alertname'] - group_wait: 10s - group_interval: 10s - repeat_interval: 1h - receiver: 'web.hook' - - receivers: - - name: 'web.hook' - webhook_configs: - - url: 'http://127.0.0.1:5001/' \ No newline at end of file diff --git a/helm/monitor/templates/prometheus-operator.yaml b/helm/monitor/templates/prometheus-operator.yaml new file mode 100644 index 0000000..ec52fb7 --- /dev/null +++ b/helm/monitor/templates/prometheus-operator.yaml @@ -0,0 +1,111 @@ +# Prometheus Custom Resource for Prometheus Operator +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus + namespace: team-git-push-force-monitor + labels: + {{- include "monitoring.labels" . 
| nindent 4 }} +spec: + serviceAccountName: prometheus + serviceMonitorSelector: {} + serviceMonitorNamespaceSelector: {} + ruleSelector: {} + ruleNamespaceSelector: {} + probeSelector: {} + probeNamespaceSelector: {} + podMonitorSelector: {} + podMonitorNamespaceSelector: {} + alertmanagerSelector: {} + alertmanagerNamespaceSelector: {} + + alerting: + alertmanagers: + - namespace: team-git-push-force-monitor + name: alertmanager + port: web + resources: + requests: + memory: 256Mi + cpu: 100m + limits: + memory: 1Gi + cpu: 500m + retention: 15d + storage: + volumeClaimTemplate: + spec: + {{- if .Values.prometheus.persistence.storageClassName }} + storageClassName: {{ .Values.prometheus.persistence.storageClassName }} + {{- end }} + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: {{ .Values.prometheus.persistence.size | default "10Gi" }} + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + version: v2.52.0 +--- +# PrometheusRule for alerting rules +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: prometheus-rules + namespace: team-git-push-force-monitor + labels: + app: prometheus + {{- include "monitoring.labels" . | nindent 4 }} +spec: + groups: + - name: ai-event-concepter + rules: + # Service availability alerts + - alert: ServiceDown + expr: up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Service {{`{{`}} $labels.instance {{`}}`}} is down" + description: "Service {{`{{`}} $labels.instance {{`}}`}} has been down for more than 1 minute." 
+ + # High memory usage + - alert: HighMemoryUsage + expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{`{{`}} $labels.instance {{`}}`}}" + description: "Memory usage is above 85% on {{`{{`}} $labels.instance {{`}}`}}" + + # High CPU usage + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{`{{`}} $labels.instance {{`}}`}}" + description: "CPU usage is above 80% on {{`{{`}} $labels.instance {{`}}`}}" + + # Spring Boot specific alerts + - alert: SpringBootHighErrorRate + expr: rate(http_server_requests_seconds_count{status=~"5.."}[5m]) > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "High error rate in Spring Boot service {{`{{`}} $labels.service {{`}}`}}" + description: "Error rate is above 0.1 errors per second in {{`{{`}} $labels.service {{`}}`}}" + + - alert: SpringBootHighResponseTime + expr: histogram_quantile(0.95, rate(http_server_requests_seconds_bucket[5m])) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "High response time in Spring Boot service {{`{{`}} $labels.service {{`}}`}}" + description: "95th percentile response time is above 2 seconds in {{`{{`}} $labels.service {{`}}`}}" \ No newline at end of file diff --git a/helm/monitor/templates/prometheus-rules-configmap.yaml b/helm/monitor/templates/prometheus-rules-configmap.yaml deleted file mode 100644 index 9aca8f9..0000000 --- a/helm/monitor/templates/prometheus-rules-configmap.yaml +++ /dev/null @@ -1,89 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-rules - labels: - app: prometheus -data: - rules.yml: | - groups: - - name: ai-event-concepter - rules: - # Service availability alerts - - alert: ServiceDown - expr: up == 0 - for: 1m - 
labels: - severity: critical - annotations: - summary: "Service {{`{{`}} $labels.instance {{`}}`}} is down" - description: "Service {{`{{`}} $labels.instance {{`}}`}} has been down for more than 1 minute." - - # High memory usage - - alert: HighMemoryUsage - expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85 - for: 5m - labels: - severity: warning - annotations: - summary: "High memory usage on {{`{{`}} $labels.instance {{`}}`}}" - description: "Memory usage is above 85% on {{`{{`}} $labels.instance {{`}}`}}" - - # High CPU usage - - alert: HighCPUUsage - expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 - for: 5m - labels: - severity: warning - annotations: - summary: "High CPU usage on {{`{{`}} $labels.instance {{`}}`}}" - description: "CPU usage is above 80% on {{`{{`}} $labels.instance {{`}}`}}" - - # Disk space running out - - alert: DiskSpaceFilling - expr: (node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Disk space filling up on {{`{{`}} $labels.instance {{`}}`}}" - description: "Disk usage is above 90% on {{`{{`}} $labels.instance {{`}}`}}" - - # Spring Boot specific alerts - - alert: SpringBootHighErrorRate - expr: rate(http_server_requests_seconds_count{status=~"5.."}[5m]) > 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: "High error rate in Spring Boot service {{`{{`}} $labels.service {{`}}`}}" - description: "Error rate is above 0.1 errors per second in {{`{{`}} $labels.service {{`}}`}}" - - - alert: SpringBootHighResponseTime - expr: histogram_quantile(0.95, rate(http_server_requests_seconds_bucket[5m])) > 2 - for: 5m - labels: - severity: warning - annotations: - summary: "High response time in Spring Boot service {{`{{`}} $labels.service {{`}}`}}" - description: "95th percentile response time is above 2 seconds in 
{{`{{`}} $labels.service {{`}}`}}" - - # Database alerts - - alert: PostgresHighConnections - expr: pg_stat_database_numbackends > 80 - for: 5m - labels: - severity: warning - annotations: - summary: "High number of PostgreSQL connections" - description: "PostgreSQL has more than 80 active connections" - - # Container alerts - - alert: ContainerHighMemoryUsage - expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes * 100) > 85 - for: 5m - labels: - severity: warning - annotations: - summary: "High memory usage in container {{`{{`}} $labels.name {{`}}`}}" - description: "Container {{`{{`}} $labels.name {{`}}`}} is using more than 85% of its memory limit" \ No newline at end of file diff --git a/helm/monitor/templates/prometheus-service.yaml b/helm/monitor/templates/prometheus-service.yaml index 47f74c0..74836a1 100644 --- a/helm/monitor/templates/prometheus-service.yaml +++ b/helm/monitor/templates/prometheus-service.yaml @@ -2,15 +2,18 @@ apiVersion: v1 kind: Service metadata: name: prometheus + namespace: {{ .Release.Namespace }} labels: app: prometheus - {{- include "monitoring.labels" . | nindent 4 }} + app.kubernetes.io/name: prometheus + app.kubernetes.io/component: prometheus spec: type: ClusterIP ports: - - port: 9090 - targetPort: 9090 - protocol: TCP - name: web + - name: web + port: 9090 + targetPort: 9090 + protocol: TCP selector: - app: prometheus \ No newline at end of file + app.kubernetes.io/name: prometheus + prometheus: prometheus \ No newline at end of file diff --git a/helm/monitor/templates/rbac.yaml b/helm/monitor/templates/rbac.yaml new file mode 100644 index 0000000..ce4d578 --- /dev/null +++ b/helm/monitor/templates/rbac.yaml @@ -0,0 +1,88 @@ +# Role for Prometheus service discovery in monitoring namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-discovery-monitor + namespace: team-git-push-force-monitor + labels: + {{- include "monitoring.labels" . 
| nindent 4 }} +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: ["monitoring.coreos.com"] + resources: + - servicemonitors + - podmonitors + verbs: ["get", "list", "watch"] +--- +# Role for Prometheus service discovery in application namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-discovery-app + namespace: team-git-push-force-dev + labels: + {{- include "monitoring.labels" . | nindent 4 }} +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: ["monitoring.coreos.com"] + resources: + - servicemonitors + - podmonitors + verbs: ["get", "list", "watch"] +--- +# RoleBinding for Prometheus in monitoring namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-discovery-monitor + namespace: team-git-push-force-monitor + labels: + {{- include "monitoring.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-discovery-monitor +subjects: +- kind: ServiceAccount + name: prometheus + namespace: team-git-push-force-monitor +--- +# RoleBinding for Prometheus in application namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-discovery-app + namespace: team-git-push-force-dev + labels: + {{- include "monitoring.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-discovery-app +subjects: +- kind: ServiceAccount + name: prometheus + namespace: team-git-push-force-monitor +--- +# ServiceAccount for Prometheus +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: team-git-push-force-monitor + labels: + {{- include "monitoring.labels" . 
| nindent 4 }} \ No newline at end of file diff --git a/helm/monitor/templates/servicemonitors.yaml b/helm/monitor/templates/servicemonitors.yaml new file mode 100644 index 0000000..319dfc1 --- /dev/null +++ b/helm/monitor/templates/servicemonitors.yaml @@ -0,0 +1,139 @@ +# ServiceMonitor for Gateway +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: gateway-service-monitor + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + path: /actuator/prometheus + relabelings: + - sourceLabels: [__meta_kubernetes_service_label_app] + targetLabel: service_name + - sourceLabels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + targetLabel: managed_by + port: http + namespaceSelector: + matchNames: + - team-git-push-force-dev + selector: + matchLabels: + app: gateway + monitoring: "true" +--- +# ServiceMonitor for User Service +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: user-svc-service-monitor + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + path: /actuator/prometheus + relabelings: + - sourceLabels: [__meta_kubernetes_service_label_app] + targetLabel: service_name + - sourceLabels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + targetLabel: managed_by + port: http + namespaceSelector: + matchNames: + - team-git-push-force-dev + selector: + matchLabels: + app: user-svc + monitoring: "true" +--- +# ServiceMonitor for Concept Service +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: concept-svc-service-monitor + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + path: /actuator/prometheus + relabelings: + - sourceLabels: [__meta_kubernetes_service_label_app] + targetLabel: service_name + - sourceLabels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + targetLabel: managed_by + port: http + namespaceSelector: + matchNames: + - team-git-push-force-dev + selector: + matchLabels: + app: 
concept-svc + monitoring: "true" +--- +# ServiceMonitor for GenAI Service (Python Flask/FastAPI) +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: genai-svc-service-monitor + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + path: /metrics + port: http + namespaceSelector: + matchNames: + - team-git-push-force-dev + selector: + matchLabels: + app: genai-svc + monitoring: "true" +--- + +# ServiceMonitor for MinIO Object Storage - Metrics endpoint +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: genai-svc-minio-service-monitor + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + path: /minio/v2/metrics/cluster + relabelings: + - sourceLabels: [__meta_kubernetes_service_label_app] + targetLabel: service_name + - sourceLabels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by] + targetLabel: managed_by + port: api + namespaceSelector: + matchNames: + - team-git-push-force-dev + selector: + matchLabels: + app: genai-svc-minio +--- +# ServiceMonitor for Node Exporter +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: node-exporter-service-monitor + labels: + app: prometheus +spec: + endpoints: + - interval: 30s + path: /metrics + port: metrics + namespaceSelector: + matchNames: + - team-git-push-force-monitor + selector: + matchLabels: + app: node-exporter + monitoring: "true" \ No newline at end of file diff --git a/helm/monitor/values.yaml b/helm/monitor/values.yaml index 1f68122..4c8b137 100644 --- a/helm/monitor/values.yaml +++ b/helm/monitor/values.yaml @@ -1,7 +1,7 @@ # Namespace configuration # NOTE: The namespace must already exist. This chart will NOT create it.
namespace: - name: "team-git-push-force-monitoring" + name: "team-git-push-force-monitor" ingress: enabled: true @@ -55,8 +55,8 @@ prometheus: repository: prom/prometheus tag: v2.52.0 # Use the latest stable version as needed serviceAccount: - create: false - name: default + create: true + name: prometheus resources: requests: memory: "256Mi" @@ -75,9 +75,9 @@ prometheus: service: port: 9090 rbac: - create: false + create: true serviceAccount: - create: false + create: true ingress: enabled: true className: "nginx" @@ -119,23 +119,57 @@ cadvisor: # PostgreSQL Exporter configuration postgresExporter: + userDb: + enabled: true + image: + repository: prometheuscommunity/postgres-exporter + tag: v0.15.0 + database: + host: user-svc-db.team-git-push-force-dev.svc.cluster.local + port: 5432 + name: userdb + user: postgres + password: postgres + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + conceptDb: + enabled: true + image: + repository: prometheuscommunity/postgres-exporter + tag: v0.15.0 + database: + host: concept-svc-db.team-git-push-force-dev.svc.cluster.local + port: 5432 + name: conceptdb + user: postgres + password: postgres + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + +# Redis Exporter configuration +redisExporter: enabled: true image: - repository: prometheuscommunity/postgres-exporter - tag: v0.15.0 - database: - host: postgres - port: 5432 - name: postgres - user: postgres - password: password + repository: oliver006/redis_exporter + tag: v1.59.0 + redisAddress: "redis://redis-master.team-git-push-force-dev.svc.cluster.local:6379" resources: requests: memory: "64Mi" cpu: "50m" limits: - memory: "256Mi" - cpu: "200m" + memory: "128Mi" + cpu: "100m" # Alertmanager configuration alertmanager: @@ -163,4 +197,4 @@ alertmanager: cert-manager.io/cluster-issuer: "letsencrypt-prod" kubernetes.io/ingress.class: nginx nginx.ingress.kubernetes.io/ssl-redirect: "true" 
- nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" \ No newline at end of file