diff --git a/charts/dso-grafana/Chart.yaml b/charts/dso-grafana/Chart.yaml
index 6ef3eb8..fe71c0c 100644
--- a/charts/dso-grafana/Chart.yaml
+++ b/charts/dso-grafana/Chart.yaml
@@ -2,11 +2,16 @@ apiVersion: v2
 name: dso-grafana
 description: This Helm chart deploy Grafana instances and default dashboards for each projects read from values file.
 type: application
-version: 1.7.0
+version: 1.8.0
 dependencies:
 - name: grafana-operator
   repository: oci://ghcr.io/grafana/helm-charts
   version: v5.10.0
+- name: alloy
+  alias: rules
+  version: 1.6.2
+  repository: https://grafana.github.io/helm-charts
+  condition: rules.enabled
 maintainers:
 - name: cloud-pi-native
   email: cloudpinative-relations@interieur.gouv.fr
diff --git a/charts/dso-grafana/README.md b/charts/dso-grafana/README.md
index 7b0fbe0..6961dad 100644
--- a/charts/dso-grafana/README.md
+++ b/charts/dso-grafana/README.md
@@ -1,6 +1,6 @@
 # dso-grafana
 
-![Version: 1.7.0](https://img.shields.io/badge/Version-1.7.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
+![Version: 1.8.0](https://img.shields.io/badge/Version-1.8.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
 
 This Helm chart deploy Grafana instances and default dashboards for each projects read from values file.
 
@@ -14,6 +14,7 @@ This Helm chart deploy Grafana instances and default dashboards for each project
 
 | Repository | Name | Version |
 |------------|------|---------|
+| https://grafana.github.io/helm-charts | rules(alloy) | 1.6.2 |
 | oci://ghcr.io/grafana/helm-charts | grafana-operator | v5.10.0 |
 
 ## Values
@@ -35,6 +36,10 @@ This Helm chart deploy Grafana instances and default dashboards for each project
 | oauth.secret | string | `""` | Shared secret to autorize OAuth usage |
 | oauth.url | string | `"https://grafana.example.com/realms/dso"` | URL of the Idp provider to use |
 | projects | list | `[{"envs":{"hprod":{"groups":["Editors","Viewers"]},"prod":{"groups":["Editors","Viewers"]}},"projectName":"console","projectRepository":{"path":".","url":"https://gitlab.com/project/infra-observability.git"}}]` | Default list of sample projects. One Grafana instance is created for each item. This is list is overriden by the global.projects list if provided. |
+| rules | object | `{"alloy":{"configMap":{"create":false,"key":"config.alloy","name":"alloy-rules-forwarder"}},"controller":{"replicas":"2","type":"deployment"},"crds":{"create":false},"enabled":false,"endpoints":{"logs":"","metrics":""}}` | Alloy instance configuration to forward rules to loki/mimir |
+| rules.enabled | bool | `false` | Enabling alloy rules forwarder |
+| rules.endpoints.logs | string | `""` | URL of the loki ruler |
+| rules.endpoints.metrics | string | `""` | URL of the Mimir ruler |
 | server | object | `{"certManager":{"enabled":true},"ingressClassName":"nginx","url":"grafana.example.com"}` | Ingress server configuration |
 | server.certManager | object | `{"enabled":true}` | Enabling cert manager configuration |
 | server.ingressClassName | string | `"nginx"` | Class name of the ingress controller to use |
diff --git a/charts/dso-grafana/templates/alloy-rules-forwarder.yaml b/charts/dso-grafana/templates/alloy-rules-forwarder.yaml
new file mode 100644
index 0000000..cd915f9
--- /dev/null
+++ b/charts/dso-grafana/templates/alloy-rules-forwarder.yaml
@@ -0,0 +1,37 @@
+{{- if .Values.rules.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alloy-rules-forwarder
+  namespace: {{ $.Values.grafanaNamespace }}
+data:
+  config.alloy: |
+    {{- range $project := default .Values.projects .Values.global.projects }}
+    {{- range $env, $val := $project.envs }}
+    {{- range $tenant, $val := $val.tenants }}
+    mimir.rules.kubernetes {{ regexReplaceAll "\\W+" $tenant "_" | quote }} {
+      address = {{ $.Values.rules.endpoints.metrics | quote }}
+      tenant_id = {{ $tenant | quote }}
+      rule_selector {
+        match_labels = {
+          type = "metrics",
+          tenant_id = {{ $tenant | quote }},
+        }
+      }
+    }
+
+    loki.rules.kubernetes {{ regexReplaceAll "\\W+" $tenant "_" | quote }} {
+      address = {{ $.Values.rules.endpoints.logs | quote }}
+      tenant_id = {{ $tenant | quote }}
+      rule_selector {
+        match_labels = {
+          type = "logs",
+          tenant_id = {{ $tenant | quote }},
+        }
+      }
+    }
+
+    {{- end }}
+    {{- end }}
+    {{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/dso-grafana/templates/default-rules.yaml b/charts/dso-grafana/templates/default-rules.yaml
new file mode 100644
index 0000000..516d025
--- /dev/null
+++ b/charts/dso-grafana/templates/default-rules.yaml
@@ -0,0 +1,271 @@
+{{- range $project := default .Values.projects .Values.global.projects }}
+{{- range $env, $val := $project.envs }}
+{{- range $tenant, $val := $val.tenants }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    type: "metrics"
+    tenant_id: {{ lower ($tenant) | quote }}
+  name: "kubernetes-monitoring-rules-{{ lower ($tenant) }}"
+spec:
+  groups:
+  - name: k8s.rules.container_cpu_usage_seconds_total
+    rules:
+    - expr: |
+        sum by (cluster, namespace, pod, container) (
+          irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
+        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+          1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
+      record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
+  - name: k8s.rules.container_memory_working_set_bytes
+    rules:
+    - expr: |
+        container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+        * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+          max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
+      record: node_namespace_pod_container:container_memory_working_set_bytes
+  - name: k8s.rules.container_memory_rss
+    rules:
+    - expr: |
+        container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+        * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+          max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
+      record: node_namespace_pod_container:container_memory_rss
+  - name: k8s.rules.container_memory_cache
+    rules:
+    - expr: |
+        container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+        * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+          max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
+      record: node_namespace_pod_container:container_memory_cache
+  - name: k8s.rules.container_memory_swap
+    rules:
+    - expr: |
+        container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+        * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
+          max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
+        )
+      record: node_namespace_pod_container:container_memory_swap
+  - name: k8s.rules.container_resource
+    rules:
+    - expr: |
+        kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
+    - expr: |
+        sum by (namespace, cluster) (
+          sum by (namespace, pod, cluster) (
+            max by (namespace, pod, container, cluster) (
+              kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
+            ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+              kube_pod_status_phase{phase=~"Pending|Running"} == 1
+            )
+          )
+        )
+      record: namespace_memory:kube_pod_container_resource_requests:sum
+    - expr: |
+        kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
+    - expr: |
+        sum by (namespace, cluster) (
+          sum by (namespace, pod, cluster) (
+            max by (namespace, pod, container, cluster) (
+              kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
+            ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+              kube_pod_status_phase{phase=~"Pending|Running"} == 1
+            )
+          )
+        )
+      record: namespace_cpu:kube_pod_container_resource_requests:sum
+    - expr: |
+        kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
+    - expr: |
+        sum by (namespace, cluster) (
+          sum by (namespace, pod, cluster) (
+            max by (namespace, pod, container, cluster) (
+              kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
+            ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+              kube_pod_status_phase{phase=~"Pending|Running"} == 1
+            )
+          )
+        )
+      record: namespace_memory:kube_pod_container_resource_limits:sum
+    - expr: |
+        kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
+    - expr: |
+        sum by (namespace, cluster) (
+          sum by (namespace, pod, cluster) (
+            max by (namespace, pod, container, cluster) (
+              kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
+            ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+              kube_pod_status_phase{phase=~"Pending|Running"} == 1
+            )
+          )
+        )
+      record: namespace_cpu:kube_pod_container_resource_limits:sum
+  - name: k8s.rules.pod_owner
+    rules:
+    - expr: |
+        max by (cluster, namespace, workload, pod) (
+          label_replace(
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+              "replicaset", "$1", "owner_name", "(.*)"
+            ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+              1, max by (replicaset, namespace, owner_name) (
+                kube_replicaset_owner{job="kube-state-metrics"}
+              )
+            ),
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        )
+      labels:
+        workload_type: deployment
+      record: namespace_workload_pod:kube_pod_owner:relabel
+    - expr: |
+        max by (cluster, namespace, workload, pod) (
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        )
+      labels:
+        workload_type: daemonset
+      record: namespace_workload_pod:kube_pod_owner:relabel
+    - expr: |
+        max by (cluster, namespace, workload, pod) (
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        )
+      labels:
+        workload_type: statefulset
+      record: namespace_workload_pod:kube_pod_owner:relabel
+    - expr: |
+        max by (cluster, namespace, workload, pod) (
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        )
+      labels:
+        workload_type: job
+      record: namespace_workload_pod:kube_pod_owner:relabel
+  - name: kube-scheduler.rules
+    rules:
+    - expr: |
+        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.99"
+      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.99"
+      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.99"
+      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.9"
+      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.9"
+      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.9"
+      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.5"
+      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.5"
+      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
+      labels:
+        quantile: "0.5"
+      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
+  - name: node.rules
+    rules:
+    - expr: |
+        topk by(cluster, namespace, pod) (1,
+          max by (cluster, node, namespace, pod) (
+            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+        ))
+      record: 'node_namespace_pod:kube_pod_info:'
+    - expr: |
+        sum(
+          node_memory_MemAvailable_bytes{job="node-exporter"} or
+          (
+            node_memory_Buffers_bytes{job="node-exporter"} +
+            node_memory_Cached_bytes{job="node-exporter"} +
+            node_memory_MemFree_bytes{job="node-exporter"} +
+            node_memory_Slab_bytes{job="node-exporter"}
+          )
+        ) by (cluster)
+      record: :node_memory_MemAvailable_bytes:sum
+    - expr: |
+        avg by (cluster, node) (
+          sum without (mode) (
+            rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
+          )
+        )
+      record: node:node_cpu_utilization:ratio_rate5m
+    - expr: |
+        avg by (cluster) (
+          node:node_cpu_utilization:ratio_rate5m
+        )
+      record: cluster:node_cpu:ratio_rate5m
+  - name: kubelet.rules
+    rules:
+    - expr: |
+        histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
+      labels:
+        quantile: "0.99"
+      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
+      labels:
+        quantile: "0.9"
+      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+    - expr: |
+        histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
+      labels:
+        quantile: "0.5"
+      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+---
+{{- end }}
+{{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/dso-grafana/values.yaml b/charts/dso-grafana/values.yaml
index 16646ff..3287285 100644
--- a/charts/dso-grafana/values.yaml
+++ b/charts/dso-grafana/values.yaml
@@ -69,3 +69,23 @@ dashboards:
 - kyverno-policy-reporter.json
 - security-overview.json
 - trivy-dashboard.json
+
+# -- Alloy instance configuration to forward rules to loki/mimir
+rules:
+  # -- Enabling alloy rules forwarder
+  enabled: false
+  endpoints:
+    # -- URL of the loki ruler
+    logs: ""
+    # -- URL of the Mimir ruler
+    metrics: ""
+  crds:
+    create: false
+  alloy:
+    configMap:
+      create: false
+      name: alloy-rules-forwarder
+      key: config.alloy
+  controller:
+    type: "deployment"
+    replicas: "2"
\ No newline at end of file