Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion charts/dso-grafana/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@ apiVersion: v2
name: dso-grafana
description: This Helm chart deploy Grafana instances and default dashboards for each projects read from values file.
type: application
version: 1.7.0
version: 1.8.0
dependencies:
- name: grafana-operator
repository: oci://ghcr.io/grafana/helm-charts
version: v5.10.0
- name: alloy
alias: rules
version: 1.6.2
repository: https://grafana.github.io/helm-charts
condition: rules.enabled
maintainers:
- name: cloud-pi-native
email: cloudpinative-relations@interieur.gouv.fr
Expand Down
7 changes: 6 additions & 1 deletion charts/dso-grafana/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# dso-grafana

![Version: 1.7.0](https://img.shields.io/badge/Version-1.7.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
![Version: 1.8.0](https://img.shields.io/badge/Version-1.8.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)

This Helm chart deploy Grafana instances and default dashboards for each projects read from values file.

Expand All @@ -14,6 +14,7 @@ This Helm chart deploy Grafana instances and default dashboards for each project

| Repository | Name | Version |
|------------|------|---------|
| https://grafana.github.io/helm-charts | rules(alloy) | 1.6.2 |
| oci://ghcr.io/grafana/helm-charts | grafana-operator | v5.10.0 |

## Values
Expand All @@ -35,6 +36,10 @@ This Helm chart deploy Grafana instances and default dashboards for each project
| oauth.secret | string | `""` | Shared secret to autorize OAuth usage |
| oauth.url | string | `"https://grafana.example.com/realms/dso"` | URL of the Idp provider to use |
| projects | list | `[{"envs":{"hprod":{"groups":["Editors","Viewers"]},"prod":{"groups":["Editors","Viewers"]}},"projectName":"console","projectRepository":{"path":".","url":"https://gitlab.com/project/infra-observability.git"}}]` | Default list of sample projects. One Grafana instance is created for each item. This is list is overriden by the global.projects list if provided. |
| rules | object | `{"alloy":{"configMap":{"create":false,"key":"config.alloy","name":"alloy-rules-forwarder"}},"controller":{"replicas":"2","type":"deployment"},"crds":{"create":false},"enabled":false,"endpoints":{"logs":"","metrics":""}}` | Alloy instance configuration to forward rules to loki/mimir |
| rules.enabled | bool | `false` | Enabling alloy rules forwarder |
| rules.endpoints.logs | string | `""` | URL of the loki ruler |
| rules.endpoints.metrics | string | `""` | URL of the Mimir ruler |
| server | object | `{"certManager":{"enabled":true},"ingressClassName":"nginx","url":"grafana.example.com"}` | Ingress server configuration |
| server.certManager | object | `{"enabled":true}` | Enabling cert manager configuration |
| server.ingressClassName | string | `"nginx"` | Class name of the ingress controller to use |
Expand Down
37 changes: 37 additions & 0 deletions charts/dso-grafana/templates/alloy-rules-forwarder.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{{- if .Values.rules.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: alloy-rules-forwarder
namespace: {{ $.Values.grafanaNamespace }}
data:
config.alloy: |
{{- range $project := default .Values.projects .Values.global.projects }}
{{- range $env, $val := $project.envs }}
{{- range $tenant, $val := $val.tenants }}
mimir.rules.kubernetes {{ regexReplaceAll "\\W+" $tenant "_" | quote }} {
address = {{ $.Values.rules.endpoints.metrics | quote }}
tenant_id = {{ $tenant | quote }}
rule_selector {
match_labels = {
type = "metrics",
tenant_id = {{ $tenant | quote }},
}
}
}

loki.rules.kubernetes {{ regexReplaceAll "\\W+" $tenant "_" | quote }} {
address = {{ $.Values.rules.endpoints.logs | quote }}
tenant_id = {{ $tenant | quote }}
rule_selector {
match_labels = {
type = "logs",
tenant_id = {{ $tenant | quote }},
}
}
}

{{- end }}
{{- end }}
{{- end }}
{{- end }}
271 changes: 271 additions & 0 deletions charts/dso-grafana/templates/default-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
{{- range $project := default .Values.projects .Values.global.projects }}
{{- range $env, $val := $project.envs }}
{{- range $tenant, $val := $val.tenants }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
type: "metrics"
tenant_id: {{ lower ($tenant) | quote }}
name: "kubernetes-monitoring-rules-{{ lower ($tenant) }}"
spec:
groups:
- name: k8s.rules.container_cpu_usage_seconds_total
rules:
- expr: |
sum by (cluster, namespace, pod, container) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
- name: k8s.rules.container_memory_working_set_bytes
rules:
- expr: |
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes
- name: k8s.rules.container_memory_rss
rules:
- expr: |
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss
- name: k8s.rules.container_memory_cache
rules:
- expr: |
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache
- name: k8s.rules.container_memory_swap
rules:
- expr: |
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap
- name: k8s.rules.container_resource
rules:
- expr: |
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- expr: |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
- expr: |
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- expr: |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- expr: |
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
- expr: |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- expr: |
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
- expr: |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum
- name: k8s.rules.pod_owner
rules:
- expr: |
max by (cluster, namespace, workload, pod) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
1, max by (replicaset, namespace, owner_name) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel
- name: kube-scheduler.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules
rules:
- expr: |
topk by(cluster, namespace, pod) (1,
max by (cluster, node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
(
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by (cluster)
record: :node_memory_MemAvailable_bytes:sum
- expr: |
avg by (cluster, node) (
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
)
)
record: node:node_cpu_utilization:ratio_rate5m
- expr: |
avg by (cluster) (
node:node_cpu_utilization:ratio_rate5m
)
record: cluster:node_cpu:ratio_rate5m
- name: kubelet.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: "0.99"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: "0.9"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: "0.5"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
---
{{- end }}
{{- end }}
{{- end }}
20 changes: 20 additions & 0 deletions charts/dso-grafana/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,23 @@ dashboards:
- kyverno-policy-reporter.json
- security-overview.json
- trivy-dashboard.json

# -- Alloy instance configuration to forward rules to loki/mimir
rules:
# -- Enabling alloy rules forwarder
enabled: false
endpoints:
# -- URL of the loki ruler
logs: ""
# -- URL of the Mimir ruler
metrics: ""
crds:
create: false
alloy:
configMap:
create: false
name: alloy-rules-forwarder
key: config.alloy
controller:
type: "deployment"
replicas: "2"